Spaces:

pxdelta
/

RAG

Sleeping

App Files Files Community

RAG / rag /build_vector_store.py

gauthy08

Add comprehensive experimental dashboard and RAG testing suite

8ce2739 about 1 month ago

raw

history blame contribute delete

4.04 kB

	import sqlite3
	import chromadb
	from sentence_transformers import SentenceTransformer
	from pathlib import Path
	import sys
	sys.path.append(str(Path(__file__).parent.parent))
	from helper import get_similarity_model, sanitize

	def build_vector_store():
	"""
	Builds a persistent vector store from the data in the SQLite database,
	embedding information about students, faculty, and courses.
	"""
	# Check if vector store already exists and has data
	try:
	client = chromadb.PersistentClient(path="rag/vector_store")
	collection = client.get_collection("university_data")
	count = collection.count()
	if count > 0:
	print(f"Vector store already exists with {count} documents. Skipping rebuild.")
	return
	except:
	# Collection doesn't exist, create it
	pass

	conn = sqlite3.connect('database/university.db')
	cursor = conn.cursor()

	documents = []
	print("Creating student docs")
	# === Build Student Documents ===
	student_query = """
	SELECT s.name, s.email, s.svnr, GROUP_CONCAT(c.name, ', ') AS courses
	FROM students s
	LEFT JOIN enrollments e ON s.id = e.student_id
	LEFT JOIN courses c ON e.course_id = c.id
	GROUP BY s.id
	"""
	for name, email, svnr, courses in cursor.execute(student_query).fetchall():
	doc = f"""
	Student Name: {name}
	Email: {email}
	SVNR: {svnr}
	Enrolled Courses: {courses if courses else 'None'}
	"""
	documents.append(sanitize(doc.strip()))

	print(documents[-1])

	print("Creating faculty docs")
	# === Build Faculty Documents ===
	faculty_query = """
	SELECT f.name, f.email, f.department, GROUP_CONCAT(c.name, ', ') AS courses
	FROM faculty f
	LEFT JOIN courses c ON f.id = c.faculty_id
	GROUP BY f.id
	"""
	for name, email, department, courses in cursor.execute(faculty_query).fetchall():
	doc = f"""
	Faculty Name: {name}
	Email: {email}
	Department: {department}
	Courses Taught: {courses if courses else 'None'}
	"""
	documents.append(sanitize(doc.strip()))

	print(documents[-1])

	print("Creating course docs")
	# === Build Course Documents ===
	course_query = """
	SELECT
	c.name as course_name,
	f.name AS faculty_name,
	f.department AS faculty_department,
	GROUP_CONCAT(s.name, ', ') AS students
	FROM courses c
	LEFT JOIN faculty f ON c.faculty_id = f.id
	LEFT JOIN enrollments e ON c.id = e.course_id
	LEFT JOIN students s ON e.student_id = s.id
	GROUP BY c.id
	"""

	for course_name, faculty_name, faculty_department, students in cursor.execute(course_query).fetchall():
	doc = f"""
	Course Name: {course_name}
	Taught by: {faculty_name if faculty_name else 'TBD'}
	Enrolled Students: {students if students else 'None'}
	Department: {faculty_department if faculty_department else "Unkown"}
	"""
	documents.append(doc.strip())


	print(documents[-1])

	conn.close()

	# === Embed and Store in Vector DB ===
	model = get_similarity_model()
	embeddings = model.encode(documents)

	client = chromadb.PersistentClient(path="rag/vector_store")
	collection = client.get_or_create_collection("university_data")

	# Add documents in batches to avoid batch size limits
	batch_size = 5000 # Safe batch size under the limit
	for i in range(0, len(documents), batch_size):
	end_idx = min(i + batch_size, len(documents))
	batch_embeddings = embeddings[i:end_idx]
	batch_documents = documents[i:end_idx]
	batch_ids = [str(j) for j in range(i, end_idx)]

	collection.add(
	embeddings=batch_embeddings,
	documents=batch_documents,
	ids=batch_ids
	)

	print("Vector store built successfully.")


	if __name__ == "__main__":
	build_vector_store()