RAG / rag /build_vector_store.py
gauthy08's picture
Add comprehensive experimental dashboard and RAG testing suite
8ce2739
import sqlite3
import chromadb
from sentence_transformers import SentenceTransformer
from pathlib import Path
import sys
sys.path.append(str(Path(__file__).parent.parent))
from helper import get_similarity_model, sanitize
def build_vector_store():
"""
Builds a persistent vector store from the data in the SQLite database,
embedding information about students, faculty, and courses.
"""
# Check if vector store already exists and has data
try:
client = chromadb.PersistentClient(path="rag/vector_store")
collection = client.get_collection("university_data")
count = collection.count()
if count > 0:
print(f"Vector store already exists with {count} documents. Skipping rebuild.")
return
except:
# Collection doesn't exist, create it
pass
conn = sqlite3.connect('database/university.db')
cursor = conn.cursor()
documents = []
print("Creating student docs")
# === Build Student Documents ===
student_query = """
SELECT s.name, s.email, s.svnr, GROUP_CONCAT(c.name, ', ') AS courses
FROM students s
LEFT JOIN enrollments e ON s.id = e.student_id
LEFT JOIN courses c ON e.course_id = c.id
GROUP BY s.id
"""
for name, email, svnr, courses in cursor.execute(student_query).fetchall():
doc = f"""
Student Name: {name}
Email: {email}
SVNR: {svnr}
Enrolled Courses: {courses if courses else 'None'}
"""
documents.append(sanitize(doc.strip()))
print(documents[-1])
print("Creating faculty docs")
# === Build Faculty Documents ===
faculty_query = """
SELECT f.name, f.email, f.department, GROUP_CONCAT(c.name, ', ') AS courses
FROM faculty f
LEFT JOIN courses c ON f.id = c.faculty_id
GROUP BY f.id
"""
for name, email, department, courses in cursor.execute(faculty_query).fetchall():
doc = f"""
Faculty Name: {name}
Email: {email}
Department: {department}
Courses Taught: {courses if courses else 'None'}
"""
documents.append(sanitize(doc.strip()))
print(documents[-1])
print("Creating course docs")
# === Build Course Documents ===
course_query = """
SELECT
c.name as course_name,
f.name AS faculty_name,
f.department AS faculty_department,
GROUP_CONCAT(s.name, ', ') AS students
FROM courses c
LEFT JOIN faculty f ON c.faculty_id = f.id
LEFT JOIN enrollments e ON c.id = e.course_id
LEFT JOIN students s ON e.student_id = s.id
GROUP BY c.id
"""
for course_name, faculty_name, faculty_department, students in cursor.execute(course_query).fetchall():
doc = f"""
Course Name: {course_name}
Taught by: {faculty_name if faculty_name else 'TBD'}
Enrolled Students: {students if students else 'None'}
Department: {faculty_department if faculty_department else "Unkown"}
"""
documents.append(doc.strip())
print(documents[-1])
conn.close()
# === Embed and Store in Vector DB ===
model = get_similarity_model()
embeddings = model.encode(documents)
client = chromadb.PersistentClient(path="rag/vector_store")
collection = client.get_or_create_collection("university_data")
# Add documents in batches to avoid batch size limits
batch_size = 5000 # Safe batch size under the limit
for i in range(0, len(documents), batch_size):
end_idx = min(i + batch_size, len(documents))
batch_embeddings = embeddings[i:end_idx]
batch_documents = documents[i:end_idx]
batch_ids = [str(j) for j in range(i, end_idx)]
collection.add(
embeddings=batch_embeddings,
documents=batch_documents,
ids=batch_ids
)
print("Vector store built successfully.")
if __name__ == "__main__":
build_vector_store()