|
|
import sqlite3 |
|
|
import chromadb |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from pathlib import Path |
|
|
import sys |
|
|
sys.path.append(str(Path(__file__).parent.parent)) |
|
|
from helper import get_similarity_model, sanitize |
|
|
|
|
|
def build_vector_store(): |
|
|
""" |
|
|
Builds a persistent vector store from the data in the SQLite database, |
|
|
embedding information about students, faculty, and courses. |
|
|
""" |
|
|
|
|
|
try: |
|
|
client = chromadb.PersistentClient(path="rag/vector_store") |
|
|
collection = client.get_collection("university_data") |
|
|
count = collection.count() |
|
|
if count > 0: |
|
|
print(f"Vector store already exists with {count} documents. Skipping rebuild.") |
|
|
return |
|
|
except: |
|
|
|
|
|
pass |
|
|
|
|
|
conn = sqlite3.connect('database/university.db') |
|
|
cursor = conn.cursor() |
|
|
|
|
|
documents = [] |
|
|
print("Creating student docs") |
|
|
|
|
|
student_query = """ |
|
|
SELECT s.name, s.email, s.svnr, GROUP_CONCAT(c.name, ', ') AS courses |
|
|
FROM students s |
|
|
LEFT JOIN enrollments e ON s.id = e.student_id |
|
|
LEFT JOIN courses c ON e.course_id = c.id |
|
|
GROUP BY s.id |
|
|
""" |
|
|
for name, email, svnr, courses in cursor.execute(student_query).fetchall(): |
|
|
doc = f""" |
|
|
Student Name: {name} |
|
|
Email: {email} |
|
|
SVNR: {svnr} |
|
|
Enrolled Courses: {courses if courses else 'None'} |
|
|
""" |
|
|
documents.append(sanitize(doc.strip())) |
|
|
|
|
|
print(documents[-1]) |
|
|
|
|
|
print("Creating faculty docs") |
|
|
|
|
|
faculty_query = """ |
|
|
SELECT f.name, f.email, f.department, GROUP_CONCAT(c.name, ', ') AS courses |
|
|
FROM faculty f |
|
|
LEFT JOIN courses c ON f.id = c.faculty_id |
|
|
GROUP BY f.id |
|
|
""" |
|
|
for name, email, department, courses in cursor.execute(faculty_query).fetchall(): |
|
|
doc = f""" |
|
|
Faculty Name: {name} |
|
|
Email: {email} |
|
|
Department: {department} |
|
|
Courses Taught: {courses if courses else 'None'} |
|
|
""" |
|
|
documents.append(sanitize(doc.strip())) |
|
|
|
|
|
print(documents[-1]) |
|
|
|
|
|
print("Creating course docs") |
|
|
|
|
|
course_query = """ |
|
|
SELECT |
|
|
c.name as course_name, |
|
|
f.name AS faculty_name, |
|
|
f.department AS faculty_department, |
|
|
GROUP_CONCAT(s.name, ', ') AS students |
|
|
FROM courses c |
|
|
LEFT JOIN faculty f ON c.faculty_id = f.id |
|
|
LEFT JOIN enrollments e ON c.id = e.course_id |
|
|
LEFT JOIN students s ON e.student_id = s.id |
|
|
GROUP BY c.id |
|
|
""" |
|
|
|
|
|
for course_name, faculty_name, faculty_department, students in cursor.execute(course_query).fetchall(): |
|
|
doc = f""" |
|
|
Course Name: {course_name} |
|
|
Taught by: {faculty_name if faculty_name else 'TBD'} |
|
|
Enrolled Students: {students if students else 'None'} |
|
|
Department: {faculty_department if faculty_department else "Unkown"} |
|
|
""" |
|
|
documents.append(doc.strip()) |
|
|
|
|
|
|
|
|
print(documents[-1]) |
|
|
|
|
|
conn.close() |
|
|
|
|
|
|
|
|
model = get_similarity_model() |
|
|
embeddings = model.encode(documents) |
|
|
|
|
|
client = chromadb.PersistentClient(path="rag/vector_store") |
|
|
collection = client.get_or_create_collection("university_data") |
|
|
|
|
|
|
|
|
batch_size = 5000 |
|
|
for i in range(0, len(documents), batch_size): |
|
|
end_idx = min(i + batch_size, len(documents)) |
|
|
batch_embeddings = embeddings[i:end_idx] |
|
|
batch_documents = documents[i:end_idx] |
|
|
batch_ids = [str(j) for j in range(i, end_idx)] |
|
|
|
|
|
collection.add( |
|
|
embeddings=batch_embeddings, |
|
|
documents=batch_documents, |
|
|
ids=batch_ids |
|
|
) |
|
|
|
|
|
print("Vector store built successfully.") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
build_vector_store() |
|
|
|