'''
import os
import pickle

import faiss
import numpy as np
import pandas as pd

from app.utils import GemmaEmbeddings


def build_index(
    csv_path="data/movies.csv",
    out_dir="faiss_index",
    batch_size=32,
    checkpoint_size=1000,
):
    df = pd.read_csv(csv_path)
    texts = df["overview"].fillna("").tolist()
    total = len(texts)
    os.makedirs(out_dir, exist_ok=True)

    embedder = GemmaEmbeddings()
    embeddings = []
    start_idx = 0

    # 🔹 Check for existing partial progress
    checkpoint_file = f"{out_dir}/progress.pkl"
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "rb") as f:
            saved = pickle.load(f)
        embeddings = saved["embeddings"]
        start_idx = saved["next_idx"]
        print(f"🔁 Resuming from index {start_idx}")

    # 🔹 Process in batches
    last_checkpoint = start_idx
    for i in range(start_idx, total, batch_size):
        batch = texts[i:i + batch_size]
        vectors = embedder.embed(batch)
        embeddings.extend(vectors)
        done = i + len(batch)
        print(f"✅ Processed {done} / {total}")

        # Save a checkpoint every `checkpoint_size` rows, and after the last batch
        if done - last_checkpoint >= checkpoint_size or done >= total:
            with open(checkpoint_file, "wb") as f:
                pickle.dump({"embeddings": embeddings, "next_idx": done}, f)
            last_checkpoint = done
            print(f"💾 Saved checkpoint at {done}")

    # 🔹 Build the FAISS index at the end
    embeddings = np.array(embeddings).astype("float32")
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)  # exact L2 search; fine at this scale
    index.add(embeddings)
    faiss.write_index(index, f"{out_dir}/movies_index.faiss")
    with open(f"{out_dir}/movies.pkl", "wb") as f:
        pickle.dump(df.to_dict(orient="records"), f)

    # Remove the checkpoint after success
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)
    print("🎉 Index built successfully!")


if __name__ == "__main__":
    build_index()
'''
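# Note: the legacy script above imports `GemmaEmbeddings` from `app.utils`,
# which is not shown here. A minimal sketch of the interface it assumes
# (an `embed(texts)` method returning one vector per input text), using
# sentence-transformers as a stand-in backend:
#
#     from sentence_transformers import SentenceTransformer
#
#     class GemmaEmbeddings:
#         def __init__(self, model_name="google/embeddinggemma-300m"):
#             # model_name is an assumption; any embedding model works here
#             self.model = SentenceTransformer(model_name)
#
#         def embed(self, texts):
#             return self.model.encode(texts).tolist()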
import os

import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings


def build_faiss(csv_path="data/movies.csv", out_dir="faiss_index"):
    df = pd.read_csv(csv_path).fillna("")

    # Concatenate the searchable fields into one text per movie; keep the raw
    # fields in metadata so results can be displayed without re-reading the CSV
    texts, metadatas = [], []
    for _, row in df.iterrows():
        text = (
            f"Title: {row['title']}.\n"
            f"Overview: {row['overview']}.\n"
            f"Genres: {row['genres']}.\n"
            f"Director: {row['director']}.\n"
            f"Cast: {row['cast']}."
        )
        texts.append(text)
        metadatas.append({
            "id": row["id"],
            "title": row["title"],
            "genres": row["genres"],
            "overview": row["overview"],
            "director": row["director"],
            "cast": row["cast"],
            "release_date": row["release_date"],
            "vote_average": row["vote_average"],
            "popularity": row["popularity"],
        })

    # ✅ Use local MiniLM embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

    os.makedirs(out_dir, exist_ok=True)
    db.save_local(out_dir)
    print(f"✅ Saved FAISS index with {len(df)} movies to {out_dir}")


if __name__ == "__main__":
    build_faiss("data/movies.csv")
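# Quick check of the saved index (a sketch, not part of the build script; the
# query text is made up, and recent langchain_community versions require
# `allow_dangerous_deserialization=True` because the docstore is pickled):
#
#     from langchain_community.vectorstores import FAISS
#     from langchain_community.embeddings import HuggingFaceEmbeddings
#
#     emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     db = FAISS.load_local("faiss_index", emb, allow_dangerous_deserialization=True)
#     for doc in db.similarity_search("heist thriller with a twist ending", k=3):
#         print(doc.metadata["title"], "-", doc.metadata["vote_average"])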