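# Legacy build_index (Gemma embeddings + manual pickle checkpointing),
# kept disabled in the string literal below for reference: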
'''
import pandas as pd
import numpy as np
import faiss, pickle, os
from app.utils import GemmaEmbeddings

def build_index(
    csv_path="data/movies.csv",
    out_dir="faiss_index",
    batch_size=32,
    checkpoint_size=1000
):
    df = pd.read_csv(csv_path)
    texts = df["overview"].fillna("").tolist()
    total = len(texts)

    os.makedirs(out_dir, exist_ok=True)
    embedder = GemmaEmbeddings()

    embeddings = []
    start_idx = 0

    # πŸ”Ή Check for existing partial progress
    checkpoint_file = f"{out_dir}/progress.pkl"
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "rb") as f:
            saved = pickle.load(f)
            embeddings = saved["embeddings"]
            start_idx = saved["next_idx"]
        print(f"πŸ”„ Resuming from index {start_idx}")

    # πŸ”Ή Process in batches
    for i in range(start_idx, total, batch_size):
        batch = texts[i:i+batch_size]
        vectors = embedder.embed(batch)
        embeddings.extend(vectors)
        print(f"βœ… Processed {i+len(batch)} / {total}")

        # Save a checkpoint roughly every `checkpoint_size` texts (and after the final batch)
        processed = i + len(batch)
        if processed % checkpoint_size < batch_size or processed >= total:
            with open(checkpoint_file, "wb") as f:
                pickle.dump({
                    "embeddings": embeddings,
                    "next_idx": processed
                }, f)
            print(f"πŸ’Ύ Saved checkpoint at {processed}")

    # πŸ”Ή Build FAISS index at the end
    embeddings = np.array(embeddings).astype("float32")
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    faiss.write_index(index, f"{out_dir}/movies_index.faiss")
    with open(f"{out_dir}/movies.pkl", "wb") as f:
        pickle.dump(df.to_dict(orient="records"), f)

    # Remove checkpoint after success
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)
    print("πŸŽ‰ Index built successfully!")

if __name__ == "__main__":
    build_index()
'''


import os
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

def build_faiss(csv_path="data/movies.csv", out_dir="faiss_index"):
    """Embed each movie's metadata as one text block and save a FAISS index to `out_dir`."""
    df = pd.read_csv(csv_path).fillna("")

    texts, metadatas = [], []
    for _, row in df.iterrows():
        text = (
            f"Title: {row['title']}.\n"
            f"Overview: {row['overview']}.\n"
            f"Genres: {row['genres']}.\n"
            f"Director: {row['director']}.\n"
            f"Cast: {row['cast']}."
        )
        texts.append(text)
        metadatas.append({
            "id": row["id"],
            "title": row["title"],
            "genres": row["genres"],
            "overview": row["overview"],
            "director": row["director"],
            "cast": row["cast"],
            "release_date": row["release_date"],
            "vote_average": row["vote_average"],
            "popularity": row["popularity"]
        })

    # βœ… Use local MiniLM embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    os.makedirs(out_dir, exist_ok=True)
    db.save_local(out_dir)
    print(f"βœ… Saved FAISS index with {len(df)} movies to {out_dir}")

if __name__ == "__main__":
    build_faiss("data/movies.csv")
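
# A minimal sketch of how the saved index might be loaded and queried, assuming a
# langchain_community version whose FAISS.load_local requires
# allow_dangerous_deserialization=True to read the pickled metadata:
#
#     from langchain_community.vectorstores import FAISS
#     from langchain_community.embeddings import HuggingFaceEmbeddings
#
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
#     for doc in db.similarity_search("heist thriller with an ensemble cast", k=5):
#         print(doc.metadata["title"], doc.metadata["vote_average"])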