Wasifjafri committed on
Commit
a7e7d28
·
1 Parent(s): 42281ed
Files changed (4) hide show
  1. requirements.txt +48 -15
  2. src/embeddings.py +1 -1
  3. src/vector_store.py +16 -7
  4. streamlit_app.py +19 -1
requirements.txt CHANGED
@@ -1,15 +1,48 @@
1
- streamlit==1.38.0
2
- langchain==0.2.14
3
- langchain-community==0.2.12
4
- langchain-core==0.2.33
5
- langchain-text-splitters==0.2.2
6
- langchain-groq
7
- sentence-transformers==3.0.1
8
- faiss-cpu>=1.7.4
9
- transformers>=4.40.0
10
- huggingface-hub>=0.23.0
11
- python-dotenv==1.0.1
12
- requests==2.32.3
13
- numpy<2.0.0
14
- dotenv
15
- # Removed tiktoken (unused) to avoid rust build on HF base image.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.3.1
2
+ accelerate==1.10.1
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.15
5
+ aiosignal==1.4.0
6
+ altair==5.5.0
7
+ annotated-types==0.7.0
8
+ anyio==4.10.0
9
+ argon2-cffi==25.1.0
10
+ argon2-cffi-bindings==25.1.0
11
+ arrow==1.3.0
12
+ asttokens==3.0.0
13
+ astunparse==1.6.3
14
+ async-lru==2.0.5
15
+ async-timeout==4.0.3
16
+ attrs==25.3.0
17
+ # Minimal requirements for the RAG chatbot
18
+ # Pin only where needed; compatible with Python 3.11 on Windows
19
+
20
+ # Core app
21
+ streamlit==1.49.1
22
+ python-dotenv==1.1.1
23
+
24
+ # LangChain stack (aligned versions)
25
+ langchain==0.3.27
26
+ langchain-core==0.3.75
27
+ langchain-community==0.3.29
28
+ langchain-text-splitters==0.3.11
29
+ langchain-groq==0.3.8
30
+ langchain-huggingface==0.3.1
31
+
32
+ # Vector store and NLP
33
+ faiss-cpu==1.12.0
34
+ sentence-transformers==5.1.0
35
+ transformers==4.56.1
36
+
37
+ # Data + utils
38
+ pandas==2.3.2
39
+ numpy==1.26.4
40
+ requests==2.32.5
41
+
42
+ # Optional semantic splitter (app gracefully falls back if missing)
43
+ semantic-text-splitter==0.27.0
44
+
45
+ # Dataset fetcher
46
+ kagglehub==0.3.13
47
+
48
+
src/embeddings.py CHANGED
@@ -1,4 +1,4 @@
1
- from langchain_community.embeddings import HuggingFaceEmbeddings
2
  from .config import EMBEDDING_MODEL, DEVICE
3
 
4
  def get_embedding_model():
 
1
+ from langchain_huggingface import HuggingFaceEmbeddings
2
  from .config import EMBEDDING_MODEL, DEVICE
3
 
4
  def get_embedding_model():
src/vector_store.py CHANGED
@@ -62,13 +62,22 @@ def build_or_load_vectorstore(
62
  ):
63
  if os.path.exists(FAISS_INDEX_PATH) and not force_rebuild:
64
  print(f"Loading existing FAISS index from {FAISS_INDEX_PATH}...")
65
- vectorstore = FAISS.load_local(
66
- FAISS_INDEX_PATH,
67
- get_embedding_model(),
68
- allow_dangerous_deserialization=True
69
- )
70
- print("Vector store loaded successfully.")
71
- return vectorstore
 
 
 
 
 
 
 
 
 
72
 
73
  print("Building FAISS index (force_rebuild=%s, method=%s)..." % (force_rebuild, chunk_method))
74
  splits = _chunk_documents(
 
62
  ):
63
  if os.path.exists(FAISS_INDEX_PATH) and not force_rebuild:
64
  print(f"Loading existing FAISS index from {FAISS_INDEX_PATH}...")
65
+ try:
66
+ vectorstore = FAISS.load_local(
67
+ FAISS_INDEX_PATH,
68
+ get_embedding_model(),
69
+ allow_dangerous_deserialization=True
70
+ )
71
+ print("Vector store loaded successfully.")
72
+ return vectorstore
73
+ except Exception as e:
74
+ print(f"Failed to load FAISS index due to: {e}")
75
+ if not documents:
76
+ raise RuntimeError(
77
+ "Existing FAISS index is incompatible with current libraries and no documents were "
78
+ "provided to rebuild it. Delete 'faiss_index' and rebuild, or pass documents to rebuild."
79
+ ) from e
80
+ print("Rebuilding FAISS index from provided documents...")
81
 
82
  print("Building FAISS index (force_rebuild=%s, method=%s)..." % (force_rebuild, chunk_method))
83
  splits = _chunk_documents(
streamlit_app.py CHANGED
@@ -51,7 +51,25 @@ if rebuild or not os.path.exists(FAISS_INDEX_PATH):
51
  chunk_overlap=120
52
  )
53
  else:
54
- vectorstore = build_or_load_vectorstore([], force_rebuild=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  llm = ChatGroq(
57
  model="meta-llama/llama-4-maverick-17b-128e-instruct",
 
51
  chunk_overlap=120
52
  )
53
  else:
54
+ try:
55
+ vectorstore = build_or_load_vectorstore([], force_rebuild=False)
56
+ except Exception as e:
57
+ st.warning(f"Failed to load existing index: {e}. Attempting to rebuild from dataset...")
58
+ data_file = os.path.join(DATA_PATH, "arxiv-metadata-oai-snapshot.json")
59
+ if not os.path.exists(data_file):
60
+ st.error("Dataset missing. Run main pipeline first or click 'Rebuild index'.")
61
+ st.stop()
62
+ with st.spinner("Rebuilding vector index after load failure..."):
63
+ df = load_data_subset(data_file, num_records=50000)
64
+ df = preprocess_dataframe(df)
65
+ docs = df_to_documents(df)
66
+ vectorstore = build_or_load_vectorstore(
67
+ docs,
68
+ force_rebuild=True,
69
+ chunk_method="semantic",
70
+ chunk_size=800,
71
+ chunk_overlap=120
72
+ )
73
 
74
  llm = ChatGroq(
75
  model="meta-llama/llama-4-maverick-17b-128e-instruct",