Wasifjafri committed on
Commit
a7e7d28
·
1 Parent(s): 42281ed
Files changed (4) hide show
  1. requirements.txt +48 -15
  2. src/embeddings.py +1 -1
  3. src/vector_store.py +16 -7
  4. streamlit_app.py +19 -1
requirements.txt CHANGED
@@ -1,15 +1,48 @@
1
- streamlit==1.38.0
2
- langchain==0.2.14
3
- langchain-community==0.2.12
4
- langchain-core==0.2.33
5
- langchain-text-splitters==0.2.2
6
- langchain-groq
7
- sentence-transformers==3.0.1
8
- faiss-cpu>=1.7.4
9
- transformers>=4.40.0
10
- huggingface-hub>=0.23.0
11
- python-dotenv==1.0.1
12
- requests==2.32.3
13
- numpy<2.0.0
14
- dotenv
15
- # Removed tiktoken (unused) to avoid rust build on HF base image.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.3.1
2
+ accelerate==1.10.1
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.15
5
+ aiosignal==1.4.0
6
+ altair==5.5.0
7
+ annotated-types==0.7.0
8
+ anyio==4.10.0
9
+ argon2-cffi==25.1.0
10
+ argon2-cffi-bindings==25.1.0
11
+ arrow==1.3.0
12
+ asttokens==3.0.0
13
+ astunparse==1.6.3
14
+ async-lru==2.0.5
15
+ async-timeout==4.0.3
16
+ attrs==25.3.0
17
+ # Minimal requirements for the RAG chatbot
18
+ # Pin only where needed; compatible with Python 3.11 on Windows
19
+
20
+ # Core app
21
+ streamlit==1.49.1
22
+ python-dotenv==1.1.1
23
+
24
+ # LangChain stack (aligned versions)
25
+ langchain==0.3.27
26
+ langchain-core==0.3.75
27
+ langchain-community==0.3.29
28
+ langchain-text-splitters==0.3.11
29
+ langchain-groq==0.3.8
30
+ langchain-huggingface==0.3.1
31
+
32
+ # Vector store and NLP
33
+ faiss-cpu==1.12.0
34
+ sentence-transformers==5.1.0
35
+ transformers==4.56.1
36
+
37
+ # Data + utils
38
+ pandas==2.3.2
39
+ numpy==1.26.4
40
+ requests==2.32.5
41
+
42
+ # Optional semantic splitter (app gracefully falls back if missing)
43
+ semantic-text-splitter==0.27.0
44
+
45
+ # Dataset fetcher
46
+ kagglehub==0.3.13
47
+
48
+
src/embeddings.py CHANGED
@@ -1,4 +1,4 @@
1
- from langchain_community.embeddings import HuggingFaceEmbeddings
2
  from .config import EMBEDDING_MODEL, DEVICE
3
 
4
  def get_embedding_model():
 
1
+ from langchain_huggingface import HuggingFaceEmbeddings
2
  from .config import EMBEDDING_MODEL, DEVICE
3
 
4
  def get_embedding_model():
src/vector_store.py CHANGED
@@ -62,13 +62,22 @@ def build_or_load_vectorstore(
62
  ):
63
  if os.path.exists(FAISS_INDEX_PATH) and not force_rebuild:
64
  print(f"Loading existing FAISS index from {FAISS_INDEX_PATH}...")
65
- vectorstore = FAISS.load_local(
66
- FAISS_INDEX_PATH,
67
- get_embedding_model(),
68
- allow_dangerous_deserialization=True
69
- )
70
- print("Vector store loaded successfully.")
71
- return vectorstore
 
 
 
 
 
 
 
 
 
72
 
73
  print("Building FAISS index (force_rebuild=%s, method=%s)..." % (force_rebuild, chunk_method))
74
  splits = _chunk_documents(
 
62
  ):
63
  if os.path.exists(FAISS_INDEX_PATH) and not force_rebuild:
64
  print(f"Loading existing FAISS index from {FAISS_INDEX_PATH}...")
65
+ try:
66
+ vectorstore = FAISS.load_local(
67
+ FAISS_INDEX_PATH,
68
+ get_embedding_model(),
69
+ allow_dangerous_deserialization=True
70
+ )
71
+ print("Vector store loaded successfully.")
72
+ return vectorstore
73
+ except Exception as e:
74
+ print(f"Failed to load FAISS index due to: {e}")
75
+ if not documents:
76
+ raise RuntimeError(
77
+ "Existing FAISS index is incompatible with current libraries and no documents were "
78
+ "provided to rebuild it. Delete 'faiss_index' and rebuild, or pass documents to rebuild."
79
+ ) from e
80
+ print("Rebuilding FAISS index from provided documents...")
81
 
82
  print("Building FAISS index (force_rebuild=%s, method=%s)..." % (force_rebuild, chunk_method))
83
  splits = _chunk_documents(
streamlit_app.py CHANGED
@@ -51,7 +51,25 @@ if rebuild or not os.path.exists(FAISS_INDEX_PATH):
51
  chunk_overlap=120
52
  )
53
  else:
54
- vectorstore = build_or_load_vectorstore([], force_rebuild=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  llm = ChatGroq(
57
  model="meta-llama/llama-4-maverick-17b-128e-instruct",
 
51
  chunk_overlap=120
52
  )
53
  else:
54
+ try:
55
+ vectorstore = build_or_load_vectorstore([], force_rebuild=False)
56
+ except Exception as e:
57
+ st.warning(f"Failed to load existing index: {e}. Attempting to rebuild from dataset...")
58
+ data_file = os.path.join(DATA_PATH, "arxiv-metadata-oai-snapshot.json")
59
+ if not os.path.exists(data_file):
60
+ st.error("Dataset missing. Run main pipeline first or click 'Rebuild index'.")
61
+ st.stop()
62
+ with st.spinner("Rebuilding vector index after load failure..."):
63
+ df = load_data_subset(data_file, num_records=50000)
64
+ df = preprocess_dataframe(df)
65
+ docs = df_to_documents(df)
66
+ vectorstore = build_or_load_vectorstore(
67
+ docs,
68
+ force_rebuild=True,
69
+ chunk_method="semantic",
70
+ chunk_size=800,
71
+ chunk_overlap=120
72
+ )
73
 
74
  llm = ChatGroq(
75
  model="meta-llama/llama-4-maverick-17b-128e-instruct",