Spaces:
Sleeping
Sleeping
| import os | |
| import logging | |
| import streamlit as st | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain_community.document_loaders import PyPDFLoader, TextLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
# Where the raw knowledge-base files live and where the Chroma index is persisted.
KB_PATH = "data/kb/"
CHROMA_PATH = "data/chroma_db"

# Application log: INFO and above, timestamped, appended to crisis_log.txt.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="crisis_log.txt",
)

# Only let ERROR-and-above records from the "chromadb" logger through.
# (This adjusts log verbosity only; it does not change any chromadb settings.)
logging.getLogger("chromadb").setLevel(logging.ERROR)

# Shared embedding model; the API key is read from the OPENAI_API_KEY
# environment variable (None is passed through when it is unset).
embeddings = OpenAIEmbeddings(api_key=os.environ.get("OPENAI_API_KEY"))
def load_documents():
    """Load every supported knowledge-base file and split it into chunks.

    Scans the top level of ``KB_PATH`` for ``.pdf`` and ``.txt`` files
    (extension match is case-insensitive), loads each with the matching
    LangChain loader, and splits the result with a
    ``RecursiveCharacterTextSplitter`` (1000-character chunks, 200 overlap).

    Files are processed in sorted name order so the returned chunk order is
    reproducible across runs. A file that fails to load is logged, reported
    in the Streamlit UI, and skipped; it never aborts the whole load.

    Returns:
        A list of ``Document`` chunks. If ``KB_PATH`` is not a directory, a
        single unsplit placeholder document is returned. If the directory
        yields no loadable documents, the placeholder is passed through the
        splitter like any other document.
    """
    # isdir (rather than exists) so that a stray file at KB_PATH takes the
    # fallback path instead of making os.listdir raise.
    if not os.path.isdir(KB_PATH):
        logging.error(f"Knowledge base directory {KB_PATH} does not exist")
        st.warning(f"Knowledge base directory {KB_PATH} not found. Using fallback document.")
        return [Document(page_content="No knowledge base documents available.", metadata={"source": "fallback"})]

    docs = []
    # os.listdir order is arbitrary; sort for deterministic output.
    for file_name in sorted(os.listdir(KB_PATH)):
        file_path = os.path.join(KB_PATH, file_name)
        if not os.path.isfile(file_path):
            logging.warning(f"Skipping {file_path}: Not a file")
            continue

        # Compare on a lowercased name so "REPORT.PDF" is not silently ignored.
        lowered = file_name.lower()
        try:
            if lowered.endswith(".pdf"):
                file_docs = PyPDFLoader(file_path).load()
                logging.info(f"Loaded PDF: {file_path} with {len(file_docs)} pages")
            elif lowered.endswith(".txt"):
                file_docs = TextLoader(file_path).load()
                logging.info(f"Loaded text file: {file_path} with {len(file_docs)} documents")
            else:
                # Make skipped files visible in the log instead of dropping
                # them without a trace.
                logging.info(f"Skipping {file_path}: Unsupported file type")
                continue
        except Exception as e:
            # Best-effort load: record the traceback and move on to the next file.
            logging.exception(f"Error loading {file_path}: {str(e)}")
            st.warning(f"Failed to load {file_path}. Skipping.")
            continue

        docs.extend(file_docs)

    if not docs:
        logging.warning("No documents loaded from knowledge base")
        st.warning("No valid documents found in knowledge base. Using fallback document.")
        docs = [Document(page_content="No knowledge base documents available.", metadata={"source": "fallback"})]

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)
    logging.info(f"Split {len(docs)} documents into {len(split_docs)} chunks")
    return split_docs
def setup_vectorstore(force_rebuild=False):
    """Create or load the Chroma vector store backing the knowledge base.

    Args:
        force_rebuild: When True, re-index the knowledge base even if a
            persisted store already exists at ``CHROMA_PATH``. The existing
            collection is dropped first so chunks are not duplicated.

    Returns:
        A ``Chroma`` vector store. On any failure the error is logged and
        shown in the Streamlit UI, and a placeholder store holding one dummy
        text is returned so the app can keep running. The placeholder is
        created without a persist directory so it does not touch
        ``CHROMA_PATH``.

    Note:
        Building the placeholder still calls the embedding model, so it can
        itself raise (for example when the API key is missing).
    """
    try:
        store_exists = os.path.exists(CHROMA_PATH)
        if force_rebuild or not store_exists:
            # Load first so a failed load never destroys the existing index.
            docs = load_documents()
            if not docs:
                raise ValueError("No documents available for vectorstore creation")

            if store_exists:
                # from_documents adds to whatever collection is already
                # persisted; drop it so a rebuild replaces rather than appends.
                Chroma(
                    persist_directory=CHROMA_PATH,
                    embedding_function=embeddings,
                ).delete_collection()

            # No explicit persist() call: passing persist_directory is what
            # makes the store durable — NOTE(review): langchain_chroma.Chroma
            # appears to have no persist() method; confirm against the
            # installed version.
            vectorstore = Chroma.from_documents(docs, embeddings, persist_directory=CHROMA_PATH)
            logging.info(f"Created new vectorstore at {CHROMA_PATH} with {len(docs)} documents")
        else:
            vectorstore = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
            logging.info(f"Loaded existing vectorstore from {CHROMA_PATH}")
        return vectorstore
    except Exception as e:
        logging.exception(f"Error setting up vectorstore: {str(e)}")
        st.error(f"Failed to initialize vectorstore: {str(e)}. App may have limited functionality.")
        # Return a dummy vectorstore to prevent app crash. It is deliberately
        # NOT given persist_directory: writing the dummy text into CHROMA_PATH
        # would pollute the real index and make later runs load it as if it
        # were a valid knowledge base.
        return Chroma.from_texts(
            texts=["No knowledge base available"],
            embedding=embeddings,
        )
def retrieve_context(query, k=3):
    """Retrieve the knowledge-base chunks most relevant to a query.

    Reads the vector store from ``st.session_state.vectorstore`` and asks it
    for the top ``k`` matches.

    Args:
        query: The text to search the knowledge base with.
        k: Number of chunks to request from the retriever (default 3).

    Returns:
        A list of ``Document`` objects. If retrieval fails for any reason
        (including the vector store not being present in session state), the
        error is logged and shown in the Streamlit UI, and a single
        placeholder document with ``metadata={"source": "error"}`` is
        returned instead.
    """
    try:
        retriever = st.session_state.vectorstore.as_retriever(search_kwargs={"k": k})
        # invoke() is the current retriever entry point; it replaces the
        # deprecated get_relevant_documents().
        return retriever.invoke(query)
    except Exception as e:
        # Keep the UI alive: record the traceback and hand back a placeholder.
        logging.exception(f"Error retrieving context: {str(e)}")
        st.warning(f"Failed to retrieve context: {str(e)}. Using fallback response.")
        return [Document(page_content="Unable to retrieve context.", metadata={"source": "error"})]