| """ | |
| Vanilla vector search using FAISS index and OpenAI embeddings. | |
| """ | |
| import numpy as np | |
| import faiss | |
| from typing import Tuple, List, Optional | |
| from openai import OpenAI | |
| import pickle | |
| import logging | |
| from config import * | |
| from utils import EmbeddingGenerator, classify_image | |
| logger = logging.getLogger(__name__) | |
| # Initialize OpenAI client | |
| client = OpenAI(api_key=OPENAI_API_KEY) | |
| # Global variables for lazy loading | |
| _index = None | |
| _texts = None | |
| _metadata = None | |
def _load_vanilla_index():
    """Lazy load vanilla FAISS index and metadata."""
    global _index, _texts, _metadata
    if _index is None:
        try:
            if VANILLA_FAISS_INDEX.exists() and VANILLA_METADATA.exists():
                logger.info("Loading vanilla FAISS index...")
                # Load FAISS index
                _index = faiss.read_index(str(VANILLA_FAISS_INDEX))
                # Load metadata
                with open(VANILLA_METADATA, 'rb') as f:
                    data = pickle.load(f)
                if isinstance(data, list):
                    # New format with metadata list
                    _texts = [item['text'] for item in data]
                    _metadata = [item['metadata'] for item in data]
                else:
                    # Old format with dict
                    _texts = data.get('texts', [])
                    _metadata = data.get('metadata', [])
                logger.info(f"✓ Loaded vanilla index with {len(_texts)} documents")
            else:
                logger.warning("Vanilla index not found. Run preprocess.py first.")
                _index = None
                _texts = []
                _metadata = []
        except Exception as e:
            logger.error(f"Error loading vanilla index: {e}")
            _index = None
            _texts = []
            _metadata = []

def query(question: str, image_path: Optional[str] = None, top_k: Optional[int] = None) -> Tuple[str, List[dict]]:
    """
    Query using vanilla vector search.

    Args:
        question: User's question
        image_path: Optional path to an image (for multimodal queries)
        top_k: Number of relevant chunks to retrieve (defaults to DEFAULT_TOP_K)

    Returns:
        Tuple of (answer, citations)
    """
    if top_k is None:
        top_k = DEFAULT_TOP_K
    # Load index if not already loaded
    _load_vanilla_index()
    if _index is None or len(_texts) == 0:
        return "Index not loaded. Please run preprocess.py first.", []

    # Generate query embedding using embedding generator
    embedding_gen = EmbeddingGenerator()
    query_embedding = embedding_gen.embed_text_openai([question])

    # Normalize for cosine similarity
    query_embedding = query_embedding.astype(np.float32)
    faiss.normalize_L2(query_embedding)

    # Search the index
    distances, indices = _index.search(query_embedding, top_k)

    # Collect retrieved chunks and citations
    retrieved_chunks = []
    citations = []
    sources_seen = set()
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS pads missing results with index -1, so require a valid index
        # as well as a score above the relevance threshold.
        if 0 <= idx < len(_texts) and distance > MIN_RELEVANCE_SCORE:
            chunk_text = _texts[idx]
            chunk_meta = _metadata[idx]
            retrieved_chunks.append({
                'text': chunk_text,
                'score': float(distance),
                'metadata': chunk_meta
            })
            # Build citation (one per source)
            if chunk_meta['source'] not in sources_seen:
                citation = {
                    'source': chunk_meta['source'],
                    'type': chunk_meta['type'],
                    'relevance_score': round(float(distance), 3)
                }
                if chunk_meta['type'] == 'pdf':
                    citation['path'] = chunk_meta['path']
                else:  # HTML
                    citation['url'] = chunk_meta.get('url', '')
                citations.append(citation)
                sources_seen.add(chunk_meta['source'])
    # Handle image if provided
    image_context = ""
    if image_path:
        try:
            classification = classify_image(image_path)
            image_context = f"\n\n[Image Context: The provided image appears to be a {classification}.]"
        except Exception as e:
            logger.error(f"Error processing image: {e}")

    # Build context for the prompt
    context = "\n\n---\n\n".join([chunk['text'] for chunk in retrieved_chunks])
    if not context:
        return "No relevant documents found for your query.", []

    # Generate answer using OpenAI
    prompt = f"""Use the following context to answer the question:

{context}{image_context}

Question: {question}

Please provide a comprehensive answer based on the context provided. If the context doesn't contain enough information, say so."""

    # For GPT-5, temperature must be default (1.0)
    response = client.chat.completions.create(
        model=OPENAI_CHAT_MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety. Always cite your sources when providing information."},
            {"role": "user", "content": prompt}
        ],
        max_completion_tokens=DEFAULT_MAX_TOKENS
    )
    answer = response.choices[0].message.content
    return answer, citations

def query_with_feedback(question: str, feedback_scores: Optional[List[float]] = None, top_k: int = 5) -> Tuple[str, List[dict]]:
    """
    Query with relevance feedback to refine results.

    Args:
        question: User's question
        feedback_scores: Optional relevance scores for previous results
        top_k: Number of relevant chunks to retrieve

    Returns:
        Tuple of (answer, citations)
    """
    # For now, this just delegates to the regular query and ignores feedback_scores.
    # TODO: Implement Rocchio algorithm or similar for relevance feedback
    # (see the sketch below).
    return query(question, top_k=top_k)

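# --- Illustrative sketch (not part of the original module) ---
# The TODO above names the Rocchio algorithm. A minimal sketch, assuming the
# caller can supply embeddings of previously retrieved chunks split into
# relevant and non-relevant sets, might look like the helper below. The name
# `_rocchio_refine` and the default weights are assumptions for illustration
# only; they are not defined anywhere else in this codebase.
def _rocchio_refine(query_embedding: np.ndarray,
                    relevant: List[np.ndarray],
                    non_relevant: List[np.ndarray],
                    alpha: float = 1.0,
                    beta: float = 0.75,
                    gamma: float = 0.15) -> np.ndarray:
    """Return a Rocchio-refined query vector: alpha*q + beta*mean(rel) - gamma*mean(non_rel)."""
    refined = alpha * np.asarray(query_embedding, dtype=np.float32)
    if relevant:
        refined = refined + beta * np.mean(relevant, axis=0)
    if non_relevant:
        refined = refined - gamma * np.mean(non_relevant, axis=0)
    # Re-normalize so the refined vector stays compatible with the
    # cosine-similarity (normalized inner product) search used above.
    norm = np.linalg.norm(refined)
    if norm > 0:
        refined = refined / norm
    return refined.astype(np.float32)
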
if __name__ == "__main__":
    # Test the vanilla query
    test_questions = [
        "What are general machine guarding requirements?",
        "How do I perform lockout/tagout procedures?",
        "What safety measures are needed for robotic systems?"
    ]
    for q in test_questions:
        print(f"\nQuestion: {q}")
        answer, citations = query(q)
        print(f"Answer: {answer[:200]}...")
        print(f"Citations: {[c['source'] for c in citations]}")
        print("-" * 50)