fmegahed committed on
Commit ef821d9 · verified · 1 Parent(s): 94d4926

version 2.0.0

Files changed (14)
  1. Dockerfile +67 -26
  2. analytics_db.py +509 -0
  3. app.py +1298 -47
  4. config.py +217 -0
  5. preprocess.py +187 -77
  6. query_bm25.py +336 -0
  7. query_context.py +334 -0
  8. query_dpr.py +279 -0
  9. query_graph.py +372 -99
  10. query_vanilla.py +197 -0
  11. query_vision.py +393 -0
  12. realtime_server.py +402 -0
  13. requirements.txt +63 -10
  14. utils.py +679 -0
Dockerfile CHANGED
@@ -1,26 +1,67 @@
- FROM python:3.12.11-slim
-
- # 1) Create and switch to the app directory
- WORKDIR /app
-
- # 2) Install system dependencies
- RUN apt-get update && \
- apt-get install -y build-essential curl git && \
- rm -rf /var/lib/apt/lists/*
-
- # 3) Copy & install Python dependencies
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
-
- # 4) Copy all your code and data into the container
- COPY . .
-
- # 5) Expose Streamlit’s default port
- EXPOSE 8501
-
- # 6) Healthcheck for Streamlit
- HEALTHCHECK --interval=30s --timeout=5s \
- CMD curl --fail http://localhost:8501/_stcore/health || exit 1
-
- # 7) Launch your app.py at root
- ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ FROM python:3.12.11-slim
+
+ # Set environment variables for Python
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+
+ # Create app directory and switch to it
+ WORKDIR /app
+
+ # Install system dependencies required for your packages
+ RUN apt-get update && \
+ apt-get install -y \
+ build-essential \
+ curl \
+ git \
+ libgomp1 \
+ supervisor \
+ && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better Docker layer caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir -r requirements.txt
+
+ # Copy the entire application
+ COPY . .
+
+ # Create necessary directories if they don't exist
+ RUN mkdir -p /app/data /app/embeddings /app/graph /app/metadata /var/log/supervisor
+
+ # Create supervisor configuration
+ COPY <<EOF /etc/supervisor/conf.d/supervisord.conf
+ [supervisord]
+ nodaemon=true
+ logfile=/var/log/supervisor/supervisord.log
+ pidfile=/var/run/supervisord.pid
+
+ [program:realtime_server]
+ command=python realtime_server.py --port=7861 --host=0.0.0.0
+ directory=/app
+ autostart=true
+ autorestart=true
+ stderr_logfile=/var/log/supervisor/realtime_server.err.log
+ stdout_logfile=/var/log/supervisor/realtime_server.out.log
+ priority=100
+
+ [program:streamlit]
+ command=streamlit run app.py --server.port=7860 --server.address=0.0.0.0 --server.enableXsrfProtection=false --server.enableCORS=false
+ directory=/app
+ autostart=true
+ autorestart=true
+ stderr_logfile=/var/log/supervisor/streamlit.err.log
+ stdout_logfile=/var/log/supervisor/streamlit.out.log
+ priority=200
+ EOF
+
+ # Expose both ports (Streamlit on 7860, Realtime API on 7861)
+ EXPOSE 7860 7861
+
+ # Health check for both services
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+ CMD curl --fail http://localhost:7860/_stcore/health && curl --fail http://localhost:7861/health || exit 1
+
+ # Use supervisor to run both services
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
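The HEALTHCHECK above gates container health on both services responding. A minimal sketch of the same probe run from the host, assuming the container is started with ports 7860 and 7861 published on localhost (the `requests` package is already listed in requirements.txt; the helper name is illustrative):

```python
# Hypothetical host-side health probe mirroring the Dockerfile HEALTHCHECK.
# Assumes the container was started with: -p 7860:7860 -p 7861:7861
import requests

def container_healthy(timeout: float = 5.0) -> bool:
    """Return True only if both the Streamlit UI and the realtime API answer."""
    endpoints = [
        "http://localhost:7860/_stcore/health",  # Streamlit health endpoint
        "http://localhost:7861/health",          # realtime_server.py health endpoint
    ]
    try:
        return all(requests.get(url, timeout=timeout).status_code == 200
                   for url in endpoints)
    except requests.RequestException:
        return False

if __name__ == "__main__":
    print("healthy" if container_healthy() else "unhealthy")
```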
analytics_db.py ADDED
@@ -0,0 +1,509 @@
1
+ """
2
+ Analytics Database Module for Query Logging and Performance Tracking.
3
+ Tracks every query, method, answer, and citation for comprehensive analytics.
4
+ """
5
+
6
+ import sqlite3
7
+ import json
8
+ import time
9
+ from datetime import datetime, timedelta
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ from pathlib import Path
12
+ import logging
13
+
14
+ from config import DATA_DIR
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # Database path
19
+ ANALYTICS_DB = DATA_DIR / "analytics.db"
20
+
21
+ class AnalyticsDB:
22
+ """Database manager for query analytics and logging."""
23
+
24
+ def __init__(self):
25
+ self.db_path = ANALYTICS_DB
26
+ self._init_database()
27
+
28
+ def _init_database(self):
29
+ """Initialize analytics database with required tables."""
30
+ conn = sqlite3.connect(self.db_path)
31
+ cursor = conn.cursor()
32
+
33
+ # Main queries table
34
+ cursor.execute('''
35
+ CREATE TABLE IF NOT EXISTS queries (
36
+ query_id INTEGER PRIMARY KEY AUTOINCREMENT,
37
+ timestamp TEXT NOT NULL,
38
+ user_query TEXT NOT NULL,
39
+ retrieval_method TEXT NOT NULL,
40
+ answer TEXT NOT NULL,
41
+ response_time_ms REAL,
42
+ num_citations INTEGER DEFAULT 0,
43
+ image_path TEXT,
44
+ error_message TEXT,
45
+ top_k_used INTEGER DEFAULT 5,
46
+ additional_settings TEXT,
47
+ answer_length INTEGER,
48
+ session_id TEXT
49
+ )
50
+ ''')
51
+
52
+ # Citations table
53
+ cursor.execute('''
54
+ CREATE TABLE IF NOT EXISTS citations (
55
+ citation_id INTEGER PRIMARY KEY AUTOINCREMENT,
56
+ query_id INTEGER NOT NULL,
57
+ source TEXT NOT NULL,
58
+ citation_type TEXT,
59
+ relevance_score REAL,
60
+ bm25_score REAL,
61
+ rerank_score REAL,
62
+ similarity_score REAL,
63
+ url TEXT,
64
+ path TEXT,
65
+ rank INTEGER,
66
+ FOREIGN KEY (query_id) REFERENCES queries (query_id)
67
+ )
68
+ ''')
69
+
70
+ # Performance metrics table
71
+ cursor.execute('''
72
+ CREATE TABLE IF NOT EXISTS performance_metrics (
73
+ metric_id INTEGER PRIMARY KEY AUTOINCREMENT,
74
+ query_id INTEGER NOT NULL,
75
+ retrieval_time_ms REAL,
76
+ generation_time_ms REAL,
77
+ total_time_ms REAL,
78
+ chunks_retrieved INTEGER,
79
+ tokens_estimated INTEGER,
80
+ FOREIGN KEY (query_id) REFERENCES queries (query_id)
81
+ )
82
+ ''')
83
+
84
+ conn.commit()
85
+ conn.close()
86
+ logger.info("Analytics database initialized")
87
+
88
+ def log_query(self, user_query: str, method: str, answer: str,
89
+ citations: List[Dict], response_time: float = None,
90
+ image_path: str = None, error_message: str = None,
91
+ top_k: int = 5, additional_settings: Dict = None,
92
+ session_id: str = None) -> int:
93
+ """
94
+ Log a complete query interaction.
95
+
96
+ Args:
97
+ user_query: The user's question
98
+ method: Retrieval method used
99
+ answer: Generated answer
100
+ citations: List of citation dictionaries
101
+ response_time: Time taken in milliseconds
102
+ image_path: Path to uploaded image (if any)
103
+ error_message: Error message (if any)
104
+ top_k: Number of chunks retrieved
105
+ additional_settings: Method-specific settings
106
+ session_id: Session identifier
107
+
108
+ Returns:
109
+ query_id: The ID of the logged query
110
+ """
111
+ conn = sqlite3.connect(self.db_path)
112
+ cursor = conn.cursor()
113
+
114
+ try:
115
+ # Insert main query record
116
+ cursor.execute('''
117
+ INSERT INTO queries (
118
+ timestamp, user_query, retrieval_method, answer,
119
+ response_time_ms, num_citations, image_path, error_message,
120
+ top_k_used, additional_settings, answer_length, session_id
121
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
122
+ ''', (
123
+ datetime.now().isoformat(),
124
+ user_query,
125
+ method,
126
+ answer,
127
+ response_time,
128
+ len(citations),
129
+ image_path,
130
+ error_message,
131
+ top_k,
132
+ json.dumps(additional_settings) if additional_settings else None,
133
+ len(answer),
134
+ session_id
135
+ ))
136
+
137
+ query_id = cursor.lastrowid
138
+
139
+ # Insert citations
140
+ for rank, citation in enumerate(citations, 1):
141
+ cursor.execute('''
142
+ INSERT INTO citations (
143
+ query_id, source, citation_type, relevance_score,
144
+ bm25_score, rerank_score, similarity_score, url, path, rank
145
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
146
+ ''', (
147
+ query_id,
148
+ citation.get('source', ''),
149
+ citation.get('type', ''),
150
+ citation.get('relevance_score'),
151
+ citation.get('bm25_score'),
152
+ citation.get('rerank_score'),
153
+ citation.get('similarity_score'),
154
+ citation.get('url'),
155
+ citation.get('path'),
156
+ rank
157
+ ))
158
+
159
+ conn.commit()
160
+ logger.info(f"Logged query {query_id} with {len(citations)} citations")
161
+ return query_id
162
+
163
+ except Exception as e:
164
+ logger.error(f"Error logging query: {e}")
165
+ conn.rollback()
166
+ return None
167
+ finally:
168
+ conn.close()
169
+
170
+ def get_query_stats(self, days: int = 30) -> Dict[str, Any]:
171
+ """Get comprehensive query statistics."""
172
+ conn = sqlite3.connect(self.db_path)
173
+ cursor = conn.cursor()
174
+
175
+ since_date = (datetime.now() - timedelta(days=days)).isoformat()
176
+
177
+ try:
178
+ stats = {}
179
+
180
+ # Total queries
181
+ cursor.execute('''
182
+ SELECT COUNT(*) FROM queries
183
+ WHERE timestamp >= ?
184
+ ''', (since_date,))
185
+ stats['total_queries'] = cursor.fetchone()[0]
186
+
187
+ # Method usage
188
+ cursor.execute('''
189
+ SELECT retrieval_method, COUNT(*) as count
190
+ FROM queries
191
+ WHERE timestamp >= ?
192
+ GROUP BY retrieval_method
193
+ ORDER BY count DESC
194
+ ''', (since_date,))
195
+ stats['method_usage'] = dict(cursor.fetchall())
196
+
197
+ # Average response times by method
198
+ cursor.execute('''
199
+ SELECT retrieval_method, AVG(response_time_ms) as avg_time
200
+ FROM queries
201
+ WHERE timestamp >= ? AND response_time_ms IS NOT NULL
202
+ GROUP BY retrieval_method
203
+ ''', (since_date,))
204
+ stats['avg_response_times'] = dict(cursor.fetchall())
205
+
206
+ # Citation statistics
207
+ cursor.execute('''
208
+ SELECT AVG(num_citations) as avg_citations,
209
+ SUM(num_citations) as total_citations
210
+ FROM queries
211
+ WHERE timestamp >= ?
212
+ ''', (since_date,))
213
+ result = cursor.fetchone()
214
+ stats['avg_citations'] = result[0] or 0
215
+ stats['total_citations'] = result[1] or 0
216
+
217
+ # Citation types
218
+ cursor.execute('''
219
+ SELECT c.citation_type, COUNT(*) as count
220
+ FROM citations c
221
+ JOIN queries q ON c.query_id = q.query_id
222
+ WHERE q.timestamp >= ?
223
+ GROUP BY c.citation_type
224
+ ORDER BY count DESC
225
+ ''', (since_date,))
226
+ stats['citation_types'] = dict(cursor.fetchall())
227
+
228
+ # Error rate
229
+ cursor.execute('''
230
+ SELECT
231
+ COUNT(CASE WHEN error_message IS NOT NULL THEN 1 END) as errors,
232
+ COUNT(*) as total
233
+ FROM queries
234
+ WHERE timestamp >= ?
235
+ ''', (since_date,))
236
+ result = cursor.fetchone()
237
+ stats['error_rate'] = (result[0] / result[1]) * 100 if result[1] > 0 else 0
238
+
239
+ # Most common query topics (simple word analysis)
240
+ cursor.execute('''
241
+ SELECT user_query FROM queries
242
+ WHERE timestamp >= ?
243
+ ''', (since_date,))
244
+ queries = [row[0].lower() for row in cursor.fetchall()]
245
+
246
+ # Simple keyword extraction
247
+ keywords = {}
248
+ for query in queries:
249
+ words = [word for word in query.split() if len(word) > 3]
250
+ for word in words:
251
+ keywords[word] = keywords.get(word, 0) + 1
252
+
253
+ # Top 10 keywords
254
+ stats['top_keywords'] = dict(sorted(keywords.items(),
255
+ key=lambda x: x[1],
256
+ reverse=True)[:10])
257
+
258
+ return stats
259
+
260
+ except Exception as e:
261
+ logger.error(f"Error getting query stats: {e}")
262
+ return {}
263
+ finally:
264
+ conn.close()
265
+
266
+ def get_method_performance(self) -> Dict[str, Dict[str, float]]:
267
+ """Get detailed performance metrics by method."""
268
+ conn = sqlite3.connect(self.db_path)
269
+ cursor = conn.cursor()
270
+
271
+ try:
272
+ cursor.execute('''
273
+ SELECT
274
+ retrieval_method,
275
+ AVG(response_time_ms) as avg_response_time,
276
+ AVG(num_citations) as avg_citations,
277
+ AVG(answer_length) as avg_answer_length,
278
+ COUNT(*) as query_count
279
+ FROM queries
280
+ WHERE response_time_ms IS NOT NULL
281
+ GROUP BY retrieval_method
282
+ ''')
283
+
284
+ results = {}
285
+ for row in cursor.fetchall():
286
+ method, avg_time, avg_cites, avg_length, count = row
287
+ results[method] = {
288
+ 'avg_response_time': avg_time,
289
+ 'avg_citations': avg_cites,
290
+ 'avg_answer_length': avg_length,
291
+ 'query_count': count
292
+ }
293
+
294
+ return results
295
+
296
+ except Exception as e:
297
+ logger.error(f"Error getting method performance: {e}")
298
+ return {}
299
+ finally:
300
+ conn.close()
301
+
302
+ def get_recent_queries(self, limit: int = 20, include_answers: bool = True) -> List[Dict[str, Any]]:
303
+ """Get recent queries with basic information and optionally full answers."""
304
+ conn = sqlite3.connect(self.db_path)
305
+ cursor = conn.cursor()
306
+
307
+ try:
308
+ if include_answers:
309
+ cursor.execute('''
310
+ SELECT query_id, timestamp, user_query, retrieval_method,
311
+ answer, answer_length, num_citations, response_time_ms, error_message
312
+ FROM queries
313
+ ORDER BY timestamp DESC
314
+ LIMIT ?
315
+ ''', (limit,))
316
+
317
+ columns = ['query_id', 'timestamp', 'query', 'method',
318
+ 'answer', 'answer_length', 'citations', 'response_time', 'error_message']
319
+ else:
320
+ cursor.execute('''
321
+ SELECT query_id, timestamp, user_query, retrieval_method,
322
+ answer_length, num_citations, response_time_ms
323
+ FROM queries
324
+ ORDER BY timestamp DESC
325
+ LIMIT ?
326
+ ''', (limit,))
327
+
328
+ columns = ['query_id', 'timestamp', 'query', 'method',
329
+ 'answer_length', 'citations', 'response_time']
330
+
331
+ return [dict(zip(columns, row)) for row in cursor.fetchall()]
332
+
333
+ except Exception as e:
334
+ logger.error(f"Error getting recent queries: {e}")
335
+ return []
336
+ finally:
337
+ conn.close()
338
+
339
+ def get_query_with_citations(self, query_id: int) -> Dict[str, Any]:
340
+ """Get full query details including citations."""
341
+ conn = sqlite3.connect(self.db_path)
342
+ cursor = conn.cursor()
343
+
344
+ try:
345
+ # Get query details
346
+ cursor.execute('''
347
+ SELECT query_id, timestamp, user_query, retrieval_method, answer,
348
+ response_time_ms, num_citations, error_message, top_k_used
349
+ FROM queries WHERE query_id = ?
350
+ ''', (query_id,))
351
+
352
+ query_row = cursor.fetchone()
353
+ if not query_row:
354
+ return {}
355
+
356
+ query_data = {
357
+ 'query_id': query_row[0],
358
+ 'timestamp': query_row[1],
359
+ 'user_query': query_row[2],
360
+ 'method': query_row[3],
361
+ 'answer': query_row[4],
362
+ 'response_time': query_row[5],
363
+ 'num_citations': query_row[6],
364
+ 'error_message': query_row[7],
365
+ 'top_k_used': query_row[8]
366
+ }
367
+
368
+ # Get citations
369
+ cursor.execute('''
370
+ SELECT source, citation_type, relevance_score, bm25_score,
371
+ rerank_score, similarity_score, url, path, rank
372
+ FROM citations WHERE query_id = ?
373
+ ORDER BY rank
374
+ ''', (query_id,))
375
+
376
+ citations = []
377
+ for row in cursor.fetchall():
378
+ citation = {
379
+ 'source': row[0],
380
+ 'type': row[1],
381
+ 'relevance_score': row[2],
382
+ 'bm25_score': row[3],
383
+ 'rerank_score': row[4],
384
+ 'similarity_score': row[5],
385
+ 'url': row[6],
386
+ 'path': row[7],
387
+ 'rank': row[8]
388
+ }
389
+ citations.append(citation)
390
+
391
+ query_data['citations'] = citations
392
+ return query_data
393
+
394
+ except Exception as e:
395
+ logger.error(f"Error getting query with citations: {e}")
396
+ return {}
397
+ finally:
398
+ conn.close()
399
+
400
+ def get_query_trends(self, days: int = 30) -> Dict[str, List[Tuple[str, int]]]:
401
+ """Get query trends over time."""
402
+ conn = sqlite3.connect(self.db_path)
403
+ cursor = conn.cursor()
404
+
405
+ since_date = (datetime.now() - timedelta(days=days)).isoformat()
406
+
407
+ try:
408
+ # Queries per day
409
+ cursor.execute('''
410
+ SELECT DATE(timestamp) as date, COUNT(*) as count
411
+ FROM queries
412
+ WHERE timestamp >= ?
413
+ GROUP BY DATE(timestamp)
414
+ ORDER BY date
415
+ ''', (since_date,))
416
+
417
+ daily_queries = cursor.fetchall()
418
+
419
+ # Method usage trends
420
+ cursor.execute('''
421
+ SELECT DATE(timestamp) as date, retrieval_method, COUNT(*) as count
422
+ FROM queries
423
+ WHERE timestamp >= ?
424
+ GROUP BY DATE(timestamp), retrieval_method
425
+ ORDER BY date, retrieval_method
426
+ ''', (since_date,))
427
+
428
+ method_trends = {}
429
+ for date, method, count in cursor.fetchall():
430
+ if method not in method_trends:
431
+ method_trends[method] = []
432
+ method_trends[method].append((date, count))
433
+
434
+ return {
435
+ 'daily_queries': daily_queries,
436
+ 'method_trends': method_trends
437
+ }
438
+
439
+ except Exception as e:
440
+ logger.error(f"Error getting query trends: {e}")
441
+ return {}
442
+ finally:
443
+ conn.close()
444
+
445
+ def get_voice_interaction_stats(self) -> Dict[str, Any]:
446
+ """Get statistics about voice interactions."""
447
+ try:
448
+ conn = sqlite3.connect(self.db_path)
449
+ cursor = conn.cursor()
450
+
451
+ # Count voice interactions (those with voice_interaction=true in additional_settings)
452
+ cursor.execute('''
453
+ SELECT COUNT(*) as total_voice_queries
454
+ FROM queries
455
+ WHERE additional_settings LIKE '%voice_interaction%'
456
+ OR session_id LIKE 'voice_%'
457
+ ''')
458
+ result = cursor.fetchone()
459
+ total_voice = result[0] if result else 0
460
+
461
+ # Get voice queries by method
462
+ cursor.execute('''
463
+ SELECT retrieval_method, COUNT(*) as count
464
+ FROM queries
465
+ WHERE additional_settings LIKE '%voice_interaction%'
466
+ OR session_id LIKE 'voice_%'
467
+ GROUP BY retrieval_method
468
+ ''')
469
+ voice_by_method = dict(cursor.fetchall())
470
+
471
+ # Average response time for voice queries
472
+ cursor.execute('''
473
+ SELECT AVG(response_time_ms) as avg_response_time
474
+ FROM queries
475
+ WHERE (additional_settings LIKE '%voice_interaction%'
476
+ OR session_id LIKE 'voice_%')
477
+ AND response_time_ms IS NOT NULL
478
+ ''')
479
+ result = cursor.fetchone()
480
+ avg_response_time = result[0] if result and result[0] else 0
481
+
482
+ return {
483
+ 'total_voice_queries': total_voice,
484
+ 'voice_by_method': voice_by_method,
485
+ 'avg_voice_response_time': avg_response_time
486
+ }
487
+
488
+ except Exception as e:
489
+ logger.error(f"Error getting voice interaction stats: {e}")
490
+ return {}
491
+ finally:
492
+ conn.close()
493
+
494
+ # Global instance
495
+ analytics_db = AnalyticsDB()
496
+
497
+ # Convenience functions
498
+ def log_query(user_query: str, method: str, answer: str, citations: List[Dict],
499
+ **kwargs) -> int:
500
+ """Log a query to the analytics database."""
501
+ return analytics_db.log_query(user_query, method, answer, citations, **kwargs)
502
+
503
+ def get_analytics_stats(days: int = 30) -> Dict[str, Any]:
504
+ """Get analytics statistics."""
505
+ return analytics_db.get_query_stats(days)
506
+
507
+ def get_method_performance() -> Dict[str, Dict[str, float]]:
508
+ """Get method performance metrics."""
509
+ return analytics_db.get_method_performance()
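A minimal usage sketch for the module above (all values are illustrative; the citation keys mirror the columns written by `log_query`, and `DATA_DIR` must exist as configured in config.py):

```python
# Hypothetical usage of analytics_db.py; all literals below are sample data.
from analytics_db import log_query, get_analytics_stats, get_method_performance

query_id = log_query(
    user_query="How do I perform lockout/tagout?",
    method="bm25",
    answer="Lockout/tagout generally requires ...",
    citations=[{"source": "osha_1910_147.pdf", "type": "pdf", "bm25_score": 12.3}],
    response_time=850.0,   # milliseconds
    top_k=5,
    session_id="demo-session",
)
print(f"Logged query {query_id}")

print(get_analytics_stats(days=7))   # aggregate stats for the last week
print(get_method_performance())      # per-method response time / citation averages
```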
app.py CHANGED
@@ -1,82 +1,1333 @@
1
  import streamlit as st
2
- from query_graph import query_graph
3
 
4
- # Helper for <details>
5
  def format_citations_html(chunks):
 
6
  html = []
7
  for idx, (hdr, sc, txt, citation) in enumerate(chunks, start=1):
8
- preamble = (
9
- f"<p style='font-size:0.9em;'><strong>Preamble:</strong> "
10
- f"The text in the following detail is reproduced from [{citation}]. "
11
- f"It had a cosine similarity of {sc:.2f} with the user question, "
12
- f"and it ranked {idx} among the text chunks in our graph database.</p>"
13
- )
14
  body = txt.replace("\n", "<br>")
15
  html.append(
16
  f"<details>"
17
- f"<summary>{hdr} (cosine similarity: {sc:.2f})</summary>"
18
  f"<div style='font-size:0.9em; margin-top:0.5em;'>"
19
- f"<strong>Preamble:</strong> The text below is reproduced from {citation}. "
20
  f"</div>"
21
- f"<div style='font-size:0.7em; margin-left:1em; margin-top:0.5em;'>{body}</div>"
22
  f"</details><br><br>"
23
  )
24
  return "<br>".join(html)
25
 
26
 
27
  # Sidebar configuration
28
- st.sidebar.title("About")
29
  st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
30
- st.sidebar.markdown("**Version:** V. 0.0.2")
31
- st.sidebar.markdown("**Date:** July 24, 2025")
32
- st.sidebar.markdown("**Model:** gpt4o")
33
 
34
  st.sidebar.markdown("---")
35
  st.sidebar.markdown(
36
  "**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
37
  )
38
 
 
 
 
 
 
39
 
40
- # Main interface
41
- st.set_page_config(page_title="Miami University's SIGHT Chatbot")
42
- st.title("Chat with SIGHT")
43
- st.write("Ask questions about machine safeguarding, LOTO, and hazard prevention based on OSHA/CFR's corpus.")
 
 
 
 
 
44
 
45
- # Example questions toggled in main window
46
- with st.expander("Example Questions", expanded=False):
47
- st.markdown(
48
- "- What are general machine guarding requirements? \n"
49
- "- How do I perform lockout/tagout? \n"
50
- "- Summarize the definition of machine guarding from 29 CFR 1910.211"
51
  )
52
 
53
- # Initialize chat history
54
- if 'history' not in st.session_state:
55
- st.session_state.history = []
56
 
57
- # User input
58
- query = st.text_input("Your question:")
59
- if st.button("Send") and query:
60
- answer, sources, chunks = query_graph(query)
61
- st.session_state.history.append({
62
- 'query': query,
63
- 'answer': answer,
64
- 'sources': sources,
65
- 'chunks': chunks
66
- })
67
 
68
- # Display chat history
69
- for entry in st.session_state.history[::-1]:
70
- st.markdown(f"**You:** {entry['query']}")
71
- st.markdown(f"**Assistant:** {entry['answer']}")
72
- st.markdown(format_citations_html(entry['chunks']), unsafe_allow_html=True)
 
 
 
 
73
 
74
 
75
  # Footer
76
  st.markdown("---")
77
  st.markdown(
78
- "**Disclaimer:** *Powered by a Graph RAG to reduce hallucinations; please verify as it can still make mistakes.*"
79
- )
 
80
  st.markdown(
81
- "**Funding:** *We are thankful for [Ohio BWC/WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)'s funding that made this chat bot possible.*"
82
- )
 
 
1
+ """
2
+ Multi-Method RAG System - SIGHT
3
+ Enhanced Streamlit application with method comparison and analytics.
4
+
5
+ Directory structure:
6
+ /data/ # Original PDFs, HTML
7
+ /embeddings/ # FAISS, Chroma, DPR vector stores
8
+ /graph/ # Graph database files
9
+ /metadata/ # Image metadata (SQLite or MongoDB)
10
+ """
11
+
12
  import streamlit as st
13
+ import os
14
+ import logging
15
+ import tempfile
16
+ import time
17
+ import uuid
18
+ from typing import Tuple, List, Dict, Any, Optional
19
+ from pathlib import Path
20
+
21
+ # Import all query modules
22
+ from query_graph import query as graph_query, query_graph
23
+ from query_vanilla import query as vanilla_query
24
+ from query_dpr import query as dpr_query
25
+ from query_bm25 import query as bm25_query
26
+ from query_context import query as context_query
27
+ from query_vision import query as vision_query, query_image_only
28
+
29
+ from config import *
30
+ from analytics_db import log_query, get_analytics_stats, get_method_performance, analytics_db
31
+ import streamlit.components.v1 as components
32
+ import requests
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Check realtime server health
37
+ @st.cache_data(ttl=30) # Cache for 30 seconds
38
+ def check_realtime_server_health():
39
+ """Check if the realtime server is running."""
40
+ try:
41
+ response = requests.get("http://localhost:7861/health", timeout=2)
42
+ return response.status_code == 200
43
+ except:
44
+ return False
45
+
46
+ # Query method dispatch
47
+ QUERY_DISPATCH = {
48
+ 'graph': graph_query,
49
+ 'vanilla': vanilla_query,
50
+ 'dpr': dpr_query,
51
+ 'bm25': bm25_query,
52
+ 'context': context_query,
53
+ 'vision': vision_query
54
+ }
55
+
56
+ # Method options for speech interface
57
+ METHOD_OPTIONS = ['graph', 'vanilla', 'dpr', 'bm25', 'context', 'vision']
58
 
 
59
  def format_citations_html(chunks):
60
+ """Format citations for display (backward compatibility)."""
61
  html = []
62
  for idx, (hdr, sc, txt, citation) in enumerate(chunks, start=1):
 
 
 
 
 
 
63
  body = txt.replace("\n", "<br>")
64
  html.append(
65
  f"<details>"
66
+ f"<summary>{hdr} (relevance score: {sc:.3f})</summary>"
67
  f"<div style='font-size:0.9em; margin-top:0.5em;'>"
68
+ f"<strong>Source:</strong> {citation} "
69
  f"</div>"
70
+ f"<div style='font-size:0.8em; margin-left:1em; margin-top:0.5em;'>{body}</div>"
71
  f"</details><br><br>"
72
  )
73
  return "<br>".join(html)
74
 
75
+ def format_citations_html(citations: List[dict], method: str) -> str:
76
+ """Format citations as HTML based on method and citation type."""
77
+ if not citations:
78
+ return "<p><em>No citations available</em></p>"
79
+
80
+ html_parts = ["<div style='margin-top: 1em;'><strong>Sources:</strong><ul>"]
81
+
82
+ for citation in citations:
83
+ # Skip citations without source
84
+ if 'source' not in citation:
85
+ continue
86
+
87
+ source = citation['source']
88
+ cite_type = citation.get('type', 'unknown')
89
+
90
+ # Build citation text based on type
91
+ if cite_type == 'pdf':
92
+ cite_text = f"📄 {source} (PDF)"
93
+ elif cite_type == 'html':
94
+ url = citation.get('url', '')
95
+ if url:
96
+ cite_text = f"🌐 <a href='{url}' target='_blank'>{source}</a> (Web)"
97
+ else:
98
+ cite_text = f"🌐 {source} (Web)"
99
+ elif cite_type == 'image':
100
+ page = citation.get('page', 'N/A')
101
+ cite_text = f"🖼️ {source} (Image, page {page})"
102
+ elif cite_type == 'image_analysis':
103
+ classification = citation.get('classification', 'N/A')
104
+ cite_text = f"🔍 {source} - {classification}"
105
+ else:
106
+ cite_text = f"📚 {source}"
107
+
108
+ # Add scores if available
109
+ scores = []
110
+ if 'relevance_score' in citation:
111
+ scores.append(f"relevance: {citation['relevance_score']}")
112
+ if 'bm25_score' in citation:
113
+ scores.append(f"BM25: {citation['bm25_score']}")
114
+ if 'rerank_score' in citation:
115
+ scores.append(f"rerank: {citation['rerank_score']}")
116
+ if 'similarity' in citation:
117
+ scores.append(f"similarity: {citation['similarity']}")
118
+ if 'score' in citation:
119
+ scores.append(f"score: {citation['score']:.3f}")
120
+
121
+ if scores:
122
+ cite_text += f" <small>({', '.join(scores)})</small>"
123
+
124
+ html_parts.append(f"<li>{cite_text}</li>")
125
+
126
+ html_parts.append("</ul></div>")
127
+ return "".join(html_parts)
128
+
129
+ def save_uploaded_file(uploaded_file) -> str:
130
+ """Save uploaded file to temporary location."""
131
+ try:
132
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp_file:
133
+ tmp_file.write(uploaded_file.getvalue())
134
+ return tmp_file.name
135
+ except Exception as e:
136
+ st.error(f"Error saving file: {e}")
137
+ return None
138
+
139
+
140
+ # Page configuration
141
+ st.set_page_config(
142
+ page_title="Multi-Method RAG System - SIGHT",
143
+ page_icon="🔍",
144
+ layout="wide"
145
+ )
146
 
147
  # Sidebar configuration
148
+ st.sidebar.title("Configuration")
149
+
150
+ # Method selector
151
+ st.sidebar.markdown("### Retrieval Method")
152
+ selected_method = st.sidebar.radio(
153
+ "Choose retrieval method:",
154
+ options=['graph', 'vanilla', 'dpr', 'bm25', 'context', 'vision'],
155
+ format_func=lambda x: x.capitalize(),
156
+ help="Select different RAG methods to compare results"
157
+ )
158
+
159
+ # Display method description
160
+ st.sidebar.info(METHOD_DESCRIPTIONS[selected_method])
161
+
162
+
163
+ # Advanced settings
164
+ with st.sidebar.expander("Advanced Settings"):
165
+ top_k = st.slider("Number of chunks to retrieve", min_value=1, max_value=10, value=DEFAULT_TOP_K)
166
+
167
+ if selected_method == 'bm25':
168
+ use_hybrid = st.checkbox("Use hybrid search (BM25 + semantic)", value=False)
169
+ if use_hybrid:
170
+ alpha = st.slider("BM25 weight (alpha)", min_value=0.0, max_value=1.0, value=0.5)
171
+
172
+ # Sidebar info
173
+
174
+ st.sidebar.markdown("---")
175
+ st.sidebar.markdown("### About")
176
  st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
177
+ st.sidebar.markdown(f"**Version:** V. {VERSION}")
178
+ st.sidebar.markdown(f"**Date:** {DATE}")
179
+ st.sidebar.markdown(f"**Model:** {OPENAI_CHAT_MODEL}")
180
 
181
  st.sidebar.markdown("---")
182
  st.sidebar.markdown(
183
  "**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
184
  )
185
 
186
+ # Main interface with dynamic status
187
+ col1, col2 = st.columns([3, 1])
188
+ with col1:
189
+ st.title("🔍 Multi-Method RAG System - SIGHT")
190
+ st.markdown("### Compare different retrieval methods for machine safety Q&A")
191
 
192
+ with col2:
193
+ # Quick stats in the header
194
+ if 'chat_history' in st.session_state:
195
+ total_queries = len(st.session_state.chat_history)
196
+ st.metric("Session Queries", total_queries, delta=None if total_queries == 0 else "+1" if total_queries == 1 else f"+{total_queries}")
197
+
198
+ # Voice chat status indicator
199
+ if st.session_state.get('voice_session_active', False):
200
+ st.success("🔴 Voice LIVE")
201
 
202
+ # Create tabs for different interfaces
203
+ tab1, tab2, tab3, tab4 = st.tabs(["💬 Chat", "📊 Method Comparison", "🔊 Voice Chat", "📈 Analytics"])
204
+
205
+ with tab1:
206
+ # Example questions
207
+ with st.expander("📝 Example Questions", expanded=False):
208
+ example_cols = st.columns(2)
209
+ with example_cols[0]:
210
+ st.markdown(
211
+ "**General Safety:**\n"
212
+ "- What are general machine guarding requirements?\n"
213
+ "- How do I perform lockout/tagout?\n"
214
+ "- What is required for emergency stops?"
215
+ )
216
+ with example_cols[1]:
217
+ st.markdown(
218
+ "**Specific Topics:**\n"
219
+ "- Summarize robot safety requirements from OSHA\n"
220
+ "- Compare guard types: fixed vs interlocked\n"
221
+ "- What are the ANSI standards for machine safety?"
222
+ )
223
+
224
+ # File uploader for vision method
225
+ uploaded_file = None
226
+ if selected_method == 'vision':
227
+ st.markdown("#### 🖼️ Upload an image for analysis")
228
+ uploaded_file = st.file_uploader(
229
+ "Choose an image file",
230
+ type=['png', 'jpg', 'jpeg', 'bmp', 'gif'],
231
+ help="Upload an image of safety equipment, signs, or machinery"
232
+ )
233
+
234
+ if uploaded_file:
235
+ col1, col2 = st.columns([1, 2])
236
+ with col1:
237
+ st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
238
+
239
+ # Initialize session state
240
+ if 'chat_history' not in st.session_state:
241
+ st.session_state.chat_history = []
242
+
243
+ if 'session_id' not in st.session_state:
244
+ st.session_state.session_id = str(uuid.uuid4())[:8]
245
+
246
+ # Chat input
247
+ query = st.text_input(
248
+ "Ask a question:",
249
+ placeholder="E.g., What are the safety requirements for collaborative robots?",
250
+ key="chat_input"
251
+ )
252
+
253
+ col1, col2, col3 = st.columns([1, 1, 8])
254
+ with col1:
255
+ send_button = st.button("🚀 Send", type="primary", use_container_width=True)
256
+ with col2:
257
+ clear_button = st.button("🗑️ Clear", use_container_width=True)
258
+
259
+ if clear_button:
260
+ st.session_state.chat_history = []
261
+ st.rerun()
262
+
263
+ if send_button and query:
264
+ # Save uploaded file if present
265
+ image_path = None
266
+ if uploaded_file and selected_method == 'vision':
267
+ image_path = save_uploaded_file(uploaded_file)
268
+
269
+ # Show spinner while processing
270
+ with st.spinner(f"Searching using {selected_method.upper()} method..."):
271
+ start_time = time.time()
272
+ error_message = None
273
+ answer = ""
274
+ citations = []
275
+
276
+ try:
277
+ # Get the appropriate query function
278
+ query_func = QUERY_DISPATCH[selected_method]
279
+
280
+ # Call the query function
281
+ if selected_method == 'vision' and not image_path:
282
+ error_message = "Please upload an image for vision-based search"
283
+ st.error(error_message)
284
+ else:
285
+ answer, citations = query_func(query, image_path=image_path, top_k=top_k)
286
+
287
+ # Add to history
288
+ st.session_state.chat_history.append({
289
+ 'query': query,
290
+ 'answer': answer,
291
+ 'citations': citations,
292
+ 'method': selected_method,
293
+ 'image_path': image_path
294
+ })
295
+
296
+ except Exception as e:
297
+ error_message = str(e)
298
+ answer = f"Error: {error_message}"
299
+ st.error(f"Error processing query: {error_message}")
300
+ st.info("Make sure you've run preprocess.py to generate the required indices.")
301
+
302
+ finally:
303
+ # Log query to analytics database (always, even on error)
304
+ response_time = (time.time() - start_time) * 1000 # Convert to ms
305
+
306
+ try:
307
+ log_query(
308
+ user_query=query,
309
+ method=selected_method,
310
+ answer=answer,
311
+ citations=citations,
312
+ response_time=response_time,
313
+ image_path=image_path,
314
+ error_message=error_message,
315
+ top_k=top_k,
316
+ session_id=st.session_state.session_id
317
+ )
318
+ except Exception as log_error:
319
+ logger.error(f"Failed to log query: {log_error}")
320
+
321
+ # Clean up temp file
322
+ if image_path and os.path.exists(image_path):
323
+ os.unlink(image_path)
324
+
325
+ # Display chat history
326
+ if st.session_state.chat_history:
327
+ st.markdown("---")
328
+ st.markdown("### Chat History")
329
+
330
+ for i, entry in enumerate(reversed(st.session_state.chat_history)):
331
+ with st.container():
332
+ # User message
333
+ st.markdown(f"**🧑 You** ({entry['method'].upper()}):")
334
+ st.markdown(entry['query'])
335
+
336
+ # Assistant response
337
+ st.markdown("**🤖 Assistant:**")
338
+ st.markdown(entry['answer'])
339
+
340
+ # Citations
341
+ st.markdown(format_citations_html(entry['citations'], entry['method']), unsafe_allow_html=True)
342
+
343
+ if i < len(st.session_state.chat_history) - 1:
344
+ st.markdown("---")
345
+
346
+ with tab2:
347
+ st.markdown("### Method Comparison")
348
+ st.markdown("Compare results from different retrieval methods for the same query.")
349
+
350
+ comparison_query = st.text_input(
351
+ "Enter a query to compare across methods:",
352
+ placeholder="E.g., What are the requirements for machine guards?",
353
+ key="comparison_input"
354
+ )
355
+
356
+ methods_to_compare = st.multiselect(
357
+ "Select methods to compare:",
358
+ options=['graph', 'vanilla', 'dpr', 'bm25', 'context'],
359
+ default=['vanilla', 'bm25'],
360
+ help="Vision method requires an image and is not included in comparison"
361
  )
362
+
363
+ col1, col2 = st.columns([3, 1])
364
+ with col1:
365
+ compare_button = st.button("🔍 Compare Methods", type="primary")
366
+ with col2:
367
+ if 'comparison_results' in st.session_state and st.session_state.comparison_results:
368
+ if st.button("🪟 Full Screen View", help="View results in a dedicated comparison window"):
369
+ st.session_state.show_comparison_window = True
370
+ st.rerun()
371
+
372
+ if compare_button:
373
+ if comparison_query and methods_to_compare:
374
+ results = {}
375
+
376
+ progress_bar = st.progress(0)
377
+ for idx, method in enumerate(methods_to_compare):
378
+ with st.spinner(f"Running {method.upper()}..."):
379
+ start_time = time.time()
380
+ error_message = None
381
+
382
+ try:
383
+ query_func = QUERY_DISPATCH[method]
384
+ answer, citations = query_func(comparison_query, top_k=top_k)
385
+ results[method] = {
386
+ 'answer': answer,
387
+ 'citations': citations
388
+ }
389
+ except Exception as e:
390
+ error_message = str(e)
391
+ answer = f"Error: {error_message}"
392
+ citations = []
393
+ results[method] = {
394
+ 'answer': answer,
395
+ 'citations': citations
396
+ }
397
+
398
+ finally:
399
+ # Log comparison queries too
400
+ response_time = (time.time() - start_time) * 1000
401
+ try:
402
+ log_query(
403
+ user_query=comparison_query,
404
+ method=method,
405
+ answer=results[method]['answer'],
406
+ citations=results[method]['citations'],
407
+ response_time=response_time,
408
+ error_message=error_message,
409
+ top_k=top_k,
410
+ session_id=st.session_state.session_id,
411
+ additional_settings={'comparison_mode': True}
412
+ )
413
+ except Exception as log_error:
414
+ logger.error(f"Failed to log comparison query: {log_error}")
415
+
416
+ progress_bar.progress((idx + 1) / len(methods_to_compare))
417
+
418
+ # Store results in session state for full screen view
419
+ st.session_state.comparison_results = {
420
+ 'query': comparison_query,
421
+ 'methods': methods_to_compare,
422
+ 'results': results,
423
+ 'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
424
+ }
425
+
426
+ # Display results in compact columns
427
+ cols = st.columns(len(methods_to_compare))
428
+ for idx, (method, col) in enumerate(zip(methods_to_compare, cols)):
429
+ with col:
430
+ st.markdown(f"#### {method.upper()}")
431
+
432
+ # Use expandable container for full text without truncation
433
+ answer = results[method]['answer']
434
+ if len(answer) > 800:
435
+ # Show first 300 chars, then expandable for full text
436
+ st.markdown(answer[:300] + "...")
437
+ with st.expander("📖 Show full answer"):
438
+ st.markdown(answer)
439
+ else:
440
+ # Short answers display fully
441
+ st.markdown(answer)
442
+
443
+ st.markdown(format_citations_html(results[method]['citations'], method), unsafe_allow_html=True)
444
+ else:
445
+ st.warning("Please enter a query and select at least one method to compare.")
446
+
447
+ with tab3:
448
+ st.markdown("### 🔊 Voice Chat - Hands-free AI Assistant")
449
+
450
+ # Server status check
451
+ server_healthy = check_realtime_server_health()
452
+ if server_healthy:
453
+ st.success("✅ **Voice Server Online** - Ready for voice interactions")
454
+ else:
455
+ st.error("❌ **Voice Server Offline** - Please start the realtime server: `python realtime_server.py`")
456
+ st.code("python realtime_server.py", language="bash")
457
+ st.stop()
458
+
459
+ st.info(
460
+ "🎤 **Real-time Voice Interaction**: Speak naturally and get instant responses from your chosen RAG method. "
461
+ "The AI will automatically transcribe your speech, search the knowledge base, and respond with synthesized voice."
462
+ )
463
+
464
+ # Voice Chat Status and Configuration
465
+ col1, col2 = st.columns([2, 1])
466
+
467
+ with col1:
468
+ # Use the same method from sidebar
469
+ st.info(f"🔍 **Voice using {selected_method.upper()} method** (change in sidebar)")
470
+
471
+ with col2:
472
+ # Voice settings (simplified)
473
+ voice_choice = st.selectbox(
474
+ "🎙️ AI Voice:",
475
+ ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
476
+ index=0,
477
+ help="Select the AI voice for responses"
478
+ )
479
+ response_speed = st.slider(
480
+ "⏱️ Response Speed (seconds):",
481
+ min_value=1, max_value=5, value=2,
482
+ help="How quickly the AI should respond after you stop speaking"
483
+ )
484
+
485
+ # Auto-detect server URL (hide the manual configuration)
486
+ server_url = "http://localhost:5050" # Default for local development
487
+
488
+ # Voice Chat Interface
489
+ st.markdown("---")
490
+
491
+ # Initialize voice chat session state
492
+ if 'voice_chat_history' not in st.session_state:
493
+ st.session_state.voice_chat_history = []
494
+ if 'voice_session_active' not in st.session_state:
495
+ st.session_state.voice_session_active = False
496
+
497
+ # Simple Status Display
498
+ if st.session_state.voice_session_active:
499
+ st.success("🔴 **LIVE** - Voice chat active using " + selected_method.upper())
500
+
501
+ # Enhanced Voice Interface with better UX
502
+ components.html(f"""
503
+ <!DOCTYPE html>
504
+ <html>
505
+ <head>
506
+ <meta charset="utf-8" />
507
+ <style>
508
+ body {{
509
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
510
+ padding: 20px;
511
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
512
+ color: white;
513
+ border-radius: 10px;
514
+ }}
515
+ .container {{
516
+ max-width: 800px;
517
+ margin: 0 auto;
518
+ background: rgba(255,255,255,0.1);
519
+ padding: 30px;
520
+ border-radius: 15px;
521
+ backdrop-filter: blur(10px);
522
+ }}
523
+ .controls {{
524
+ display: flex;
525
+ gap: 20px;
526
+ align-items: center;
527
+ justify-content: center;
528
+ margin-bottom: 30px;
529
+ }}
530
+ .status-display {{
531
+ text-align: center;
532
+ margin: 20px 0;
533
+ padding: 15px;
534
+ border-radius: 10px;
535
+ background: rgba(255,255,255,0.2);
536
+ }}
537
+ .status-idle {{ background: rgba(108, 117, 125, 0.3); }}
538
+ .status-connecting {{ background: rgba(255, 193, 7, 0.3); }}
539
+ .status-active {{ background: rgba(40, 167, 69, 0.3); }}
540
+ .status-error {{ background: rgba(220, 53, 69, 0.3); }}
541
+
542
+ button {{
543
+ padding: 12px 24px;
544
+ font-size: 16px;
545
+ border: none;
546
+ border-radius: 25px;
547
+ cursor: pointer;
548
+ transition: all 0.3s ease;
549
+ font-weight: bold;
550
+ }}
551
+ .start-btn {{
552
+ background: linear-gradient(45deg, #28a745, #20c997);
553
+ color: white;
554
+ }}
555
+ .start-btn:hover {{ transform: translateY(-2px); box-shadow: 0 4px 12px rgba(40,167,69,0.4); }}
556
+ .start-btn:disabled {{
557
+ background: #6c757d;
558
+ cursor: not-allowed;
559
+ transform: none;
560
+ box-shadow: none;
561
+ }}
562
+ .stop-btn {{
563
+ background: linear-gradient(45deg, #dc3545, #fd7e14);
564
+ color: white;
565
+ }}
566
+ .stop-btn:hover {{ transform: translateY(-2px); box-shadow: 0 4px 12px rgba(220,53,69,0.4); }}
567
+ .stop-btn:disabled {{
568
+ background: #6c757d;
569
+ cursor: not-allowed;
570
+ transform: none;
571
+ box-shadow: none;
572
+ }}
573
+
574
+ .log {{
575
+ height: 200px;
576
+ overflow-y: auto;
577
+ border: 1px solid rgba(255,255,255,0.3);
578
+ padding: 15px;
579
+ background: rgba(0,0,0,0.2);
580
+ border-radius: 10px;
581
+ font-family: 'Monaco', 'Menlo', monospace;
582
+ font-size: 13px;
583
+ line-height: 1.4;
584
+ }}
585
+ .audio-controls {{
586
+ text-align: center;
587
+ margin: 20px 0;
588
+ }}
589
+ .pulse {{
590
+ animation: pulse 2s infinite;
591
+ }}
592
+ @keyframes pulse {{
593
+ 0% {{ transform: scale(1); }}
594
+ 50% {{ transform: scale(1.05); }}
595
+ 100% {{ transform: scale(1); }}
596
+ }}
597
+ .visualizer {{
598
+ width: 100%;
599
+ height: 60px;
600
+ background: rgba(0,0,0,0.2);
601
+ border-radius: 10px;
602
+ margin: 10px 0;
603
+ display: flex;
604
+ align-items: center;
605
+ justify-content: center;
606
+ font-size: 14px;
607
+ }}
608
+ </style>
609
+ </head>
610
+ <body>
611
+ <div class="container">
612
+ <div class="status-display status-idle" id="statusDisplay">
613
+ <h3 id="statusTitle">🎤 Voice Chat</h3>
614
+ <p id="statusText">Click "Start Listening" to begin</p>
615
+ </div>
616
+
617
+ <div class="controls">
618
+ <button id="startBtn" class="start-btn">🎤 Start Listening</button>
619
+ <button id="stopBtn" class="stop-btn" disabled>⏹️ Stop</button>
620
+ </div>
621
+
622
+ <div class="audio-controls">
623
+ <audio id="remoteAudio" autoplay style="width: 100%; max-width: 400px;"></audio>
624
+ </div>
625
+
626
+ <div class="visualizer" id="visualizer">
627
+ 🔇 Audio will appear here when active
628
+ </div>
629
+
630
+ <div class="log" id="log"></div>
631
+ </div>
632
+
633
+ <script>
634
+ (async () => {{
635
+ const serverBase = {server_url!r};
636
+ const chosenMethod = {selected_method!r};
637
+ const voiceChoice = {voice_choice!r};
638
+ const responseSpeed = {response_speed!r};
639
+
640
+ const logEl = document.getElementById('log');
641
+ const statusDisplay = document.getElementById('statusDisplay');
642
+ const statusTitle = document.getElementById('statusTitle');
643
+ const statusText = document.getElementById('statusText');
644
+ const startBtn = document.getElementById('startBtn');
645
+ const stopBtn = document.getElementById('stopBtn');
646
+ const visualizer = document.getElementById('visualizer');
647
+
648
+ let pc, dc, micStream;
649
+ let isConnected = false;
650
+ let questionStartTime = null;
651
+
652
+ function updateStatus(status, title, text, className) {{
653
+ statusDisplay.className = `status-display ${{className}}`;
654
+ statusTitle.textContent = title;
655
+ statusText.textContent = text;
656
+ }}
657
+
658
+ function log(msg, type = 'info') {{
659
+ const timestamp = new Date().toLocaleTimeString();
660
+ const icon = type === 'error' ? '❌' : type === 'success' ? '✅' : type === 'warning' ? '⚠️' : 'ℹ️';
661
+ logEl.innerHTML += `<div>${{timestamp}} ${{icon}} ${{msg}}</div>`;
662
+ logEl.scrollTop = logEl.scrollHeight;
663
+ }}
664
 
665
+ async function start() {{
666
+ startBtn.disabled = true;
667
+ stopBtn.disabled = false;
668
+ updateStatus('connecting', '🔄 Connecting...', 'Establishing secure connection to voice services', 'status-connecting');
669
+
670
+ try {{
671
+ log('Initializing voice session...', 'info');
672
+
673
+ // 1) Fetch ephemeral session token
674
+ const sessResp = await fetch(serverBase + "/session", {{
675
+ method: "POST",
676
+ headers: {{ "Content-Type": "application/json" }},
677
+ body: JSON.stringify({{ voice: voiceChoice }})
678
+ }});
679
+
680
+ if (!sessResp.ok) {{
681
+ throw new Error(`Server error: ${{sessResp.status}} ${{sessResp.statusText}}`);
682
+ }}
683
+
684
+ const sess = await sessResp.json();
685
+ if (sess.error) throw new Error(sess.error);
686
+
687
+ const EPHEMERAL_KEY = sess.client_secret;
688
+ if (!EPHEMERAL_KEY) throw new Error("No ephemeral token from server");
689
+
690
+ log('✅ Session token obtained', 'success');
691
+
692
+ // 2) Setup WebRTC
693
+ pc = new RTCPeerConnection();
694
+ const remoteAudio = document.getElementById('remoteAudio');
695
+ pc.ontrack = (event) => {{
696
+ log('🔊 Audio track received from OpenAI', 'success');
697
+ console.log('WebRTC track event:', event);
698
+ const stream = event.streams[0];
699
+ if (stream && stream.getAudioTracks().length > 0) {{
700
+ remoteAudio.srcObject = stream;
701
+ visualizer.textContent = '🔊 Audio stream connected - AI can speak';
702
+ log(`🎵 Audio tracks: ${{stream.getAudioTracks().length}}`, 'success');
703
+ }} else {{
704
+ log('⚠️ No audio tracks in stream', 'warning');
705
+ visualizer.textContent = '⚠️ No audio stream received';
706
+ }}
707
+ }};
708
 
709
+ // 3) Create data channel
710
+ dc = pc.createDataChannel("oai-data");
711
+ dc.onopen = () => {{
712
+ log('🔗 Data channel established', 'success');
713
+ }};
714
+ dc.onerror = (error) => {{
715
+ log('❌ Data channel error: ' + error, 'error');
716
+ }};
717
+ dc.onmessage = (e) => handleDataMessage(e);
 
718
 
719
+ // 4) Get microphone
720
+ log('🎤 Requesting microphone access...', 'info');
721
+ micStream = await navigator.mediaDevices.getUserMedia({{ audio: true }});
722
+ log('✅ Microphone access granted', 'success');
723
+ visualizer.textContent = '🎤 Microphone active - speak naturally';
724
+
725
+ for (const track of micStream.getTracks()) {{
726
+ pc.addTrack(track, micStream);
727
+ }}
728
 
729
+ // 5) Setup audio receiving
730
+ pc.addTransceiver("audio", {{ direction: "recvonly" }});
731
+ log('🔊 Audio receiver configured', 'success');
732
+
733
+ // 6) Create and set local description
734
+ const offer = await pc.createOffer();
735
+ await pc.setLocalDescription(offer);
736
+ log('📡 WebRTC offer created', 'success');
737
+
738
+ // 7) Exchange SDP with OpenAI Realtime
739
+ const baseUrl = "https://api.openai.com/v1/realtime";
740
+ const model = sess.model || "gpt-4o-realtime-preview";
741
+ const sdpResp = await fetch(`${{baseUrl}}?model=${{encodeURIComponent(model)}}`, {{
742
+ method: "POST",
743
+ body: offer.sdp,
744
+ headers: {{
745
+ Authorization: `Bearer ${{EPHEMERAL_KEY}}`,
746
+ "Content-Type": "application/sdp"
747
+ }}
748
+ }});
749
+
750
+ if (!sdpResp.ok) throw new Error(`WebRTC setup failed: ${{sdpResp.status}}`);
751
+
752
+ const answer = {{ type: "answer", sdp: await sdpResp.text() }};
753
+ await pc.setRemoteDescription(answer);
754
+
755
+ // 8) Configure the session with tools and faster response
756
+ // Wait a bit to ensure data channel is fully ready
757
+ setTimeout(() => {{
758
+ if (dc.readyState === 'open') {{
759
+ const toolDecl = {{
760
+ type: "session.update",
761
+ session: {{
762
+ tools: [{{
763
+ "type": "function",
764
+ "name": "ask_rag",
765
+ "description": "Search the safety knowledge base for accurate, authoritative information. Call this immediately when users ask safety questions to get current, reliable information with proper citations.",
766
+ "parameters": {{
767
+ "type": "object",
768
+ "properties": {{
769
+ "query": {{ "type": "string", "description": "User's safety question" }},
770
+ "top_k": {{ "type": "integer", "minimum": 1, "maximum": 20, "default": 5 }}
771
+ }},
772
+ "required": ["query"]
773
+ }}
774
+ }}],
775
+ turn_detection: {{
776
+ type: "server_vad",
777
+ threshold: 0.5,
778
+ prefix_padding_ms: 300,
779
+ silence_duration_ms: {response_speed * 1000}
780
+ }},
781
+ input_audio_transcription: {{
782
+ model: "whisper-1"
783
+ }},
784
+ voice: voiceChoice,
785
+ temperature: 0.7, // Higher temperature for more natural speech
786
+ max_response_output_tokens: 1000, // Allow full responses
787
+ modalities: ["audio", "text"],
788
+ response_format: "audio"
789
+ }}
790
+ }};
791
+ dc.send(JSON.stringify(toolDecl));
792
+ log('🛠️ RAG tools configured', 'success');
793
+
794
+ // Send initial conversation starter to prime the model for natural interaction
795
+ const initialMessage = {{
796
+ type: "conversation.item.create",
797
+ item: {{
798
+ type: "message",
799
+ role: "user",
800
+ content: [{{
801
+ type: "input_text",
802
+ text: "Hello! I'm ready to ask you questions about machine safety. Please speak naturally like a safety expert - no need to mention specific documents or sources, just give me the information as your expertise."
803
+ }}]
804
+ }}
805
+ }};
806
+ dc.send(JSON.stringify(initialMessage));
807
+
808
+ const responseRequest = {{
809
+ type: "response.create",
810
+ response: {{
811
+ modalities: ["audio"],
812
+ instructions: "Acknowledge briefly that you're ready to help with safety questions. Speak naturally and confidently as a safety expert - no citations or document references needed."
813
+ }}
814
+ }};
815
+ dc.send(JSON.stringify(responseRequest));
816
+
817
+ }} else {{
818
+ log('⚠️ Data channel not ready, retrying...', 'warning');
819
+ // Retry after another second
820
+ setTimeout(() => {{
821
+ if (dc.readyState === 'open') {{
822
+ dc.send(JSON.stringify(toolDecl));
823
+ log('🛠️ RAG tools configured (retry)', 'success');
824
+ }}
825
+ }}, 1000);
826
+ }}
827
+ }}, 500);
828
+
829
+ isConnected = true;
830
+ updateStatus('active', '🎤 Live - Speak Now!', `Using ${{chosenMethod.toUpperCase()}} method • Voice: ${{voiceChoice}} • Response: ${{responseSpeed}}s`, 'status-active');
831
+ startBtn.classList.add('pulse');
832
+
833
+ }} catch (error) {{
834
+ log(`❌ Connection failed: ${{error.message}}`, 'error');
835
+ updateStatus('error', '❌ Connection Failed', error.message, 'status-error');
836
+ startBtn.disabled = false;
837
+ stopBtn.disabled = true;
838
+ cleanup();
839
+ }}
840
+ }}
841
+
842
+ function cleanup() {{
843
+ try {{
844
+ if (dc && dc.readyState === 'open') dc.close();
845
+ if (pc) pc.close();
846
+ if (micStream) micStream.getTracks().forEach(t => t.stop());
847
+ }} catch (e) {{ /* ignore cleanup errors */ }}
848
+ startBtn.classList.remove('pulse');
849
+ visualizer.textContent = '🔇 Audio inactive';
850
+ }}
851
+
852
+ async function stop() {{
853
+ startBtn.disabled = false;
854
+ stopBtn.disabled = true;
855
+ isConnected = false;
856
+ updateStatus('idle', '⚪ Session Ended', 'Click "Start Listening" to begin a new voice session', 'status-idle');
857
+ log('🛑 Voice session terminated', 'info');
858
+ cleanup();
859
+ }}
860
+
861
+ // Handle realtime events
862
+ async function handleDataMessage(e) {{
863
+ if (!isConnected) return;
864
+
865
+ try {{
866
+ const msg = JSON.parse(e.data);
867
+
868
+ if (msg.type === "response.function_call") {{
869
+ const {{ name, call_id, arguments: args }} = msg;
870
+ if (name === "ask_rag") {{
871
+ visualizer.textContent = '✅ Question received - searching...';
872
+ const query = JSON.parse(args || "{{}}").query;
873
+ log(`✅ AI heard: "${{query}}"`, 'success');
874
+ log('🔍 Searching knowledge base...', 'info');
875
+
876
+ // Store the transcribed query for analytics
877
+ window.lastVoiceQuery = query;
878
+
879
+ const payload = JSON.parse(args || "{{}}");
880
+ const ragResp = await fetch(serverBase + "/rag", {{
881
+ method: "POST",
882
+ headers: {{ "Content-Type": "application/json" }},
883
+ body: JSON.stringify({{
884
+ query: payload.query,
885
+ top_k: payload.top_k ?? 5,
886
+ method: chosenMethod
887
+ }})
888
+ }});
889
+
890
+ const rag = await ragResp.json();
891
+
892
+ // Send result back to model (check if data channel is still open)
893
+ if (dc && dc.readyState === 'open') {{
894
+ dc.send(JSON.stringify({{
895
+ type: "response.function_call_result",
896
+ call_id,
897
+ output: JSON.stringify({{
898
+ answer: rag.answer,
899
+ instruction: "Speak this information naturally as your expertise. Do not mention sources or documents."
900
+ }})
901
+ }}));
902
+ }} else {{
903
+ log('⚠️ Data channel closed, cannot send result', 'warning');
904
+ }}
905
+
906
+ const searchTime = ((Date.now() - questionStartTime) / 1000).toFixed(1);
907
+ log(`✅ Found ${{rag.citations?.length || 0}} citations in ${{searchTime}}s`, 'success');
908
+ visualizer.textContent = '🎙️ AI is speaking your answer...';
909
+ }}
910
+ }}
911
+
912
+ if (msg.type === "input_audio_buffer.speech_started") {{
913
+ questionStartTime = Date.now();
914
+ visualizer.textContent = '🎙️ Listening to you...';
915
+ log('🎤 Speech detected', 'info');
916
+ }}
917
+
918
+ if (msg.type === "input_audio_buffer.speech_stopped") {{
919
+ visualizer.textContent = '🤔 Processing your question...';
920
+ log('⏸️ Processing speech...', 'info');
921
+ }}
922
+
923
+ if (msg.type === "response.audio.delta") {{
924
+ visualizer.textContent = '🔊 AI speaking...';
925
+ }}
926
+
927
+ if (msg.type === "response.done") {{
928
+ if (questionStartTime) {{
929
+ const totalTime = ((Date.now() - questionStartTime) / 1000).toFixed(1);
930
+ visualizer.textContent = '🎤 Your turn - speak now';
931
+ log(`✅ Response complete in ${{totalTime}}s`, 'success');
932
+ questionStartTime = null;
933
+ }} else {{
934
+ visualizer.textContent = '🎤 Your turn - speak now';
935
+ log('✅ Response complete', 'success');
936
+ }}
937
+ }}
938
+
939
+ }} catch (err) {{
940
+ // Ignore non-JSON messages
941
+ }}
942
+ }}
943
+
944
+ startBtn.onclick = start;
945
+ stopBtn.onclick = stop;
946
+
947
+ // Initialize
948
+ log('🚀 Voice chat interface loaded', 'success');
949
+ }})();
950
+ </script>
951
+ </body>
952
+ </html>
953
+ """, height=600, scrolling=True)
954
+
955
+ # Voice Chat History
956
+ if st.session_state.voice_chat_history:
957
+ st.markdown("### 🗣️ Recent Voice Conversations")
958
+ for i, entry in enumerate(reversed(st.session_state.voice_chat_history[-5:])):
959
+ with st.expander(f"🎤 Conversation {len(st.session_state.voice_chat_history)-i} - {entry.get('method', 'unknown').upper()}"):
960
+ st.write(f"**Query**: {entry.get('query', 'N/A')}")
961
+ st.write(f"**Response**: {entry.get('answer', 'N/A')[:200]}...")
962
+ st.write(f"**Citations**: {len(entry.get('citations', []))}")
963
+ st.write(f"**Timestamp**: {entry.get('timestamp', 'N/A')}")
964
+
965
+ with tab4:
966
+ st.markdown("### 📊 Analytics Dashboard")
967
+ st.markdown("*Persistent analytics from all user interactions*")
968
+
969
+ # Time period selector
970
+ col1, col2 = st.columns([3, 1])
971
+ with col1:
972
+ st.markdown("")
973
+ with col2:
974
+ days_filter = st.selectbox("Time Period", [7, 30, 90, 365], index=1, format_func=lambda x: f"Last {x} days")
975
+
976
+ # Get analytics data
977
+ try:
978
+ stats = get_analytics_stats(days=days_filter)
979
+ performance = get_method_performance()
980
+ recent_queries = analytics_db.get_recent_queries(limit=10)
981
+
982
+ # Overview Metrics
983
+ st.markdown("#### 📈 Overview")
984
+ col1, col2, col3, col4 = st.columns(4)
985
+
986
+ with col1:
987
+ st.metric(
988
+ "Total Queries",
989
+ stats.get('total_queries', 0),
990
+ help="All queries processed in the selected time period"
991
+ )
992
+
993
+ with col2:
994
+ avg_citations = stats.get('avg_citations', 0)
995
+ st.metric(
996
+ "Avg Citations",
997
+ f"{avg_citations:.1f}",
998
+ help="Average number of citations per query"
999
+ )
1000
+
1001
+ with col3:
1002
+ error_rate = stats.get('error_rate', 0)
1003
+ st.metric(
1004
+ "Success Rate",
1005
+ f"{100 - error_rate:.1f}%",
1006
+ delta=f"-{error_rate:.1f}% errors" if error_rate > 0 else None,
1007
+ help="Percentage of successful queries"
1008
+ )
1009
+
1010
+ with col4:
1011
+ total_citations = stats.get('total_citations', 0)
1012
+ st.metric(
1013
+ "Total Citations",
1014
+ total_citations,
1015
+ help="Total citations generated across all queries"
1016
+ )
1017
+
1018
+ # Method Performance Comparison
1019
+ if performance:
1020
+ st.markdown("#### ⚡ Method Performance")
1021
+
1022
+ perf_data = []
1023
+ for method, metrics in performance.items():
1024
+ perf_data.append({
1025
+ 'Method': method.upper(),
1026
+ 'Avg Response Time (ms)': f"{metrics['avg_response_time']:.0f}",
1027
+ 'Avg Citations': f"{metrics['avg_citations']:.1f}",
1028
+ 'Avg Answer Length': f"{metrics['avg_answer_length']:.0f}",
1029
+ 'Query Count': int(metrics['query_count'])
1030
+ })
1031
+
1032
+ if perf_data:
1033
+ st.dataframe(perf_data, use_container_width=True, hide_index=True)
1034
+
1035
+ # Method Usage with Voice Interaction Indicator
1036
+ method_usage = stats.get('method_usage', {})
1037
+ if method_usage:
1038
+ st.markdown("#### 🎯 Method Usage Distribution")
1039
+ col1, col2 = st.columns([2, 1])
1040
+
1041
+ with col1:
1042
+ st.bar_chart(method_usage)
1043
+
1044
+ with col2:
1045
+ st.markdown("**Most Popular Methods:**")
1046
+ sorted_methods = sorted(method_usage.items(), key=lambda x: x[1], reverse=True)
1047
+ for i, (method, count) in enumerate(sorted_methods[:3], 1):
1048
+ percentage = (count / sum(method_usage.values())) * 100
1049
+ st.markdown(f"{i}. **{method.upper()}** - {count} queries ({percentage:.1f}%)")
1050
+
1051
+ # Voice interaction stats
1052
+ try:
1053
+ voice_queries = analytics_db.get_voice_interaction_stats()
1054
+ if voice_queries and voice_queries.get('total_voice_queries', 0) > 0:
1055
+ st.markdown("---")
1056
+ st.markdown("**🎤 Voice Interactions:**")
1057
+ st.markdown(f"🔊 Voice queries: {voice_queries['total_voice_queries']}")
1058
+ if voice_queries.get('avg_voice_response_time', 0) > 0:
1059
+ st.markdown(f"⏱️ Avg response time: {voice_queries['avg_voice_response_time']:.1f}ms")
1060
+ if sum(method_usage.values()) > 0:
1061
+ voice_percentage = (voice_queries['total_voice_queries'] / sum(method_usage.values())) * 100
1062
+ st.markdown(f"📊 Voice usage: {voice_percentage:.1f}%")
1063
+ except Exception as e:
1064
+ logger.error(f"Voice stats error: {e}")
1065
+ pass
1066
+
1067
+ # Voice Analytics Section (if voice interactions exist)
1068
+ try:
1069
+ voice_queries = analytics_db.get_voice_interaction_stats()
1070
+ if voice_queries and voice_queries.get('total_voice_queries', 0) > 0:
1071
+ st.markdown("#### 🎤 Voice Interaction Analytics")
1072
+ col1, col2 = st.columns([2, 1])
1073
+
1074
+ with col1:
1075
+ voice_by_method = voice_queries.get('voice_by_method', {})
1076
+ if voice_by_method:
1077
+ st.bar_chart(voice_by_method)
1078
+ else:
1079
+ st.info("No voice method breakdown available yet")
1080
+
1081
+ with col2:
1082
+ st.markdown("**Voice Stats:**")
1083
+ total_voice = voice_queries['total_voice_queries']
1084
+ st.markdown(f"🔊 Total voice queries: {total_voice}")
1085
+
1086
+ avg_response = voice_queries.get('avg_voice_response_time', 0)
1087
+ if avg_response > 0:
1088
+ st.markdown(f"⏱️ Avg response: {avg_response:.1f}ms")
1089
+
1090
+ # Most used voice method
1091
+ if voice_by_method:
1092
+ most_used_voice = max(voice_by_method.items(), key=lambda x: x[1])
1093
+ st.markdown(f"🎯 Top voice method: {most_used_voice[0].upper()}")
1094
+ except Exception as e:
1095
+ logger.error(f"Voice analytics error: {e}")
1096
+
1097
+ # Citation Analysis
1098
+ citation_types = stats.get('citation_types', {})
1099
+ if citation_types:
1100
+ st.markdown("#### 📚 Citation Sources")
1101
+ col1, col2 = st.columns([2, 1])
1102
+
1103
+ with col1:
1104
+ # Filter out empty/null citation types
1105
+ filtered_citations = {k: v for k, v in citation_types.items() if k and k.strip()}
1106
+ if filtered_citations:
1107
+ st.bar_chart(filtered_citations)
1108
+
1109
+ with col2:
1110
+ st.markdown("**Source Breakdown:**")
1111
+ total_citations = sum(citation_types.values())
1112
+ for cite_type, count in sorted(citation_types.items(), key=lambda x: x[1], reverse=True):
1113
+ if cite_type and cite_type.strip():
1114
+ percentage = (count / total_citations) * 100
1115
+ icon = "📄" if cite_type == "pdf" else "🌐" if cite_type == "html" else "🖼️" if cite_type == "image" else "📚"
1116
+ st.markdown(f"{icon} **{cite_type.title()}**: {count} ({percentage:.1f}%)")
1117
+
1118
+ # Popular Keywords
1119
+ keywords = stats.get('top_keywords', {})
1120
+ if keywords:
1121
+ st.markdown("#### 🔍 Popular Query Topics")
1122
+ col1, col2, col3 = st.columns(3)
1123
+
1124
+ keyword_items = list(keywords.items())
1125
+ for i, (word, count) in enumerate(keyword_items[:9]): # Top 9 keywords
1126
+ col = [col1, col2, col3][i % 3]
1127
+ with col:
1128
+ st.metric(word.title(), count)
1129
+
1130
+ # Recent Queries with Responses
1131
+ if recent_queries:
1132
+ st.markdown("#### 🕒 Recent Queries & Responses")
1133
+
1134
+ for query in recent_queries[:5]: # Show last 5
1135
+ # Create expander title with query preview
1136
+ query_preview = query['query'][:60] + "..." if len(query['query']) > 60 else query['query']
1137
+ expander_title = f"🧑 **{query['method'].upper()}**: {query_preview}"
1138
+
1139
+ with st.expander(expander_title):
1140
+ # Query details
1141
+ st.markdown(f"**📝 Full Query:** {query['query']}")
1142
+
1143
+ # Metrics row
1144
+ col1, col2, col3, col4 = st.columns(4)
1145
+ with col1:
1146
+ st.metric("Answer Length", f"{query['answer_length']} chars")
1147
+ with col2:
1148
+ st.metric("Citations", query['citations'])
1149
+ with col3:
1150
+ if query['response_time']:
1151
+ st.metric("Response Time", f"{query['response_time']:.0f}ms")
1152
+ else:
1153
+ st.metric("Response Time", "N/A")
1154
+ with col4:
1155
+ status = "❌ Error" if query.get('error_message') else "✅ Success"
1156
+ st.markdown(f"**Status:** {status}")
1157
+
1158
+ # Show error message if exists
1159
+ if query.get('error_message'):
1160
+ st.error(f"**Error:** {query['error_message']}")
1161
+ else:
1162
+ # Show answer in a styled container
1163
+ st.markdown("**🤖 Response:**")
1164
+ answer = query.get('answer', 'No answer available')
1165
+
1166
+ # Truncate very long answers for better UX
1167
+ if len(answer) > 1000:
1168
+ st.markdown(
1169
+ f'<div style="background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745;">'
1170
+ f'{answer[:500].replace(chr(10), "<br>")}<br><br>'
1171
+ f'<i>... (truncated, showing first 500 chars of {len(answer)} total)</i>'
1172
+ f'</div>',
1173
+ unsafe_allow_html=True
1174
+ )
1175
+
1176
+ # Option to view full answer
1177
+ if st.button(f"📖 View Full Answer", key=f"full_answer_{query['query_id']}"):
1178
+ st.markdown("**Full Answer:**")
1179
+ st.markdown(
1180
+ f'<div style="background-color: #f8f9fa; padding: 15px; border-radius: 8px; max-height: 400px; overflow-y: auto;">'
1181
+ f'{answer.replace(chr(10), "<br>")}'
1182
+ f'</div>',
1183
+ unsafe_allow_html=True
1184
+ )
1185
+ else:
1186
+ # Short answers display fully
1187
+ st.markdown(
1188
+ f'<div style="background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745;">'
1189
+ f'{answer.replace(chr(10), "<br>")}'
1190
+ f'</div>',
1191
+ unsafe_allow_html=True
1192
+ )
1193
+
1194
+ # Show detailed citation info
1195
+ if query['citations'] > 0:
1196
+ if st.button(f"📚 View Citations", key=f"citations_{query['query_id']}"):
1197
+ detailed_query = analytics_db.get_query_with_citations(query['query_id'])
1198
+ if detailed_query and 'citations' in detailed_query:
1199
+ st.markdown("**Citations:**")
1200
+ for i, citation in enumerate(detailed_query['citations'], 1):
1201
+ scores = []
1202
+ if citation.get('relevance_score'):
1203
+ scores.append(f"relevance: {citation['relevance_score']:.3f}")
1204
+ if citation.get('bm25_score'):
1205
+ scores.append(f"BM25: {citation['bm25_score']:.3f}")
1206
+ if citation.get('rerank_score'):
1207
+ scores.append(f"rerank: {citation['rerank_score']:.3f}")
1208
+
1209
+ score_text = f" ({', '.join(scores)})" if scores else ""
1210
+ st.markdown(f"{i}. **{citation['source']}** {score_text}")
1211
+
1212
+ st.markdown(f"**🕐 Timestamp:** {query['timestamp']}")
1213
+ st.markdown("---")
1214
+
1215
+ # Session Info
1216
+ st.markdown("---")
1217
+ col1, col2 = st.columns([3, 1])
1218
+ with col1:
1219
+ st.markdown("*Analytics are updated in real-time and persist across sessions*")
1220
+ with col2:
1221
+ st.markdown(f"**Session ID:** `{st.session_state.session_id}`")
1222
+
1223
+ except Exception as e:
1224
+ st.error(f"Error loading analytics: {e}")
1225
+ st.info("Analytics data will appear after your first query. The database is created automatically.")
1226
+
1227
+ # Fallback to session analytics
1228
+ if st.session_state.chat_history:
1229
+ st.markdown("#### 📊 Current Session")
1230
+ col1, col2 = st.columns(2)
1231
+ with col1:
1232
+ st.metric("Session Queries", len(st.session_state.chat_history))
1233
+ with col2:
1234
+ methods_used = [entry['method'] for entry in st.session_state.chat_history]
1235
+ most_used = max(set(methods_used), key=methods_used.count) if methods_used else "N/A"
1236
+ st.metric("Most Used Method", most_used.upper() if most_used != "N/A" else most_used)
1237
+
1238
+ # Full Screen Comparison Window (Modal-like)
1239
+ if st.session_state.get('show_comparison_window', False):
1240
+ st.markdown("---")
1241
+
1242
+ # Header with close button
1243
+ col1, col2 = st.columns([4, 1])
1244
+ with col1:
1245
+ comparison_data = st.session_state.comparison_results
1246
+ st.markdown(f"## 🪟 Full Screen Comparison")
1247
+ st.markdown(f"**Query:** {comparison_data['query']}")
1248
+ st.markdown(f"**Generated:** {comparison_data['timestamp']} | **Methods:** {', '.join([m.upper() for m in comparison_data['methods']])}")
1249
+
1250
+ with col2:
1251
+ if st.button("✖️ Close", help="Close full screen view"):
1252
+ st.session_state.show_comparison_window = False
1253
+ st.rerun()
1254
+
1255
+ st.markdown("---")
1256
+
1257
+ # Full-width comparison display
1258
+ results = comparison_data['results']
1259
+ methods = comparison_data['methods']
1260
+
1261
+ for method in methods:
1262
+ st.markdown(f"### 🔸 {method.upper()} Method")
1263
+
1264
+ # Answer
1265
+ answer = results[method]['answer']
1266
+ st.markdown("**Answer:**")
1267
+
1268
+ # Use a container with custom styling for better readability
1269
+ with st.container():
1270
+ st.markdown(
1271
+ f'<div style="background-color: #f0f2f6; padding: 20px; border-radius: 10px; margin: 10px 0; border-left: 5px solid #1f77b4;">'
1272
+ f'{answer.replace(chr(10), "<br>")}'
1273
+ f'</div>',
1274
+ unsafe_allow_html=True
1275
+ )
1276
+
1277
+ # Citations
1278
+ st.markdown("**Citations:**")
1279
+ st.markdown(format_citations_html(results[method]['citations'], method), unsafe_allow_html=True)
1280
+
1281
+ # Statistics
1282
+ col1, col2, col3 = st.columns(3)
1283
+ with col1:
1284
+ st.metric("Answer Length", f"{len(answer)} chars")
1285
+ with col2:
1286
+ st.metric("Citations", len(results[method]['citations']))
1287
+ with col3:
1288
+ word_count = len(answer.split())
1289
+ st.metric("Word Count", word_count)
1290
+
1291
+ if method != methods[-1]: # Not the last method
1292
+ st.markdown("---")
1293
+
1294
+ # Summary comparison table
1295
+ st.markdown("### 📊 Method Comparison Summary")
1296
+
1297
+ summary_data = []
1298
+ for method in methods:
1299
+ summary_data.append({
1300
+ 'Method': method.upper(),
1301
+ 'Answer Length (chars)': len(results[method]['answer']),
1302
+ 'Word Count': len(results[method]['answer'].split()),
1303
+ 'Citations': len(results[method]['citations']),
1304
+ 'Avg Citation Score': round(
1305
+ sum(float(c.get('relevance_score', 0) or c.get('score', 0) or 0)
1306
+ for c in results[method]['citations']) / len(results[method]['citations'])
1307
+ if results[method]['citations'] else 0, 3
1308
+ )
1309
+ })
1310
+
1311
+ st.dataframe(summary_data, use_container_width=True, hide_index=True)
1312
+
1313
+ st.markdown("---")
1314
+
1315
+ # Return to normal view button
1316
+ col1, col2, col3 = st.columns([2, 1, 2])
1317
+ with col2:
1318
+ if st.button("⬅️ Back to Comparison Tab", type="primary", use_container_width=True):
1319
+ st.session_state.show_comparison_window = False
1320
+ st.rerun()
1321
+
1322
+ st.stop() # Stop rendering the rest of the app when in full screen mode
1323
 
1324
  # Footer
1325
  st.markdown("---")
1326
  st.markdown(
1327
+ "**⚠️ Disclaimer:** *This system uses AI to retrieve and generate responses. "
1328
+ "While we strive for accuracy, please verify critical safety information with official sources.*"
1329
+ )
1330
  st.markdown(
1331
+ "**🙏 Acknowledgment:** *We thank [Ohio BWC/WSIC](https://info.bwc.ohio.gov/) "
1332
+ "for funding that made this multi-method RAG system possible.*"
1333
+ )
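Example (not part of the committed code): the voice UI above posts to the realtime server's /rag route with the fields query, top_k, and method, and reads back answer and citations. A minimal Python sketch of the same call, assuming the server is reachable on port 7861 as configured in the Dockerfile:

import requests

# Hypothetical smoke test for the /rag endpoint served by realtime_server.py.
resp = requests.post(
    "http://localhost:7861/rag",
    json={"query": "What are lockout/tagout requirements?", "top_k": 5, "method": "graph"},
    timeout=60,
)
resp.raise_for_status()
rag = resp.json()
print(rag["answer"][:200])                          # generated answer text
print(len(rag.get("citations", [])), "citations")   # citation list consumed by the UI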
config.py ADDED
@@ -0,0 +1,217 @@
1
+ """
2
+ Central configuration file for the Multi-Method RAG System.
3
+ All shared parameters and settings are defined here.
4
+ """
5
+
6
+ import os
7
+ from pathlib import Path
8
+ from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv(override=True)
12
+
13
+ # ==================== Versioning and Date ====================
14
+ DATE = "August 13, 2025"
15
+ VERSION = "2.0.1"
16
+
17
+
18
+ # ==================== API Configuration ====================
19
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
20
+ OPENAI_CHAT_MODEL = "gpt-5-chat-latest" # Non-reasoning GPT-5 chat model, used here for lower response latency
21
+ OPENAI_EMBEDDING_MODEL = "text-embedding-3-large" # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002
22
+
23
+ # ==================== Realtime API Configuration ====================
24
+ # OpenAI Realtime API settings for speech-to-speech functionality
25
+ OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview" # Realtime model for speech-to-speech
26
+ REALTIME_VOICE = "alloy" # Available voices: alloy, echo, fable, onyx, nova, shimmer
27
+ REALTIME_INSTRUCTIONS = (
28
+ "You are a knowledgeable safety expert speaking naturally in conversation. "
29
+
30
+ "VOICE BEHAVIOR: "
31
+ "- Speak like a confident safety professional talking to a colleague "
32
+ "- Acknowledge what you heard: 'You're asking about [topic]...' "
33
+ "- Use natural speech with appropriate pauses and emphasis "
34
+ "- Sound authoritative and knowledgeable - you ARE the expert "
35
+ "- Never mention document names, page numbers, or citation details when speaking "
36
+ "- Just state the facts naturally as if you know them from your expertise "
37
+
38
+ "RESPONSE PROCESS: "
39
+ "1. Briefly acknowledge the question: 'You're asking about [topic]...' "
40
+ "2. Call ask_rag to get the accurate information "
41
+ "3. Speak the information naturally as YOUR expertise, not as 'according to document X' "
42
+ "4. Organize complex topics: 'There are three key requirements here...' "
43
+ "5. Be thorough but conversational - like explaining to a colleague "
44
+
45
+ "CITATION RULE: "
46
+ "NEVER mention specific documents, sources, or page numbers in speech. "
47
+ "Just state the information confidently as if it's your professional knowledge. "
48
+ "For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' "
49
+
50
+ "IMPORTANT: Always use ask_rag for safety questions to get accurate information, "
51
+ "but speak the results as your own expertise, not as citations."
52
+ )
53
+
54
+ # ==================== Model Parameters ====================
55
+ # Generation parameters
56
+ DEFAULT_TEMPERATURE = 0 # Range: 0.0-1.0 (0=deterministic, 1=creative)
57
+ DEFAULT_MAX_TOKENS = 5000 # Maximum tokens in response
58
+ DEFAULT_TOP_K = 5 # Number of chunks to retrieve by default
59
+ DEFAULT_TOP_P = 1.0 # Nucleus sampling parameter
60
+
61
+ # Context window management
62
+ MAX_CONTEXT_TOKENS = 7500 # Maximum context for models with 8k window
63
+ CHUNK_SIZE = 2000 # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens)
64
+ CHUNK_OVERLAP = 200 # Token overlap between chunks
65
+
66
+ # ==================== Embedding Models ====================
67
+ # Sentence Transformers models
68
+ SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2' # For DPR
69
+ CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # For re-ranking
70
+
71
+ # CLIP model
72
+ CLIP_MODEL = "ViT-L/14" # Options: ViT-B/32, ViT-L/14, RN50
73
+
74
+ # ==================== Search Parameters ====================
75
+ # BM25 parameters
76
+ BM25_K1 = 1.5 # Term frequency saturation parameter
77
+ BM25_B = 0.75 # Length normalization parameter
78
+
79
+ # Hybrid search
80
+ DEFAULT_HYBRID_ALPHA = 0.5 # Weight for BM25 (1-alpha for semantic)
81
+
82
+ # Re-ranking
83
+ RERANK_MULTIPLIER = 2 # Retrieve top_k * RERANK_MULTIPLIER candidates before re-ranking
84
+ MIN_RELEVANCE_SCORE = 0.3 # Minimum score threshold
85
+
86
+ # ==================== Directory Structure ====================
87
+ # Project directories
88
+ PROJECT_ROOT = Path(__file__).parent
89
+ DATA_DIR = PROJECT_ROOT / "data"
90
+ EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings"
91
+ GRAPH_DIR = PROJECT_ROOT / "graph"
92
+ METADATA_DIR = PROJECT_ROOT / "metadata"
93
+ IMAGES_DIR = DATA_DIR / "images"
94
+
95
+ # File paths
96
+ VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index"
97
+ VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl"
98
+ DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index"
99
+ DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl"
100
+ BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl"
101
+ CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl"
102
+ GRAPH_FILE = GRAPH_DIR / "graph.gml"
103
+ IMAGES_DB = METADATA_DIR / "images.db"
104
+ CHROMA_PATH = EMBEDDINGS_DIR / "chroma"
105
+
106
+ # ==================== Batch Processing ====================
107
+ EMBEDDING_BATCH_SIZE = 100 # Batch size for OpenAI embeddings
108
+ PROCESSING_BATCH_SIZE = 50 # Documents to process at once
109
+
110
+ # ==================== UI Configuration ====================
111
+ # Streamlit settings
112
+ MAX_CHAT_HISTORY = 5 # Maximum chat messages to keep
113
+ EXAMPLE_QUESTIONS = [
114
+ "What are general machine guarding requirements?",
115
+ "How do I perform lockout/tagout?",
116
+ "What safety measures are needed for robotic systems?",
117
+ "Explain the difference between guards and devices in machine safety.",
118
+ "What are the OSHA requirements for emergency stops?",
119
+ ]
120
+
121
+ # Default method
122
+ DEFAULT_METHOD = "graph"
123
+
124
+ # Method descriptions for UI
125
+ METHOD_DESCRIPTIONS = {
126
+ 'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
127
+ 'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
128
+ 'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
129
+ 'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
130
+ 'context': "Context stuffing with full document loading and heuristic selection",
131
+ 'vision': "Vision-based search using GPT-5 Vision for image analysis and classification"
132
+ }
133
+
134
+ # ==================== Document Processing ====================
135
+ # Document types
136
+ SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html']
137
+ IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif']
138
+
139
+ # Text splitting
140
+ MARKDOWN_HEADER_LEVEL = 3 # Split by this header level (###)
141
+ MAX_SECTIONS_PER_DOC = 500 # Maximum sections to extract from a document
142
+
143
+ # ==================== Logging ====================
144
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") # DEBUG, INFO, WARNING, ERROR
145
+ LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
146
+
147
+ # ==================== Performance ====================
148
+ # Device configuration
149
+ import torch
150
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
151
+ NUM_WORKERS = 4 # Parallel processing workers
152
+
153
+ # Cache settings
154
+ ENABLE_CACHE = True
155
+ CACHE_TTL = 3600 # Cache time-to-live in seconds
156
+
157
+ # ==================== Safety & Validation ====================
158
+ # Input validation
159
+ MAX_QUESTION_LENGTH = 1000 # Maximum characters in a question
160
+ MAX_IMAGE_SIZE_MB = 10 # Maximum image file size
161
+
162
+ # Rate limiting (if needed)
163
+ RATE_LIMIT_ENABLED = False
164
+ MAX_QUERIES_PER_MINUTE = 60
165
+
166
+ # ==================== Default HTML Sources ====================
167
+ DEFAULT_HTML_SOURCES = [
168
+ {
169
+ "title": "NIOSH Robotics in the Workplace – Safety Overview",
170
+ "url": "https://www.cdc.gov/niosh/robotics/about/",
171
+ "source": "NIOSH",
172
+ "year": 2024,
173
+ "category": "Technical Guide",
174
+ "format": "HTML"
175
+ }
176
+ ]
177
+
178
+ # ==================== Helper Functions ====================
179
+ def ensure_directories():
180
+ """Create all required directories if they don't exist."""
181
+ for directory in [DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR]:
182
+ directory.mkdir(parents=True, exist_ok=True)
183
+
184
+ def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int:
185
+ """Get the context length for a given model."""
186
+ context_lengths = {
187
+ "gpt-5": 128000,
188
+ "gpt-4o-mini": 8192,
189
+ "gpt-4o": 128000,
190
+ }
191
+ return context_lengths.get(model_name, 4096)
192
+
193
+ def validate_api_key():
194
+ """Check if OpenAI API key is set."""
195
+ if not OPENAI_API_KEY:
196
+ raise ValueError(
197
+ "OpenAI API key not found. Please set OPENAI_API_KEY in .env file."
198
+ )
199
+ return True
200
+
201
+ # ==================== System Info ====================
202
+ def print_config():
203
+ """Print current configuration for debugging."""
204
+ print("="*50)
205
+ print("RAG System Configuration")
206
+ print("="*50)
207
+ print(f"OpenAI Model: {OPENAI_CHAT_MODEL}")
208
+ print(f"Embedding Model: {OPENAI_EMBEDDING_MODEL}")
209
+ print(f"Device: {DEVICE}")
210
+ print(f"Default Temperature: {DEFAULT_TEMPERATURE}")
211
+ print(f"Default Top-K: {DEFAULT_TOP_K}")
212
+ print(f"Chunk Size: {CHUNK_SIZE}")
213
+ print(f"Project Root: {PROJECT_ROOT}")
214
+ print("="*50)
215
+
216
+ # Ensure directories exist on import
217
+ ensure_directories()
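A minimal usage sketch (not part of the committed code) showing how other modules are expected to consume this config; only functions and constants defined above are used, and printed values depend on your environment and .env:

import config

config.validate_api_key()   # raises ValueError if OPENAI_API_KEY is not set
config.print_config()       # dumps the active model, device, and chunking settings

# Paths and limits are plain module-level constants:
print(config.VANILLA_FAISS_INDEX)                    # embeddings/vanilla_faiss.index
print(config.get_model_context_length("gpt-4o"))     # 128000
print(config.get_model_context_length(config.OPENAI_CHAT_MODEL))  # falls back to 4096 for unlisted models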
preprocess.py CHANGED
@@ -1,85 +1,195 @@
1
- import os
2
- import re
3
- import glob
4
- from dotenv import load_dotenv
5
- import requests
6
- from bs4 import BeautifulSoup
7
- import pandas as pd
8
- import pymupdf4llm
9
- import networkx as nx
10
- from openai import OpenAI
11
 
12
- # Load environment and initialize OpenAI client
13
- load_dotenv(override=True)
14
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 
 
15
 
16
- # Helper: split Markdown text by third-level headers
17
- def split_by_header(md_text):
18
- parts = re.split(r'(?m)^### ', md_text)
19
- return [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]
20
 
21
- # Initialize graph database
22
- G = nx.Graph()
 
 
 
23
 
24
- # Process local PDFs
25
- for pdf_path in glob.glob("scrapped_data/*.pdf"):
26
- filename = os.path.basename(pdf_path)
27
- title = os.path.splitext(filename)[0]
28
- # Convert PDF to Markdown
29
- md_text = pymupdf4llm.to_markdown(pdf_path)
30
- # Split into sections
31
- sections = split_by_header(md_text)
32
- for idx, sec in enumerate(sections):
33
- resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
34
- vector = resp.data[0].embedding
35
- node_id = f"PDF::{title}::section{idx}"
36
- # Store the local file path for citation
37
- G.add_node(node_id,
38
- text=sec,
39
- embedding=vector,
40
- source=title,
41
- path=pdf_path)
42
 
43
- # HTML Document List
44
- html_data = [
45
- {
46
- "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
47
- "url": "https://www.cdc.gov/niosh/robotics/about/",
48
- "source": "NIOSH",
49
- "year": 2024,
50
- "category": "Technical Guide",
51
- "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
52
- "format": "HTML"
53
- }
54
- ]
55
 
56
- # Process HTML sources
57
- def process_html(item):
58
- resp = requests.get(item['url'])
59
- resp.raise_for_status()
60
- soup = BeautifulSoup(resp.text, 'html.parser')
61
- # Extract paragraph texts
62
- texts = [p.get_text() for p in soup.find_all('p')]
63
- # Extract tables as markdown
64
- tables = []
65
- for t in soup.find_all('table'):
66
- df = pd.read_html(str(t))[0]
67
- tables.append(df.to_markdown())
68
- # Join paragraphs and tables with double newlines
69
- full = "\n\n".join(texts + tables)
70
- # Embed the combined text
71
- resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
72
- vec = resp_emb.data[0].embedding
73
- node_id = f"HTML::{item['title']}"
74
- # Add node with URL citation
75
- G.add_node(
76
- node_id, text=full, embedding=vec, source=item['title'], url=item['url']
77
- )
 
78
 
79
- # Run HTML processing
80
- for item in html_data:
81
- process_html(item)
82
 
83
- # Save graph
84
- nx.write_gml(G, "graph.gml")
85
- print("Graph RAG database created: graph.gml")
 
 
1
+ """
2
+ Refactored preprocessing pipeline for all RAG methods.
3
+ Uses utils.py functions and supports multiple retrieval methods.
 
 
 
 
 
 
 
4
 
5
+ Directory Layout:
6
+ /data/ # Original PDFs, HTML
7
+ /embeddings/ # FAISS, Chroma, DPR vector stores
8
+ /graph/ # Graph database files
9
+ /metadata/ # Image metadata (SQLite or MongoDB)
10
+ """
11
 
12
+ import logging
13
+ from pathlib import Path
 
 
14
 
15
+ from config import *
16
+ from utils import (
17
+ DocumentLoader, TextPreprocessor, VectorStoreManager,
18
+ ImageProcessor, ImageData
19
+ )
20
 
21
+ logger = logging.getLogger(__name__)
 
 
22
 
23
+ # Ensure all directories exist
24
+ ensure_directories()
 
 
25
 
26
+ def preprocess_for_method(method: str, documents: list):
27
+ """Preprocess documents for a specific retrieval method."""
28
+
29
+ print(f"\n{'='*50}")
30
+ print(f"Preprocessing for method: {method}")
31
+ print(f"{'='*50}")
32
+
33
+ try:
34
+ # Initialize processors
35
+ text_processor = TextPreprocessor()
36
+ vector_manager = VectorStoreManager()
37
+
38
+ # Preprocess text chunks for this method
39
+ chunks = text_processor.preprocess_for_method(documents, method)
40
+
41
+ if method == 'vanilla':
42
+ # Build FAISS index with OpenAI embeddings
43
+ index, metadata = vector_manager.build_faiss_index(chunks, method="vanilla")
44
+ vector_manager.save_index(index, metadata, method)
45
+
46
+ elif method == 'dpr':
47
+ # Build FAISS index with sentence transformer embeddings
48
+ index, metadata = vector_manager.build_faiss_index(chunks, method="dpr")
49
+ vector_manager.save_index(index, metadata, method)
50
+
51
+ elif method == 'bm25':
52
+ # Build BM25 index
53
+ bm25_index = vector_manager.build_bm25_index(chunks)
54
+ vector_manager.save_index(bm25_index, chunks, method)
55
+
56
+ elif method == 'graph':
57
+ # Build NetworkX graph
58
+ graph = vector_manager.build_graph_index(chunks)
59
+ vector_manager.save_index(graph, None, method)
60
+
61
+ elif method == 'context_stuffing':
62
+ # Save full documents for context stuffing
63
+ vector_manager.save_index(None, chunks, method)
64
+
65
+ else:
66
+ raise ValueError(f"Unknown method: {method}")
67
+
68
+ print(f"Successfully preprocessed for method '{method}'")
69
+
70
+ except Exception as e:
71
+ logger.error(f"Error preprocessing for {method}: {e}")
72
+ raise
73
 
 
 
 
74
 
75
+ def extract_and_process_images(documents: list):
76
+ """Extract images from documents and process them."""
77
+ print("\n" + "="*50)
78
+ print("Extracting and processing images...")
79
+ print("="*50)
80
+
81
+ image_processor = ImageProcessor()
82
+ processed_count = 0
83
+ filtered_count = 0
84
+ filter_reasons = {}
85
+
86
+ for doc in documents:
87
+ if 'images' in doc and doc['images']:
88
+ for image_info in doc['images']:
89
+ try:
90
+ # Check if image should be filtered out
91
+ should_filter, reason = image_processor.should_filter_image(image_info['image_path'])
92
+
93
+ if should_filter:
94
+ filtered_count += 1
95
+ filter_reasons[reason] = filter_reasons.get(reason, 0) + 1
96
+ print(f" Filtered: {image_info['image_id']} - {reason}")
97
+
98
+ # Optionally delete the filtered image file
99
+ try:
100
+ import os
101
+ os.remove(image_info['image_path'])
102
+ print(f" Deleted: {image_info['image_path']}")
103
+ except Exception as e:
104
+ logger.warning(f"Could not delete filtered image {image_info['image_path']}: {e}")
105
+
106
+ continue
107
+
108
+ # Classify image
109
+ classification = image_processor.classify_image(image_info['image_path'])
110
+
111
+ # Generate embedding (placeholder for now)
112
+ # embedding = embed_image_clip([image_info['image_path']])[0]
113
+
114
+ # Create ImageData object
115
+ image_data = ImageData(
116
+ image_path=image_info['image_path'],
117
+ image_id=image_info['image_id'],
118
+ classification=classification,
119
+ metadata={
120
+ 'source': doc['source'],
121
+ 'page': image_info.get('page'),
122
+ 'extracted_from': doc['path']
123
+ }
124
+ )
125
+
126
+ # Store in database
127
+ image_processor.store_image_metadata(image_data)
128
+ processed_count += 1
129
+
130
+ except Exception as e:
131
+ logger.error(f"Error processing image {image_info['image_id']}: {e}")
132
+ continue
133
+
134
+ # Print filtering summary
135
+ if filtered_count > 0:
136
+ print(f"\nImage Filtering Summary:")
137
+ print(f" Total filtered: {filtered_count}")
138
+ for reason, count in filter_reasons.items():
139
+ print(f" {reason}: {count}")
140
+ print()
141
+
142
+ if processed_count > 0:
143
+ print(f"Processed and stored metadata for {processed_count} images")
144
+ else:
145
+ print("No images found in documents")
146
+
147
+
148
+ def main():
149
+ """Main preprocessing pipeline."""
150
+ # Validate configuration
151
+ try:
152
+ validate_api_key()
153
+ except ValueError as e:
154
+ print(f"Error: {e}")
155
+ return
156
+
157
+ # Print configuration
158
+ print_config()
159
+
160
+ print("\nStarting preprocessing pipeline...")
161
+
162
+ # Load documents using utils
163
+ print("\nLoading documents...")
164
+ loader = DocumentLoader()
165
+ documents = loader.load_text_documents()
166
+
167
+ print(f"Loaded {len(documents)} documents")
168
+
169
+ # Define methods to preprocess
170
+ methods = ['vanilla', 'dpr', 'bm25', 'graph', 'context_stuffing']
171
+
172
+ # Preprocess for each method
173
+ for method in methods:
174
+ try:
175
+ preprocess_for_method(method, documents)
176
+ except Exception as e:
177
+ print(f"Error preprocessing for {method}: {e}")
178
+ import traceback
179
+ traceback.print_exc()
180
+
181
+ # Extract and process images
182
+ try:
183
+ extract_and_process_images(documents)
184
+ except Exception as e:
185
+ print(f"Error processing images: {e}")
186
+ import traceback
187
+ traceback.print_exc()
188
+
189
+ print("\n" + "="*50)
190
+ print("Preprocessing complete!")
191
+ print("="*50)
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
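To rebuild a single index without re-running the whole pipeline, the same entry points can be called directly. A sketch (not part of the commit) that assumes data/ is already populated and that the utils classes behave as they are used above:

from utils import DocumentLoader
from preprocess import preprocess_for_method, extract_and_process_images

docs = DocumentLoader().load_text_documents()   # reads PDFs/HTML from data/
preprocess_for_method("bm25", docs)             # writes embeddings/bm25_index.pkl
# extract_and_process_images(docs)              # optional: refresh image metadata only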
query_bm25.py ADDED
@@ -0,0 +1,336 @@
1
+ """
2
+ BM25 keyword search with cross-encoder re-ranking and hybrid search support.
3
+ """
4
+
5
+ import numpy as np
6
+ import faiss
7
+ from typing import Tuple, List, Optional
8
+ from openai import OpenAI
9
+ from sentence_transformers import CrossEncoder
10
+
11
+ import config
12
+ import utils
13
+
14
+ # Initialize models
15
+ client = OpenAI(api_key=config.OPENAI_API_KEY)
16
+ cross_encoder = CrossEncoder(config.CROSS_ENCODER_MODEL)
17
+
18
+ # Global variables for lazy loading
19
+ _bm25_index = None
20
+ _texts = None
21
+ _metadata = None
22
+ _semantic_index = None
23
+
24
+ def _load_bm25_index():
25
+ """Lazy load BM25 index and metadata."""
26
+ global _bm25_index, _texts, _metadata, _semantic_index
27
+
28
+ if _bm25_index is None:
29
+ # Initialize defaults
30
+ _texts = []
31
+ _metadata = []
32
+ _semantic_index = None
33
+ try:
34
+ import pickle
35
+
36
+ if config.BM25_INDEX.exists():
37
+ with open(config.BM25_INDEX, 'rb') as f:
38
+ bm25_data = pickle.load(f)
39
+
40
+ if isinstance(bm25_data, dict):
41
+ _bm25_index = bm25_data.get('index') or bm25_data.get('bm25')
42
+ chunks = bm25_data.get('texts', [])
43
+ if chunks:
44
+ _texts = [chunk.text for chunk in chunks if hasattr(chunk, 'text')]
45
+ _metadata = [chunk.metadata for chunk in chunks if hasattr(chunk, 'metadata')]
46
+ else:
47
+ _texts = []
48
+ _metadata = []
49
+
50
+ # Load semantic embeddings if available for hybrid search
51
+ if 'embeddings' in bm25_data:
52
+ semantic_embeddings = bm25_data['embeddings']
53
+ # Build FAISS index
54
+ import faiss
55
+ dimension = semantic_embeddings.shape[1]
56
+ _semantic_index = faiss.IndexFlatIP(dimension)
57
+ faiss.normalize_L2(semantic_embeddings)
58
+ _semantic_index.add(semantic_embeddings)
59
+ else:
60
+ _bm25_index = bm25_data
61
+ _texts = []
62
+ _metadata = []
63
+
64
+ print(f"Loaded BM25 index with {len(_texts)} documents")
65
+ else:
66
+ print("BM25 index not found. Run preprocess.py first.")
67
+
68
+ except Exception as e:
69
+ print(f"Error loading BM25 index: {e}")
70
+ _bm25_index = None
71
+ _texts = []
72
+ _metadata = []
73
+
74
+
75
+ def query(question: str, image_path: Optional[str] = None, top_k: int = None) -> Tuple[str, List[dict]]:
76
+ """
77
+ Query using BM25 keyword search with re-ranking.
78
+
79
+ Args:
80
+ question: User's question
81
+ image_path: Optional path to an image
82
+ top_k: Number of relevant chunks to retrieve
83
+
84
+ Returns:
85
+ Tuple of (answer, citations)
86
+ """
87
+ if top_k is None:
88
+ top_k = config.DEFAULT_TOP_K
89
+
90
+ # Load index if not already loaded
91
+ _load_bm25_index()
92
+
93
+ if _bm25_index is None or len(_texts) == 0:
94
+ return "BM25 index not loaded. Please run preprocess.py first.", []
95
+
96
+ # Tokenize query for BM25
97
+ tokenized_query = question.lower().split()
98
+
99
+ # Get BM25 scores
100
+ bm25_scores = _bm25_index.get_scores(tokenized_query)
101
+
102
+ # Get top candidates (retrieve more for re-ranking)
103
+ top_indices = np.argsort(bm25_scores)[::-1][:top_k * config.RERANK_MULTIPLIER]
104
+
105
+ # Prepare candidates for re-ranking
106
+ candidates = []
107
+ for idx in top_indices:
108
+ if idx < len(_texts) and bm25_scores[idx] > 0:
109
+ candidates.append({
110
+ 'text': _texts[idx],
111
+ 'bm25_score': bm25_scores[idx],
112
+ 'metadata': _metadata[idx],
113
+ 'idx': idx
114
+ })
115
+
116
+ # Re-rank with cross-encoder
117
+ if candidates:
118
+ pairs = [[question, cand['text']] for cand in candidates]
119
+ cross_scores = cross_encoder.predict(pairs)
120
+
121
+ # Add cross-encoder scores and sort
122
+ for i, score in enumerate(cross_scores):
123
+ candidates[i]['cross_score'] = score
124
+
125
+ candidates = sorted(candidates, key=lambda x: x['cross_score'], reverse=True)[:top_k]
126
+
127
+ # Collect citations
128
+ citations = []
129
+ sources_seen = set()
130
+
131
+ for chunk in candidates:
132
+ chunk_meta = chunk['metadata']
133
+
134
+ if chunk_meta['source'] not in sources_seen:
135
+ citation = {
136
+ 'source': chunk_meta['source'],
137
+ 'type': chunk_meta['type'],
138
+ 'bm25_score': round(chunk['bm25_score'], 3),
139
+ 'rerank_score': round(chunk['cross_score'], 3)
140
+ }
141
+
142
+ if chunk_meta['type'] == 'pdf':
143
+ citation['path'] = chunk_meta['path']
144
+ else:
145
+ citation['url'] = chunk_meta.get('url', '')
146
+
147
+ citations.append(citation)
148
+ sources_seen.add(chunk_meta['source'])
149
+
150
+ # Handle image if provided
151
+ image_context = ""
152
+ if image_path:
153
+ try:
154
+ classification = utils.classify_image(image_path)
155
+ # classification is a string, not a dict
156
+ image_context = f"\n\n[Image Analysis: The image appears to show a {classification}.]"
157
+ except Exception as e:
158
+ print(f"Error processing image: {e}")
159
+
160
+ # Build context from retrieved chunks
161
+ context = "\n\n---\n\n".join([chunk['text'] for chunk in candidates])
162
+
163
+ if not context:
164
+ return "No relevant documents found for your query.", []
165
+
166
+ # Generate answer
167
+ prompt = f"""Answer the following question using the retrieved documents:
168
+
169
+ Retrieved Documents:
170
+ {context}{image_context}
171
+
172
+ Question: {question}
173
+
174
+ Instructions:
175
+ 1. Provide a comprehensive answer based on the retrieved documents
176
+ 2. Mention specific details from the sources
177
+ 3. If the documents don't fully answer the question, indicate what information is missing"""
178
+
179
+ # For GPT-5, temperature must be default (1.0)
180
+ response = client.chat.completions.create(
181
+ model=config.OPENAI_CHAT_MODEL,
182
+ messages=[
183
+ {"role": "system", "content": "You are a technical expert on manufacturing safety and regulations. Provide accurate, detailed answers based on the retrieved documents."},
184
+ {"role": "user", "content": prompt}
185
+ ],
186
+ max_completion_tokens=config.DEFAULT_MAX_TOKENS
187
+ )
188
+
189
+ answer = response.choices[0].message.content
190
+
191
+ return answer, citations
192
+
193
+
194
+ def query_hybrid(question: str, top_k: int = None, alpha: float = None) -> Tuple[str, List[dict]]:
195
+ """
196
+ Hybrid search combining BM25 and semantic search.
197
+
198
+ Args:
199
+ question: User's question
200
+ top_k: Number of relevant chunks to retrieve
201
+ alpha: Weight for BM25 scores (1-alpha for semantic)
202
+
203
+ Returns:
204
+ Tuple of (answer, citations)
205
+ """
206
+ if top_k is None:
207
+ top_k = config.DEFAULT_TOP_K
208
+ if alpha is None:
209
+ alpha = config.DEFAULT_HYBRID_ALPHA
210
+
211
+ # Load index if not already loaded
212
+ _load_bm25_index()
213
+
214
+ if _bm25_index is None or _semantic_index is None:
215
+ return "Hybrid search requires both BM25 and semantic indices. Please run preprocess.py with semantic embeddings.", []
216
+
217
+ # Get BM25 scores
218
+ tokenized_query = question.lower().split()
219
+ bm25_scores = _bm25_index.get_scores(tokenized_query)
220
+
221
+ # Normalize BM25 scores
222
+ if bm25_scores.max() > 0:
223
+ bm25_scores = bm25_scores / bm25_scores.max()
224
+
225
+ # Get semantic scores using FAISS
226
+ embedding_generator = utils.EmbeddingGenerator()
227
+ query_embedding = embedding_generator.embed_text_openai([question]).astype(np.float32)
228
+ faiss.normalize_L2(query_embedding)
229
+
230
+ # Search semantic index for all documents
231
+ k_search = min(len(_texts), top_k * config.RERANK_MULTIPLIER)
232
+ distances, indices = _semantic_index.search(query_embedding.reshape(1, -1), k_search)
233
+
234
+ # Create semantic scores array
235
+ semantic_scores = np.zeros(len(_texts))
236
+ for idx, dist in zip(indices[0], distances[0]):
237
+ if idx < len(_texts):
238
+ semantic_scores[idx] = dist
239
+
240
+ # Combine scores
241
+ hybrid_scores = alpha * bm25_scores + (1 - alpha) * semantic_scores
242
+
243
+ # Get top candidates
244
+ top_indices = np.argsort(hybrid_scores)[::-1][:top_k * config.RERANK_MULTIPLIER]
245
+
246
+ # Prepare candidates
247
+ candidates = []
248
+ for idx in top_indices:
249
+ if idx < len(_texts) and hybrid_scores[idx] > 0:
250
+ candidates.append({
251
+ 'text': _texts[idx],
252
+ 'hybrid_score': hybrid_scores[idx],
253
+ 'bm25_score': bm25_scores[idx],
254
+ 'semantic_score': semantic_scores[idx],
255
+ 'metadata': _metadata[idx],
256
+ 'idx': idx
257
+ })
258
+
259
+ # Re-rank with cross-encoder
260
+ if candidates:
261
+ pairs = [[question, cand['text']] for cand in candidates]
262
+ cross_scores = cross_encoder.predict(pairs)
263
+
264
+ for i, score in enumerate(cross_scores):
265
+ candidates[i]['cross_score'] = score
266
+
267
+ # Final ranking using cross-encoder scores
268
+ candidates = sorted(candidates, key=lambda x: x['cross_score'], reverse=True)[:top_k]
269
+
270
+ # Collect citations
271
+ citations = []
272
+ sources_seen = set()
273
+
274
+ for chunk in candidates:
275
+ chunk_meta = chunk['metadata']
276
+
277
+ if chunk_meta['source'] not in sources_seen:
278
+ citation = {
279
+ 'source': chunk_meta['source'],
280
+ 'type': chunk_meta['type'],
281
+ 'hybrid_score': round(chunk['hybrid_score'], 3),
282
+ 'rerank_score': round(chunk.get('cross_score', 0), 3)
283
+ }
284
+
285
+ if chunk_meta['type'] == 'pdf':
286
+ citation['path'] = chunk_meta['path']
287
+ else:
288
+ citation['url'] = chunk_meta.get('url', '')
289
+
290
+ citations.append(citation)
291
+ sources_seen.add(chunk_meta['source'])
292
+
293
+ # Build context
294
+ context = "\n\n---\n\n".join([chunk['text'] for chunk in candidates])
295
+
296
+ if not context:
297
+ return "No relevant documents found for your query.", []
298
+
299
+ # Generate answer
300
+ prompt = f"""Using the following retrieved passages, answer the question:
301
+
302
+ {context}
303
+
304
+ Question: {question}
305
+
306
+ Provide a clear, detailed answer based on the information in the passages."""
307
+
308
+ # For GPT-5, temperature must be default (1.0)
309
+ response = client.chat.completions.create(
310
+ model=config.OPENAI_CHAT_MODEL,
311
+ messages=[
312
+ {"role": "system", "content": "You are a safety expert. Answer questions accurately using the provided passages."},
313
+ {"role": "user", "content": prompt}
314
+ ],
315
+ max_completion_tokens=config.DEFAULT_MAX_TOKENS
316
+ )
317
+
318
+ answer = response.choices[0].message.content
319
+
320
+ return answer, citations
321
+
322
+
323
+ if __name__ == "__main__":
324
+ # Test BM25 query
325
+ test_questions = [
326
+ "lockout tagout procedures",
327
+ "machine guard requirements OSHA",
328
+ "robot safety collaborative workspace"
329
+ ]
330
+
331
+ for q in test_questions:
332
+ print(f"\nQuestion: {q}")
333
+ answer, citations = query(q)
334
+ print(f"Answer: {answer[:200]}...")
335
+ print(f"Citations: {citations}")
336
+ print("-" * 50)
query_context.py ADDED
@@ -0,0 +1,334 @@
1
+ """
2
+ Context stuffing query module.
3
+ Loads full documents and uses heuristics to select relevant content.
4
+ """
5
+
6
+ import pickle
7
+ import logging
8
+ import re
9
+ from typing import List, Tuple, Optional, Dict, Any
10
+ from openai import OpenAI
11
+ import tiktoken
12
+
13
+ from config import *
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ class ContextStuffingRetriever:
18
+ """Context stuffing with heuristic document selection."""
19
+
20
+ def __init__(self):
21
+ self.client = OpenAI(api_key=OPENAI_API_KEY)
22
+ self.encoding = tiktoken.get_encoding("cl100k_base")
23
+ self.documents = None
24
+ self._load_documents()
25
+
26
+ def _load_documents(self):
27
+ """Load full documents for context stuffing."""
28
+ try:
29
+ if CONTEXT_DOCS.exists():
30
+ logger.info("Loading documents for context stuffing...")
31
+
32
+ with open(CONTEXT_DOCS, 'rb') as f:
33
+ data = pickle.load(f)
34
+
35
+ if isinstance(data, list) and len(data) > 0:
36
+ # Handle both old format (list of chunks) and new format (list of DocumentChunk objects)
37
+ if hasattr(data[0], 'text'): # New format with DocumentChunk objects
38
+ self.documents = []
39
+ for chunk in data:
40
+ self.documents.append({
41
+ 'text': chunk.text,
42
+ 'metadata': chunk.metadata,
43
+ 'chunk_id': chunk.chunk_id
44
+ })
45
+ else: # Old format with dict objects
46
+ self.documents = data
47
+
48
+ logger.info(f"✓ Loaded {len(self.documents)} documents for context stuffing")
49
+ else:
50
+ logger.warning("No documents found in context stuffing file")
51
+ self.documents = []
52
+ else:
53
+ logger.warning("Context stuffing documents not found. Run preprocess.py first.")
54
+ self.documents = []
55
+
56
+ except Exception as e:
57
+ logger.error(f"Error loading context stuffing documents: {e}")
58
+ self.documents = []
59
+
60
+ def _calculate_keyword_score(self, text: str, question: str) -> float:
61
+ """Calculate keyword overlap score between text and question."""
62
+ # Simple keyword matching heuristic
63
+ question_words = set(re.findall(r'\w+', question.lower()))
64
+ text_words = set(re.findall(r'\w+', text.lower()))
65
+
66
+ if not question_words:
67
+ return 0.0
68
+
69
+ overlap = len(question_words & text_words)
70
+ return overlap / len(question_words)
71
+
72
+ def _calculate_section_relevance(self, text: str, question: str) -> float:
73
+ """Calculate section relevance using multiple heuristics."""
74
+ score = 0.0
75
+
76
+ # Keyword overlap score (weight: 0.5)
77
+ keyword_score = self._calculate_keyword_score(text, question)
78
+ score += 0.5 * keyword_score
79
+
80
+ # Length penalty (prefer medium-length sections)
81
+ text_length = len(text.split())
82
+ optimal_length = 200 # words
83
+ length_score = min(1.0, text_length / optimal_length) if text_length < optimal_length else max(0.1, optimal_length / text_length)
84
+ score += 0.2 * length_score
85
+
86
+ # Header/title bonus (if text starts with common header patterns)
87
+ if re.match(r'^#+\s|^\d+\.\s|^[A-Z\s]{3,20}:', text.strip()):
88
+ score += 0.1
89
+
90
+ # Question type specific bonuses
91
+ question_lower = question.lower()
92
+ text_lower = text.lower()
93
+
94
+ if any(word in question_lower for word in ['what', 'define', 'definition']):
95
+ if any(phrase in text_lower for phrase in ['means', 'defined as', 'definition', 'refers to']):
96
+ score += 0.2
97
+
98
+ if any(word in question_lower for word in ['how', 'procedure', 'steps']):
99
+ if any(phrase in text_lower for phrase in ['step', 'procedure', 'process', 'method']):
100
+ score += 0.2
101
+
102
+ if any(word in question_lower for word in ['requirement', 'shall', 'must']):
103
+ if any(phrase in text_lower for phrase in ['shall', 'must', 'required', 'requirement']):
104
+ score += 0.2
105
+
106
+ return min(1.0, score) # Cap at 1.0
107
+
108
+ def select_relevant_documents(self, question: str, max_tokens: int = None) -> List[Dict[str, Any]]:
109
+ """Select most relevant documents using heuristics."""
110
+ if not self.documents:
111
+ return []
112
+
113
+ if max_tokens is None:
114
+ max_tokens = MAX_CONTEXT_TOKENS
115
+
116
+ # Score all documents
117
+ scored_docs = []
118
+ for doc in self.documents:
119
+ text = doc.get('text', '')
120
+ if text.strip():
121
+ relevance_score = self._calculate_section_relevance(text, question)
122
+
123
+ doc_info = {
124
+ 'text': text,
125
+ 'metadata': doc.get('metadata', {}),
126
+ 'score': relevance_score,
127
+ 'token_count': len(self.encoding.encode(text))
128
+ }
129
+ scored_docs.append(doc_info)
130
+
131
+ # Sort by relevance score
132
+ scored_docs.sort(key=lambda x: x['score'], reverse=True)
133
+
134
+ # Select documents within token limit
135
+ selected_docs = []
136
+ total_tokens = 0
137
+
138
+ for doc in scored_docs:
139
+ if doc['score'] > 0.1: # Minimum relevance threshold
140
+ if total_tokens + doc['token_count'] <= max_tokens:
141
+ selected_docs.append(doc)
142
+ total_tokens += doc['token_count']
143
+ else:
144
+ # Try to include a truncated version
145
+ remaining_tokens = max_tokens - total_tokens
146
+ if remaining_tokens > 100: # Only if meaningful content can fit
147
+ truncated_text = self._truncate_text(doc['text'], remaining_tokens)
148
+ if truncated_text:
149
+ doc['text'] = truncated_text
150
+ doc['token_count'] = len(self.encoding.encode(truncated_text))
151
+ selected_docs.append(doc)
152
+ break
153
+
154
+ logger.info(f"Selected {len(selected_docs)} documents with {total_tokens} total tokens")
155
+ return selected_docs
156
+
157
+ def _truncate_text(self, text: str, max_tokens: int) -> str:
158
+ """Truncate text to fit within token limit while preserving meaning."""
159
+ tokens = self.encoding.encode(text)
160
+ if len(tokens) <= max_tokens:
161
+ return text
162
+
163
+ # Truncate and try to end at a sentence boundary
164
+ truncated_tokens = tokens[:max_tokens]
165
+ truncated_text = self.encoding.decode(truncated_tokens)
166
+
167
+ # Try to end at a sentence boundary
168
+ sentences = re.split(r'[.!?]+', truncated_text)
169
+ if len(sentences) > 1:
170
+ # Remove the last incomplete sentence
171
+ truncated_text = '.'.join(sentences[:-1]) + '.'
172
+
173
+ return truncated_text
174
+
175
+ def generate_answer(self, question: str, context_docs: List[Dict[str, Any]]) -> str:
176
+ """Generate answer using full context stuffing approach."""
177
+ if not context_docs:
178
+ return "I couldn't find any relevant documents to answer your question."
179
+
180
+ try:
181
+ # Assemble context from selected documents
182
+ context_parts = []
183
+ sources = []
184
+
185
+ for i, doc in enumerate(context_docs, 1):
186
+ text = doc['text']
187
+ metadata = doc['metadata']
188
+ source = metadata.get('source', f'Document {i}')
189
+
190
+ context_parts.append(f"=== {source} ===\n{text}")
191
+ if source not in sources:
192
+ sources.append(source)
193
+
194
+ full_context = "\n\n".join(context_parts)
195
+
196
+ # Create system message for context stuffing
197
+ system_message = (
198
+ "You are an expert in occupational safety and health regulations. "
199
+ "Answer the user's question using the provided regulatory documents and technical materials. "
200
+ "Provide comprehensive, accurate answers that directly address the question. "
201
+ "Reference specific sections or requirements when applicable. "
202
+ "If the provided context doesn't fully answer the question, clearly state what information is missing."
203
+ )
204
+
205
+ # Create user message
206
+ user_message = f"""Based on the following regulatory and technical documents, please answer this question:
207
+
208
+ QUESTION: {question}
209
+
210
+ DOCUMENTS:
211
+ {full_context}
212
+
213
+ Please provide a thorough answer based on the information in these documents. If any important details are missing from the provided context, please indicate that as well."""
214
+
215
+ # For GPT-5, temperature must be default (1.0)
216
+ response = self.client.chat.completions.create(
217
+ model=OPENAI_CHAT_MODEL,
218
+ messages=[
219
+ {"role": "system", "content": system_message},
220
+ {"role": "user", "content": user_message}
221
+ ],
222
+ max_completion_tokens=DEFAULT_MAX_TOKENS
223
+ )
224
+
225
+ answer = response.choices[0].message.content.strip()
226
+
227
+ # Add source information
228
+ if len(sources) > 1:
229
+ answer += f"\n\n*Sources consulted: {', '.join(sources)}*"
230
+ elif sources:
231
+ answer += f"\n\n*Source: {sources[0]}*"
232
+
233
+ return answer
234
+
235
+ except Exception as e:
236
+ logger.error(f"Error generating context stuffing answer: {e}")
237
+ return "I apologize, but I encountered an error while generating the answer using context stuffing."
238
+
239
+ # Global retriever instance
240
+ _retriever = None
241
+
242
+ def get_retriever() -> ContextStuffingRetriever:
243
+ """Get or create global context stuffing retriever instance."""
244
+ global _retriever
245
+ if _retriever is None:
246
+ _retriever = ContextStuffingRetriever()
247
+ return _retriever
248
+
249
+ def query(question: str, image_path: Optional[str] = None, top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict]]:
250
+ """
251
+ Main context stuffing query function with unified signature.
252
+
253
+ Args:
254
+ question: User question
255
+ image_path: Optional image path (not used in context stuffing but kept for consistency)
256
+ top_k: Not used in context stuffing (uses heuristic selection instead)
257
+
258
+ Returns:
259
+ Tuple of (answer, citations)
260
+ """
261
+ try:
262
+ retriever = get_retriever()
263
+
264
+ # Select relevant documents using heuristics
265
+ relevant_docs = retriever.select_relevant_documents(question)
266
+
267
+ if not relevant_docs:
268
+ return "I couldn't find any relevant documents to answer your question.", []
269
+
270
+ # Generate comprehensive answer
271
+ answer = retriever.generate_answer(question, relevant_docs)
272
+
273
+ # Prepare citations
274
+ citations = []
275
+ for i, doc in enumerate(relevant_docs, 1):
276
+ metadata = doc['metadata']
277
+ citations.append({
278
+ 'rank': i,
279
+ 'score': float(doc['score']),
280
+ 'source': metadata.get('source', 'Unknown'),
281
+ 'type': metadata.get('type', 'unknown'),
282
+ 'method': 'context_stuffing',
283
+ 'tokens_used': doc['token_count']
284
+ })
285
+
286
+ logger.info(f"Context stuffing query completed. Used {len(citations)} documents.")
287
+ return answer, citations
288
+
289
+ except Exception as e:
290
+ logger.error(f"Error in context stuffing query: {e}")
291
+ error_message = "I apologize, but I encountered an error while processing your question with context stuffing."
292
+ return error_message, []
293
+
294
+ def query_with_details(question: str, image_path: Optional[str] = None,
295
+ top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict], List[Tuple]]:
296
+ """
297
+ Context stuffing query function that returns detailed chunk information (for compatibility).
298
+
299
+ Returns:
300
+ Tuple of (answer, citations, chunks)
301
+ """
302
+ answer, citations = query(question, image_path, top_k)
303
+
304
+ # Convert citations to chunk format for backward compatibility
305
+ chunks = []
306
+ for citation in citations:
307
+ chunks.append((
308
+ f"Document {citation['rank']} (Score: {citation['score']:.3f})",
309
+ citation['score'],
310
+ f"Context from {citation['source']} ({citation['tokens_used']} tokens)",
311
+ citation['source']
312
+ ))
313
+
314
+ return answer, citations, chunks
315
+
316
+ if __name__ == "__main__":
317
+ # Test the context stuffing system
318
+ test_question = "What are the general requirements for machine guarding?"
319
+
320
+ print("Testing context stuffing retrieval system...")
321
+ print(f"Question: {test_question}")
322
+ print("-" * 50)
323
+
324
+ try:
325
+ answer, citations = query(test_question)
326
+
327
+ print("Answer:")
328
+ print(answer)
329
+ print(f"\nCitations ({len(citations)} documents used):")
330
+ for citation in citations:
331
+ print(f"- {citation['source']} (Relevance: {citation['score']:.3f}, Tokens: {citation['tokens_used']})")
332
+
333
+ except Exception as e:
334
+ print(f"Error during testing: {e}")
query_dpr.py ADDED
@@ -0,0 +1,279 @@
1
+ """
2
+ Dense Passage Retrieval (DPR) query module.
3
+ Uses bi-encoder for retrieval and cross-encoder for re-ranking.
4
+ """
5
+
6
+ import pickle
7
+ import logging
8
+ from typing import List, Tuple, Optional
9
+ import numpy as np
10
+ import faiss
11
+ from sentence_transformers import SentenceTransformer, CrossEncoder
12
+ from openai import OpenAI
13
+
14
+ from config import *
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ class DPRRetriever:
19
+ """Dense Passage Retrieval with cross-encoder re-ranking."""
20
+
21
+ def __init__(self):
22
+ self.client = OpenAI(api_key=OPENAI_API_KEY)
23
+ self.bi_encoder = None
24
+ self.cross_encoder = None
25
+ self.index = None
26
+ self.metadata = None
27
+ self._load_models()
28
+ self._load_index()
29
+
30
+ def _load_models(self):
31
+ """Load bi-encoder and cross-encoder models."""
32
+ try:
33
+ logger.info("Loading DPR models...")
34
+ self.bi_encoder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
35
+ self.cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
36
+
37
+ if DEVICE == "cuda":
38
+ self.bi_encoder = self.bi_encoder.to(DEVICE)
39
+ self.cross_encoder = self.cross_encoder.to(DEVICE)
40
+
41
+ logger.info("✓ DPR models loaded successfully")
42
+
43
+ except Exception as e:
44
+ logger.error(f"Error loading DPR models: {e}")
45
+ raise
46
+
47
+ def _load_index(self):
48
+ """Load FAISS index and metadata."""
49
+ try:
50
+ if DPR_FAISS_INDEX.exists() and DPR_METADATA.exists():
51
+ logger.info("Loading DPR index and metadata...")
52
+
53
+ # Load FAISS index
54
+ self.index = faiss.read_index(str(DPR_FAISS_INDEX))
55
+
56
+ # Load metadata
57
+ with open(DPR_METADATA, 'rb') as f:
58
+ data = pickle.load(f)
59
+ self.metadata = data
60
+
61
+ logger.info(f"✓ Loaded DPR index with {len(self.metadata)} chunks")
62
+ else:
63
+ logger.warning("DPR index not found. Run preprocess.py first.")
64
+
65
+ except Exception as e:
66
+ logger.error(f"Error loading DPR index: {e}")
67
+ raise
68
+
69
+ def retrieve_candidates(self, question: str, top_k: int = DEFAULT_TOP_K) -> List[Tuple[str, float, dict]]:
70
+ """Retrieve candidate passages using bi-encoder."""
71
+ if self.index is None or self.metadata is None:
72
+ raise ValueError("DPR index not loaded. Run preprocess.py first.")
73
+
74
+ try:
75
+ # Encode question with bi-encoder
76
+ question_embedding = self.bi_encoder.encode([question], convert_to_numpy=True)
77
+
78
+ # Normalize for cosine similarity
79
+ faiss.normalize_L2(question_embedding)
80
+
81
+ # Search FAISS index
82
+ # Retrieve more candidates for re-ranking
83
+ retrieve_k = min(top_k * RERANK_MULTIPLIER, len(self.metadata))
84
+ scores, indices = self.index.search(question_embedding, retrieve_k)
85
+
86
+ # Prepare candidates
87
+ candidates = []
88
+ for score, idx in zip(scores[0], indices[0]):
89
+ if 0 <= idx < len(self.metadata):  # FAISS pads missing results with -1
90
+ chunk_data = self.metadata[idx]
91
+ candidates.append((
92
+ chunk_data['text'],
93
+ float(score),
94
+ chunk_data['metadata']
95
+ ))
96
+
97
+ logger.info(f"Retrieved {len(candidates)} candidates for re-ranking")
98
+ return candidates
99
+
100
+ except Exception as e:
101
+ logger.error(f"Error in candidate retrieval: {e}")
102
+ raise
103
+
104
+ def rerank_candidates(self, question: str, candidates: List[Tuple[str, float, dict]],
105
+ top_k: int = DEFAULT_TOP_K) -> List[Tuple[str, float, dict]]:
106
+ """Re-rank candidates using cross-encoder."""
107
+ if not candidates:
108
+ return []
109
+
110
+ try:
111
+ # Prepare pairs for cross-encoder
112
+ pairs = [(question, candidate[0]) for candidate in candidates]
113
+
114
+ # Get cross-encoder scores
115
+ cross_scores = self.cross_encoder.predict(pairs)
116
+
117
+ # Combine with candidate data and re-sort
118
+ reranked = []
119
+ for i, (text, bi_score, metadata) in enumerate(candidates):
120
+ cross_score = float(cross_scores[i])
121
+
122
+ # Filter by minimum relevance score
123
+ if cross_score >= MIN_RELEVANCE_SCORE:
124
+ reranked.append((text, cross_score, metadata))
125
+
126
+ # Sort by cross-encoder score (descending)
127
+ reranked.sort(key=lambda x: x[1], reverse=True)
128
+
129
+ # Return top-k
130
+ final_results = reranked[:top_k]
131
+ logger.info(f"Re-ranked to {len(final_results)} final results")
132
+
133
+ return final_results
134
+
135
+ except Exception as e:
136
+ logger.error(f"Error in re-ranking: {e}")
137
+ # Fall back to bi-encoder results
138
+ return candidates[:top_k]
139
+
140
+ def generate_answer(self, question: str, context_chunks: List[Tuple[str, float, dict]]) -> str:
141
+ """Generate answer using GPT with retrieved context."""
142
+ if not context_chunks:
143
+ return "I couldn't find relevant information to answer your question."
144
+
145
+ try:
146
+ # Prepare context
147
+ context_parts = []
148
+ for i, (text, score, metadata) in enumerate(context_chunks, 1):
149
+ source = metadata.get('source', 'Unknown')
150
+ context_parts.append(f"[Context {i}] Source: {source}\n{text}")
151
+
152
+ context = "\n\n".join(context_parts)
153
+
154
+ # Create system message
155
+ system_message = (
156
+ "You are a helpful assistant specialized in occupational safety and health. "
157
+ "Answer questions based only on the provided context. "
158
+ "If the context doesn't contain enough information, say so clearly. "
159
+ "Always cite the source when referencing information."
160
+ )
161
+
162
+ # Create user message
163
+ user_message = f"Context:\n{context}\n\nQuestion: {question}"
164
+
165
+ # Generate response
166
+ # For GPT-5, temperature must be default (1.0)
167
+ response = self.client.chat.completions.create(
168
+ model=OPENAI_CHAT_MODEL,
169
+ messages=[
170
+ {"role": "system", "content": system_message},
171
+ {"role": "user", "content": user_message}
172
+ ],
173
+ max_completion_tokens=DEFAULT_MAX_TOKENS
174
+ )
175
+
176
+ return response.choices[0].message.content.strip()
177
+
178
+ except Exception as e:
179
+ logger.error(f"Error generating answer: {e}")
180
+ return "I apologize, but I encountered an error while generating the answer."
181
+
182
+ # Global retriever instance
183
+ _retriever = None
184
+
185
+ def get_retriever() -> DPRRetriever:
186
+ """Get or create global DPR retriever instance."""
187
+ global _retriever
188
+ if _retriever is None:
189
+ _retriever = DPRRetriever()
190
+ return _retriever
191
+
192
+ def query(question: str, image_path: Optional[str] = None, top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[dict]]:
193
+ """
194
+ Main DPR query function with unified signature.
195
+
196
+ Args:
197
+ question: User question
198
+ image_path: Optional image path (not used in DPR but kept for consistency)
199
+ top_k: Number of top results to retrieve
200
+
201
+ Returns:
202
+ Tuple of (answer, citations)
203
+ """
204
+ try:
205
+ retriever = get_retriever()
206
+
207
+ # Step 1: Retrieve candidates with bi-encoder
208
+ candidates = retriever.retrieve_candidates(question, top_k)
209
+
210
+ if not candidates:
211
+ return "I couldn't find any relevant information for your question.", []
212
+
213
+ # Step 2: Re-rank with cross-encoder
214
+ reranked_candidates = retriever.rerank_candidates(question, candidates, top_k)
215
+
216
+ # Step 3: Generate answer
217
+ answer = retriever.generate_answer(question, reranked_candidates)
218
+
219
+ # Step 4: Prepare citations
220
+ citations = []
221
+ for i, (text, score, metadata) in enumerate(reranked_candidates, 1):
222
+ citations.append({
223
+ 'rank': i,
224
+ 'text': text,
225
+ 'score': float(score),
226
+ 'source': metadata.get('source', 'Unknown'),
227
+ 'type': metadata.get('type', 'unknown'),
228
+ 'method': 'dpr'
229
+ })
230
+
231
+ logger.info(f"DPR query completed. Retrieved {len(citations)} citations.")
232
+ return answer, citations
233
+
234
+ except Exception as e:
235
+ logger.error(f"Error in DPR query: {e}")
236
+ error_message = "I apologize, but I encountered an error while processing your question with DPR."
237
+ return error_message, []
238
+
239
+ def query_with_details(question: str, image_path: Optional[str] = None,
240
+ top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[dict], List[Tuple]]:
241
+ """
242
+ DPR query function that returns detailed chunk information (for compatibility).
243
+
244
+ Returns:
245
+ Tuple of (answer, citations, chunks)
246
+ """
247
+ answer, citations = query(question, image_path, top_k)
248
+
249
+ # Convert citations to chunk format for backward compatibility
250
+ chunks = []
251
+ for citation in citations:
252
+ chunks.append((
253
+ f"Rank {citation['rank']} (Score: {citation['score']:.3f})",
254
+ citation['score'],
255
+ citation['text'],
256
+ citation['source']
257
+ ))
258
+
259
+ return answer, citations, chunks
260
+
261
+ if __name__ == "__main__":
262
+ # Test the DPR system
263
+ test_question = "What are the general requirements for machine guarding?"
264
+
265
+ print("Testing DPR retrieval system...")
266
+ print(f"Question: {test_question}")
267
+ print("-" * 50)
268
+
269
+ try:
270
+ answer, citations = query(test_question)
271
+
272
+ print("Answer:")
273
+ print(answer)
274
+ print("\nCitations:")
275
+ for citation in citations:
276
+ print(f"- {citation['source']} (Score: {citation['score']:.3f})")
277
+
278
+ except Exception as e:
279
+ print(f"Error during testing: {e}")
query_graph.py CHANGED
@@ -1,140 +1,413 @@
1
- import os
 
 
 
 
2
  import numpy as np
3
- from dotenv import load_dotenv
 
4
  from openai import OpenAI
5
  import networkx as nx
6
  from sklearn.metrics.pairwise import cosine_similarity
7
 
 
 
 
 
 
8
  # Initialize OpenAI client
9
- load_dotenv(override=True)
10
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- # Load graph from GML
13
- G = nx.read_gml("graph.gml")
14
- enodes = list(G.nodes)
15
- embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
16
 
17
- def query_graph(question, top_k=5):
18
  """
19
- Embed the question, retrieve the top_k relevant chunks,
20
- and return: (answer, sources, chunks)
21
- - answer: generated response string
22
- - sources: list of unique source names
23
- - chunks: list of tuples (header, score, full_text, source_url_or_path)
 
 
 
 
24
  """
25
- # Embed question
 
 
 
 
 
 
 
26
  emb_resp = client.embeddings.create(
27
- model="text-embedding-3-large",
28
  input=question
29
  )
30
- q_vec = emb_resp.data[0].embedding
31
-
32
  # Compute cosine similarities
33
- sims = cosine_similarity([q_vec], embeddings)[0]
34
  idxs = sims.argsort()[::-1][:top_k]
35
-
36
  # Collect chunk-level info
37
  chunks = []
38
- sources = []
 
 
39
  for rank, i in enumerate(idxs, start=1):
40
- node = enodes[i]
41
- text = G.nodes[node]['text']
42
- header = text.split('\n', 1)[0].lstrip('# ').strip()
 
 
 
43
  score = sims[i]
44
- # Determine citation (URL for HTML, path for PDF)
45
- citation = G.nodes[node].get('url') or G.nodes[node].get('path') or G.nodes[node]['source']
46
- chunks.append((header, score, text, citation))
47
- sources.append(G.nodes[node]['source'])
48
- # Deduplicate sources
49
- sources = list(dict.fromkeys(sources))
50
-
51
- # Assemble prompt
52
- context = "\n\n---\n\n".join([c[2] for c in chunks])
53
- prompt = (
54
- "Use the following context to answer the question:\n\n" +
55
- context +
56
- f"\n\nQuestion: {question}\nAnswer:"
57
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- # Query chat model
 
 
 
 
 
 
60
  chat_resp = client.chat.completions.create(
61
- model="gpt-4o-mini",
62
  messages=[
63
- {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
64
- {"role": "user", "content": prompt}
65
- ]
 
66
  )
 
67
  answer = chat_resp.choices[0].message.content
 
 
68
 
69
- return answer, sources, chunks
70
 
 
71
  """
72
- Embed the user question, retrieve the top_k relevant chunks from the graph,
73
- assemble a prompt with those chunks, call the chat model, and return:
74
- - answer: the generated response
75
- - sources: unique list of source documents
76
- - chunks: list of (header, score, full_text) for the top_k passages
 
 
 
 
77
  """
78
- # Embed the question
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  emb_resp = client.embeddings.create(
80
- model="text-embedding-3-large",
81
  input=question
82
  )
83
- q_vec = emb_resp.data[0].embedding
84
-
85
- # Compute similarities against all stored embeddings
86
- sims = cosine_similarity([q_vec], embeddings)[0]
87
  idxs = sims.argsort()[::-1][:top_k]
88
-
89
- # Gather chunk‑level info and sources
90
  chunks = []
91
- sources = []
 
 
92
  for i in idxs:
93
- node = enodes[i]
94
- text = G.nodes[node]['text']
95
- # Use the first line as the header
96
- header = text.split('\n', 1)[0].lstrip('# ').strip()
97
- score = sims[i]
98
- chunks.append((header, score, text))
99
- sources.append(G.nodes[node]['source'])
100
- # Deduplicate sources while preserving order
101
- sources = list(dict.fromkeys(sources))
102
-
103
- # Assemble the prompt from the chunk texts
104
- context_text = "\n\n---\n\n".join([chunk[2] for chunk in chunks])
105
- prompt = (
106
- "Use the following context to answer the question:\n\n"
107
- + context_text
108
- + f"\n\nQuestion: {question}\nAnswer:"
109
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- # Call the chat model
112
- chat_resp = client.chat.completions.create(
113
- model="gpt-4o-mini",
 
 
 
 
 
 
 
114
  messages=[
115
- {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
116
- {"role": "user", "content": prompt}
117
- ]
 
118
  )
119
- answer = chat_resp.choices[0].message.content
 
 
 
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  return answer, sources, chunks
122
 
123
 
124
- # Test queries
125
- # test_questions = [
126
- # "What are general machine guarding requirements?",
127
- # "Explain the key steps in lockout/tagout procedures."
128
- # ]
129
-
130
- # for q in test_questions:
131
- # answer, sources, chunks = query_graph(q)
132
- # print(f"Q: {q}")
133
- # print(f"Answer: {answer}\n")
134
- # print("Sources:")
135
- # for src in sources:
136
- # print(f"- {src}")
137
- # print("\nTop Chunks:")
138
- # for header, score, _, citation in chunks:
139
- # print(f" * {header} (score: {score:.2f}) from {citation}")
140
- # print("\n", "#"*40, "\n")
 
1
+ """
2
+ Graph-based RAG using NetworkX.
3
+ Updated to match the common query signature used by other methods.
4
+ """
5
+
6
  import numpy as np
7
+ import logging
8
+ from typing import Tuple, List, Optional
9
  from openai import OpenAI
10
  import networkx as nx
11
  from sklearn.metrics.pairwise import cosine_similarity
12
 
13
+ from config import *
14
+ from utils import classify_image
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
  # Initialize OpenAI client
19
+ client = OpenAI(api_key=OPENAI_API_KEY)
20
+
21
+ # Global variables for lazy loading
22
+ _graph = None
23
+ _enodes = None
24
+ _embeddings = None
25
+
26
+ def _load_graph():
27
+ """Lazy load graph database."""
28
+ global _graph, _enodes, _embeddings
29
+
30
+ if _graph is None:
31
+ try:
32
+ if GRAPH_FILE.exists():
33
+ logger.info("Loading graph database...")
34
+ _graph = nx.read_gml(str(GRAPH_FILE))
35
+ _enodes = list(_graph.nodes)
36
+ # Convert embeddings from lists back to numpy arrays
37
+ embeddings_list = []
38
+ for n in _enodes:
39
+ embedding = _graph.nodes[n]['embedding']
40
+ if isinstance(embedding, list):
41
+ embeddings_list.append(np.array(embedding))
42
+ else:
43
+ embeddings_list.append(embedding)
44
+ _embeddings = np.array(embeddings_list)
45
+ logger.info(f"✓ Loaded graph with {len(_enodes)} nodes")
46
+ else:
47
+ logger.warning("Graph database not found. Run preprocess.py first.")
48
+ _graph = nx.Graph()
49
+ _enodes = []
50
+ _embeddings = np.array([])
51
+ except Exception as e:
52
+ logger.error(f"Error loading graph: {e}")
53
+ _graph = nx.Graph()
54
+ _enodes = []
55
+ _embeddings = np.array([])
56
 
 
 
 
 
57
 
58
+ def query(question: str, image_path: Optional[str] = None, top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[dict]]:
59
  """
60
+ Query using graph-based retrieval.
61
+
62
+ Args:
63
+ question: User's question
64
+ image_path: Optional path to an image (for multimodal queries)
65
+ top_k: Number of relevant chunks to retrieve
66
+
67
+ Returns:
68
+ Tuple of (answer, citations)
69
  """
70
+
71
+ # Load graph if not already loaded
72
+ _load_graph()
73
+
74
+ if len(_enodes) == 0:
75
+ return "Graph database is empty. Please run preprocess.py first.", []
76
+
77
+ # Embed question using OpenAI
78
  emb_resp = client.embeddings.create(
79
+ model=OPENAI_EMBEDDING_MODEL,
80
  input=question
81
  )
82
+ q_vec = np.array(emb_resp.data[0].embedding)
83
+
84
  # Compute cosine similarities
85
+ sims = cosine_similarity([q_vec], _embeddings)[0]
86
  idxs = sims.argsort()[::-1][:top_k]
87
+
88
  # Collect chunk-level info
89
  chunks = []
90
+ citations = []
91
+ sources_seen = set()
92
+
93
  for rank, i in enumerate(idxs, start=1):
94
+ node = _enodes[i]
95
+ node_data = _graph.nodes[node]
96
+ text = node_data['text']
97
+
98
+ # Extract header from text
99
+ header = text.split('\n', 1)[0].lstrip('#').strip()
100
  score = sims[i]
101
+
102
+ # Extract citation format - get source from metadata or node_data
103
+ metadata = node_data.get('metadata', {})
104
+ source = metadata.get('source') or node_data.get('source')
105
+
106
+ if not source:
107
+ continue
108
+
109
+ if 'url' in metadata: # HTML source
110
+ citation_ref = metadata['url']
111
+ cite_type = 'html'
112
+ elif 'path' in metadata: # PDF source
113
+ citation_ref = metadata['path']
114
+ cite_type = 'pdf'
115
+ elif 'url' in node_data: # Legacy format
116
+ citation_ref = node_data['url']
117
+ cite_type = 'html'
118
+ elif 'path' in node_data: # Legacy format
119
+ citation_ref = node_data['path']
120
+ cite_type = 'pdf'
121
+ else:
122
+ citation_ref = source
123
+ cite_type = 'unknown'
124
+
125
+ chunks.append({
126
+ 'header': header,
127
+ 'score': score,
128
+ 'text': text,
129
+ 'citation': citation_ref
130
+ })
131
+
132
+ # Add unique citation
133
+ if source not in sources_seen:
134
+ citation_entry = {
135
+ 'source': source,
136
+ 'type': cite_type,
137
+ 'relevance_score': round(float(score), 3)
138
+ }
139
+
140
+ if cite_type == 'html':
141
+ citation_entry['url'] = citation_ref
142
+ elif cite_type == 'pdf':
143
+ citation_entry['path'] = citation_ref
144
+
145
+ citations.append(citation_entry)
146
+ sources_seen.add(source)
147
+
148
+ # Handle image if provided
149
+ image_context = ""
150
+ if image_path:
151
+ try:
152
+ # Classify the image
153
+ classification = classify_image(image_path)
154
+ image_context = f"\n\n[Image Context: The provided image appears to be a {classification}.]"
155
+
156
+ # Optionally, find related nodes in graph based on image classification
157
+ # This would require storing image-related metadata in the graph
158
+
159
+ except Exception as e:
160
+ print(f"Error processing image: {e}")
161
+
162
+ # Assemble context for prompt
163
+ context = "\n\n---\n\n".join([c['text'] for c in chunks])
164
+
165
+ prompt = f"""Use the following context to answer the question:
166
 
167
+ {context}{image_context}
168
+
169
+ Question: {question}
170
+
171
+ Please provide a comprehensive answer based on the context provided. Cite specific sources when providing information."""
172
+
173
+ # For GPT-5, temperature must be default (1.0)
174
  chat_resp = client.chat.completions.create(
175
+ model=OPENAI_CHAT_MODEL,
176
  messages=[
177
+ {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety. Always provide accurate information based on the given context."},
178
+ {"role": "user", "content": prompt}
179
+ ],
180
+ max_completion_tokens=DEFAULT_MAX_TOKENS
181
  )
182
+
183
  answer = chat_resp.choices[0].message.content
184
+
185
+ return answer, citations
186
 
 
187
 
188
+ def query_with_graph_traversal(question: str, top_k: int = 5, max_hops: int = 2) -> Tuple[str, List[dict]]:
189
  """
190
+ Enhanced graph query that can traverse edges to find related information.
191
+
192
+ Args:
193
+ question: User's question
194
+ top_k: Number of initial nodes to retrieve
195
+ max_hops: Maximum graph traversal depth
196
+
197
+ Returns:
198
+ Tuple of (answer, citations)
199
  """
200
+
201
+ # Load graph if not already loaded
202
+ _load_graph()
203
+
204
+ if len(_enodes) == 0:
205
+ return "Graph database is empty. Please run preprocess.py first.", []
206
+
207
+ # Get initial nodes using standard query
208
+ initial_answer, initial_citations = query(question, top_k=top_k)
209
+
210
+ # For a more sophisticated implementation, you would:
211
+ # 1. Add edges between related nodes during preprocessing
212
+ # 2. Traverse from initial nodes to find related content
213
+ # 3. Score the related nodes based on path distance and relevance
214
+
215
+ # For now, return the standard query results
216
+ return initial_answer, initial_citations
217
+
218
+
219
+ def query_subgraph(question: str, source_filter: str = None, top_k: int = 5) -> Tuple[str, List[dict]]:
220
+ """
221
+ Query a specific subgraph filtered by source.
222
+
223
+ Args:
224
+ question: User's question
225
+ source_filter: Filter nodes by source (e.g., specific PDF name)
226
+ top_k: Number of relevant chunks to retrieve
227
+
228
+ Returns:
229
+ Tuple of (answer, citations)
230
+ """
231
+
232
+ # Load graph if not already loaded
233
+ _load_graph()
234
+
235
+ # Filter nodes if source specified
236
+ if source_filter:
237
+ filtered_nodes = []
238
+ for n in _enodes:
239
+ node_data = _graph.nodes[n]
240
+ metadata = node_data.get('metadata', {})
241
+ source = metadata.get('source') or node_data.get('source', '')
242
+ source_from_meta = metadata.get('source', '')
243
+
244
+ # Check both direct source and metadata source
245
+ if (source_filter.lower() in source.lower() or
246
+ source_filter.lower() in source_from_meta.lower()):
247
+ filtered_nodes.append(n)
248
+
249
+ if not filtered_nodes:
250
+ return f"No nodes found for source: {source_filter}", []
251
+ else:
252
+ filtered_nodes = _enodes
253
+
254
+ # Get embeddings for filtered nodes
255
+ filtered_embeddings = np.array([_graph.nodes[n]['embedding'] for n in filtered_nodes])
256
+
257
+ # Embed question
258
  emb_resp = client.embeddings.create(
259
+ model=OPENAI_EMBEDDING_MODEL,
260
  input=question
261
  )
262
+ q_vec = np.array(emb_resp.data[0].embedding)
263
+
264
+ # Compute similarities
265
+ sims = cosine_similarity([q_vec], filtered_embeddings)[0]
266
  idxs = sims.argsort()[::-1][:top_k]
267
+
268
+ # Collect results
269
  chunks = []
270
+ citations = []
271
+ sources_seen = set()
272
+
273
  for i in idxs:
274
+ if i < len(filtered_nodes):
275
+ node = filtered_nodes[i]
276
+ node_data = _graph.nodes[node]
277
+
278
+ chunks.append(node_data['text'])
279
+
280
+ # Skip if source information missing
281
+ metadata = node_data.get('metadata', {})
282
+ source = metadata.get('source') or node_data.get('source')
283
+
284
+ if not source:
285
+ continue
286
+
287
+ if source not in sources_seen:
288
+ citation = {
289
+ 'source': source,
290
+ 'type': 'pdf' if ('path' in metadata or 'path' in node_data) else 'html',
291
+ 'relevance_score': round(float(sims[i]), 3)
292
+ }
293
+
294
+ # Check metadata first, then node_data for legacy support
295
+ if 'url' in metadata:
296
+ citation['url'] = metadata['url']
297
+ elif 'path' in metadata:
298
+ citation['path'] = metadata['path']
299
+ elif 'url' in node_data:
300
+ citation['url'] = node_data['url']
301
+ elif 'path' in node_data:
302
+ citation['path'] = node_data['path']
303
+
304
+ citations.append(citation)
305
+ sources_seen.add(source)
306
+
307
+ # Build context and generate answer
308
+ context = "\n\n---\n\n".join(chunks)
309
+
310
+ prompt = f"""Answer the following question using the provided context:
311
 
312
+ Context from {source_filter if source_filter else 'all sources'}:
313
+ {context}
314
+
315
+ Question: {question}
316
+
317
+ Provide a detailed answer based on the context."""
318
+
319
+ # For GPT-5, temperature must be default (1.0)
320
+ response = client.chat.completions.create(
321
+ model=OPENAI_CHAT_MODEL,
322
  messages=[
323
+ {"role": "system", "content": "You are an expert on manufacturing safety. Answer based on the provided context."},
324
+ {"role": "user", "content": prompt}
325
+ ],
326
+ max_completion_tokens=DEFAULT_MAX_TOKENS
327
  )
328
+
329
+ answer = response.choices[0].message.content
330
+
331
+ return answer, citations
332
 
333
+
334
+ # Maintain backward compatibility with original function signature
335
+ def query_graph(question: str, top_k: int = 5) -> Tuple[str, List[str], List[tuple]]:
336
+ """
337
+ Original query_graph function signature for backward compatibility.
338
+
339
+ Args:
340
+ question: User's question
341
+ top_k: Number of relevant chunks to retrieve
342
+
343
+ Returns:
344
+ Tuple of (answer, sources, chunks)
345
+ """
346
+
347
+ # Call the new query function
348
+ answer, citations = query(question, top_k=top_k)
349
+
350
+ # Convert citations to old format
351
+ sources = [c['source'] for c in citations]
352
+
353
+ # Get chunks in old format (header, score, text, citation)
354
+ _load_graph()
355
+
356
+ if len(_enodes) == 0:
357
+ return answer, sources, []
358
+
359
+ # Regenerate chunks for backward compatibility
360
+ emb_resp = client.embeddings.create(
361
+ model=OPENAI_EMBEDDING_MODEL,
362
+ input=question
363
+ )
364
+ q_vec = np.array(emb_resp.data[0].embedding)
365
+
366
+ sims = cosine_similarity([q_vec], _embeddings)[0]
367
+ idxs = sims.argsort()[::-1][:top_k]
368
+
369
+ chunks = []
370
+ for i in idxs:
371
+ node = _enodes[i]
372
+ node_data = _graph.nodes[node]
373
+ text = node_data['text']
374
+ header = text.split('\n', 1)[0].lstrip('#').strip()
375
+ score = sims[i]
376
+
377
+ # Skip if source information missing
378
+ metadata = node_data.get('metadata', {})
379
+ source = metadata.get('source') or node_data.get('source')
380
+
381
+ if not source:
382
+ continue
383
+
384
+ if 'url' in metadata:
385
+ citation = metadata['url']
386
+ elif 'path' in metadata:
387
+ citation = metadata['path']
388
+ elif 'url' in node_data:
389
+ citation = node_data['url']
390
+ elif 'path' in node_data:
391
+ citation = node_data['path']
392
+ else:
393
+ citation = source
394
+
395
+ chunks.append((header, score, text, citation))
396
+
397
  return answer, sources, chunks
398
 
399
 
400
+ if __name__ == "__main__":
401
+ # Test the updated graph query
402
+ test_questions = [
403
+ "What are general machine guarding requirements?",
404
+ "How do I perform lockout/tagout procedures?",
405
+ "What safety measures are needed for robotic systems?"
406
+ ]
407
+
408
+ for q in test_questions:
409
+ print(f"\nQuestion: {q}")
410
+ answer, citations = query(q)
411
+ print(f"Answer: {answer[:200]}...")
412
+ print(f"Citations: {[c['source'] for c in citations]}")
413
+ print("-" * 50)
 
 
 
query_vanilla.py ADDED
@@ -0,0 +1,197 @@
1
+ """
2
+ Vanilla vector search using FAISS index and OpenAI embeddings.
3
+ """
4
+
5
+ import numpy as np
6
+ import faiss
7
+ from typing import Tuple, List, Optional
8
+ from openai import OpenAI
9
+
10
+ import pickle
11
+ import logging
12
+ from config import *
13
+ from utils import EmbeddingGenerator, classify_image
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Initialize OpenAI client
18
+ client = OpenAI(api_key=OPENAI_API_KEY)
19
+
20
+ # Global variables for lazy loading
21
+ _index = None
22
+ _texts = None
23
+ _metadata = None
24
+
25
+ def _load_vanilla_index():
26
+ """Lazy load vanilla FAISS index and metadata."""
27
+ global _index, _texts, _metadata
28
+
29
+ if _index is None:
30
+ try:
31
+ if VANILLA_FAISS_INDEX.exists() and VANILLA_METADATA.exists():
32
+ logger.info("Loading vanilla FAISS index...")
33
+
34
+ # Load FAISS index
35
+ _index = faiss.read_index(str(VANILLA_FAISS_INDEX))
36
+
37
+ # Load metadata
38
+ with open(VANILLA_METADATA, 'rb') as f:
39
+ data = pickle.load(f)
40
+
41
+ if isinstance(data, list):
42
+ # New format with metadata list
43
+ _texts = [item['text'] for item in data]
44
+ _metadata = [item['metadata'] for item in data]
45
+ else:
46
+ # Old format with dict
47
+ _texts = data.get('texts', [])
48
+ _metadata = data.get('metadata', [])
49
+
50
+ logger.info(f"✓ Loaded vanilla index with {len(_texts)} documents")
51
+ else:
52
+ logger.warning("Vanilla index not found. Run preprocess.py first.")
53
+ _index = None
54
+ _texts = []
55
+ _metadata = []
56
+
57
+ except Exception as e:
58
+ logger.error(f"Error loading vanilla index: {e}")
59
+ _index = None
60
+ _texts = []
61
+ _metadata = []
62
+
63
+
64
+ def query(question: str, image_path: Optional[str] = None, top_k: int = None) -> Tuple[str, List[dict]]:
65
+ """
66
+ Query using vanilla vector search.
67
+
68
+ Args:
69
+ question: User's question
70
+ image_path: Optional path to an image (for multimodal queries)
71
+ top_k: Number of relevant chunks to retrieve
72
+
73
+ Returns:
74
+ Tuple of (answer, citations)
75
+ """
76
+ if top_k is None:
77
+ top_k = DEFAULT_TOP_K
78
+
79
+ # Load index if not already loaded
80
+ _load_vanilla_index()
81
+
82
+ if _index is None or len(_texts) == 0:
83
+ return "Index not loaded. Please run preprocess.py first.", []
84
+
85
+ # Generate query embedding using embedding generator
86
+ embedding_gen = EmbeddingGenerator()
87
+ query_embedding = embedding_gen.embed_text_openai([question])
88
+
89
+ # Normalize for cosine similarity
90
+ query_embedding = query_embedding.astype(np.float32)
91
+ faiss.normalize_L2(query_embedding)
92
+
93
+ # Search the index
94
+ distances, indices = _index.search(query_embedding, top_k)
95
+
96
+ # Collect retrieved chunks and citations
97
+ retrieved_chunks = []
98
+ citations = []
99
+ sources_seen = set()
100
+
101
+ for idx, distance in zip(indices[0], distances[0]):
102
+ if 0 <= idx < len(_texts) and distance > MIN_RELEVANCE_SCORE:  # FAISS pads missing results with -1
103
+ chunk_text = _texts[idx]
104
+ chunk_meta = _metadata[idx]
105
+
106
+ retrieved_chunks.append({
107
+ 'text': chunk_text,
108
+ 'score': float(distance),
109
+ 'metadata': chunk_meta
110
+ })
111
+
112
+ # Build citation
113
+ if chunk_meta['source'] not in sources_seen:
114
+ citation = {
115
+ 'source': chunk_meta['source'],
116
+ 'type': chunk_meta['type'],
117
+ 'relevance_score': round(float(distance), 3)
118
+ }
119
+
120
+ if chunk_meta['type'] == 'pdf':
121
+ citation['path'] = chunk_meta['path']
122
+ else: # HTML
123
+ citation['url'] = chunk_meta.get('url', '')
124
+
125
+ citations.append(citation)
126
+ sources_seen.add(chunk_meta['source'])
127
+
128
+ # Handle image if provided
129
+ image_context = ""
130
+ if image_path:
131
+ try:
132
+ classification = classify_image(image_path)
133
+ image_context = f"\n\n[Image Context: The provided image appears to be a {classification}.]"
134
+ except Exception as e:
135
+ logger.error(f"Error processing image: {e}")
136
+
137
+ # Build context for the prompt
138
+ context = "\n\n---\n\n".join([chunk['text'] for chunk in retrieved_chunks])
139
+
140
+ if not context:
141
+ return "No relevant documents found for your query.", []
142
+
143
+ # Generate answer using OpenAI
144
+ prompt = f"""Use the following context to answer the question:
145
+
146
+ {context}{image_context}
147
+
148
+ Question: {question}
149
+
150
+ Please provide a comprehensive answer based on the context provided. If the context doesn't contain enough information, say so."""
151
+
152
+ # For GPT-5, temperature must be default (1.0)
153
+ response = client.chat.completions.create(
154
+ model=OPENAI_CHAT_MODEL,
155
+ messages=[
156
+ {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety. Always cite your sources when providing information."},
157
+ {"role": "user", "content": prompt}
158
+ ],
159
+ max_completion_tokens=DEFAULT_MAX_TOKENS
160
+ )
161
+
162
+ answer = response.choices[0].message.content
163
+
164
+ return answer, citations
165
+
166
+
167
+ def query_with_feedback(question: str, feedback_scores: List[float] = None, top_k: int = 5) -> Tuple[str, List[dict]]:
168
+ """
169
+ Query with relevance feedback to refine results.
170
+
171
+ Args:
172
+ question: User's question
173
+ feedback_scores: Optional relevance scores for previous results
174
+ top_k: Number of relevant chunks to retrieve
175
+
176
+ Returns:
177
+ Tuple of (answer, citations)
178
+ """
179
+ # For now, just use regular query
180
+ # TODO: Implement Rocchio algorithm or similar for relevance feedback
181
+ return query(question, top_k=top_k)
182
+
183
+
184
+ if __name__ == "__main__":
185
+ # Test the vanilla query
186
+ test_questions = [
187
+ "What are general machine guarding requirements?",
188
+ "How do I perform lockout/tagout procedures?",
189
+ "What safety measures are needed for robotic systems?"
190
+ ]
191
+
192
+ for q in test_questions:
193
+ print(f"\nQuestion: {q}")
194
+ answer, citations = query(q)
195
+ print(f"Answer: {answer[:200]}...")
196
+ print(f"Citations: {[c['source'] for c in citations]}")
197
+ print("-" * 50)
query_vision.py ADDED
@@ -0,0 +1,393 @@
1
+ """
2
+ Vision-based query module using GPT-5 Vision.
3
+ Supports multimodal queries combining text and images.
4
+ """
5
+
6
+ import base64
7
+ import json
8
+ import logging
9
+ import sqlite3
10
+ from typing import List, Tuple, Optional, Dict, Any
11
+ import numpy as np
12
+ from PIL import Image
13
+ from openai import OpenAI
14
+
15
+ from config import *
16
+ from utils import ImageProcessor, classify_image
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ class VisionRetriever:
21
+ """Vision-based retrieval using GPT-5 Vision for image analysis and classification."""
22
+
23
+ def __init__(self):
24
+ self.client = OpenAI(api_key=OPENAI_API_KEY)
25
+ self.image_processor = ImageProcessor()
26
+
27
+ def get_similar_images(self, query_image_path: str, top_k: int = 5) -> List[Dict[str, Any]]:
28
+ """Find similar images in the database based on classification similarity."""
29
+ try:
30
+ # Uses GPT-5 Vision for classification-based similarity search
31
+ # Note: This implementation uses classification similarity rather than embeddings
32
+
33
+ # Classify the query image
34
+ query_classification = classify_image(query_image_path)
35
+
36
+ # Query database for similar images
37
+ conn = sqlite3.connect(IMAGES_DB)
38
+ cursor = conn.cursor()
39
+
40
+ # Search for images with similar classification
41
+ cursor.execute("""
42
+ SELECT image_id, image_path, classification, metadata
43
+ FROM images
44
+ WHERE classification LIKE ?
45
+ ORDER BY created_at DESC
46
+ LIMIT ?
47
+ """, (f"%{query_classification}%", top_k))
48
+
49
+ results = cursor.fetchall()
50
+ conn.close()
51
+
52
+ similar_images = []
53
+ for row in results:
54
+ image_id, image_path, classification, metadata_json = row
55
+ metadata = json.loads(metadata_json) if metadata_json else {}
56
+
57
+ similar_images.append({
58
+ 'image_id': image_id,
59
+ 'image_path': image_path,
60
+ 'classification': classification,
61
+ 'metadata': metadata,
62
+ 'similarity_score': 0.8  # fixed placeholder; matches are based on classification text, not embeddings
63
+ })
64
+
65
+ logger.info(f"Found {len(similar_images)} similar images for query")
66
+ return similar_images
67
+
68
+ except Exception as e:
69
+ logger.error(f"Error finding similar images: {e}")
70
+ return []
71
+
72
+ def analyze_image_safety(self, image_path: str, question: str = None) -> str:
73
+ """Analyze image for safety concerns using GPT-5 Vision."""
74
+ try:
75
+ # Convert image to base64
76
+ with open(image_path, "rb") as image_file:
77
+ image_b64 = base64.b64encode(image_file.read()).decode()
78
+
79
+ # Create analysis prompt
80
+ if question:
81
+ analysis_prompt = (
82
+ f"Analyze this image in the context of the following question: {question}\n\n"
83
+ "Please provide a detailed safety analysis covering:\n"
84
+ "1. What equipment, machinery, or workplace elements are visible\n"
85
+ "2. Any potential safety hazards or compliance issues\n"
86
+ "3. Relevant OSHA standards or regulations that may apply\n"
87
+ "4. Recommendations for safety improvements\n"
88
+ "5. How this relates to the specific question asked"
89
+ )
90
+ else:
91
+ analysis_prompt = (
92
+ "Analyze this image for occupational safety and health concerns. Provide:\n"
93
+ "1. Description of what's shown in the image\n"
94
+ "2. Identification of potential safety hazards\n"
95
+ "3. Relevant OSHA standards or safety regulations\n"
96
+ "4. Recommendations for improving safety"
97
+ )
98
+
99
+ messages = [{
100
+ "role": "user",
101
+ "content": [
102
+ {"type": "text", "text": analysis_prompt},
103
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}", "detail": "high"}}
104
+ ]
105
+ }]
106
+
107
+ # For GPT-5 vision, temperature must be default (1.0) and reasoning is not supported
108
+ response = self.client.chat.completions.create(
109
+ model=OPENAI_CHAT_MODEL,
110
+ messages=messages,
111
+ max_completion_tokens=DEFAULT_MAX_TOKENS
112
+ )
113
+
114
+ return response.choices[0].message.content.strip()
115
+
116
+ except Exception as e:
117
+ logger.error(f"Error analyzing image: {e}")
118
+ return f"I encountered an error while analyzing the image: {e}"
119
+
120
+ def retrieve_relevant_text(self, image_classification: str, question: str, top_k: int = 3) -> List[Dict[str, Any]]:
121
+ """Retrieve text documents relevant to the image classification and question."""
122
+ # This would integrate with other retrieval methods to find relevant text
123
+ # For now, reuse the vanilla vector retriever with an enhanced query string
124
+
125
+ try:
126
+ # Import other query modules for text retrieval
127
+ from query_vanilla import query as vanilla_query
128
+
129
+ # Create an enhanced query combining image classification and original question
130
+ enhanced_question = f"safety requirements for {image_classification} {question}"
131
+
132
+ # Use vanilla retrieval to find relevant text
133
+ _, text_citations = vanilla_query(enhanced_question, top_k=top_k)
134
+
135
+ return text_citations
136
+
137
+ except Exception as e:
138
+ logger.error(f"Error retrieving relevant text: {e}")
139
+ return []
140
+
141
+ def generate_multimodal_answer(self, question: str, image_analysis: str,
142
+ text_citations: List[Dict], similar_images: List[Dict]) -> str:
143
+ """Generate answer combining image analysis and text retrieval."""
144
+ try:
145
+ # Prepare context from text citations
146
+ text_context = ""
147
+ if text_citations:
148
+ text_parts = []
149
+ for i, citation in enumerate(text_citations, 1):
150
+ if 'text' in citation:
151
+ text_parts.append(f"[Text Source {i}] {citation['source']}: {citation['text'][:500]}...")
152
+ else:
153
+ text_parts.append(f"[Text Source {i}] {citation['source']}")
154
+ text_context = "\n\n".join(text_parts)
155
+
156
+ # Prepare context from similar images
157
+ image_context = ""
158
+ if similar_images:
159
+ image_parts = []
160
+ for img in similar_images[:3]: # Limit to top 3
161
+ source = img['metadata'].get('source', 'Unknown')
162
+ classification = img.get('classification', 'unknown')
163
+ image_parts.append(f"Similar image from {source}: classified as {classification}")
164
+ image_context = "\n".join(image_parts)
165
+
166
+ # Create comprehensive prompt
167
+ system_message = (
168
+ "You are an expert in occupational safety and health. "
169
+ "You have been provided with an image analysis, relevant text documents, "
170
+ "and information about similar images in the database. "
171
+ "Provide a comprehensive answer that integrates all this information."
172
+ )
173
+
174
+ user_message = f"""Question: {question}
175
+
176
+ Image Analysis:
177
+ {image_analysis}
178
+
179
+ Relevant Text Documentation:
180
+ {text_context}
181
+
182
+ Similar Images Context:
183
+ {image_context}
184
+
185
+ Please provide a comprehensive answer that:
186
+ 1. Addresses the specific question asked
187
+ 2. Incorporates insights from the image analysis
188
+ 3. References relevant regulatory information from the text sources
189
+ 4. Notes any connections to similar cases or images
190
+ 5. Provides actionable recommendations based on safety standards"""
191
+
192
+ # For GPT-5, temperature must be default (1.0) and reasoning is not supported
193
+ response = self.client.chat.completions.create(
194
+ model=OPENAI_CHAT_MODEL,
195
+ messages=[
196
+ {"role": "system", "content": system_message},
197
+ {"role": "user", "content": user_message}
198
+ ],
199
+ max_completion_tokens=DEFAULT_MAX_TOKENS * 2 # Allow longer response for comprehensive analysis
200
+ )
201
+
202
+ return response.choices[0].message.content.strip()
203
+
204
+ except Exception as e:
205
+ logger.error(f"Error generating multimodal answer: {e}")
206
+ return "I apologize, but I encountered an error while generating the comprehensive answer."
207
+
208
+ # Global retriever instance
209
+ _retriever = None
210
+
211
+ def get_retriever() -> VisionRetriever:
212
+ """Get or create global vision retriever instance."""
213
+ global _retriever
214
+ if _retriever is None:
215
+ _retriever = VisionRetriever()
216
+ return _retriever
217
+
218
+ def query(question: str, image_path: Optional[str] = None, top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict]]:
219
+ """
220
+ Main vision-based query function with unified signature.
221
+
222
+ Args:
223
+ question: User question
224
+ image_path: Path to image file (required for vision queries)
225
+ top_k: Number of relevant results to retrieve
226
+
227
+ Returns:
228
+ Tuple of (answer, citations)
229
+ """
230
+ if not image_path:
231
+ return "Vision queries require an image. Please provide an image file.", []
232
+
233
+ try:
234
+ retriever = get_retriever()
235
+
236
+ # Step 1: Analyze the provided image
237
+ logger.info(f"Analyzing image: {image_path}")
238
+ image_analysis = retriever.analyze_image_safety(image_path, question)
239
+
240
+ # Step 2: Classify the image
241
+ image_classification = classify_image(image_path)
242
+
243
+ # Step 3: Find similar images
244
+ similar_images = retriever.get_similar_images(image_path, top_k=3)
245
+
246
+ # Step 4: Retrieve relevant text documents
247
+ text_citations = retriever.retrieve_relevant_text(image_classification, question, top_k)
248
+
249
+ # Step 5: Generate comprehensive multimodal answer
250
+ answer = retriever.generate_multimodal_answer(
251
+ question, image_analysis, text_citations, similar_images
252
+ )
253
+
254
+ # Step 6: Prepare citations
255
+ citations = []
256
+
257
+ # Add image analysis as primary citation
258
+ citations.append({
259
+ 'rank': 1,
260
+ 'type': 'image_analysis',
261
+ 'source': f"Analysis of {image_path.split('/')[-1] if '/' in image_path else image_path.split('\\')[-1]}",
262
+ 'method': 'vision',
263
+ 'classification': image_classification,
264
+ 'score': 1.0
265
+ })
266
+
267
+ # Add text citations
268
+ for i, citation in enumerate(text_citations, 2):
269
+ citation_copy = citation.copy()
270
+ citation_copy['rank'] = i
271
+ citation_copy['method'] = 'vision_text'
272
+ citations.append(citation_copy)
273
+
274
+ # Add similar images
275
+ for i, img in enumerate(similar_images):
276
+ citations.append({
277
+ 'rank': len(citations) + 1,
278
+ 'type': 'similar_image',
279
+ 'source': img['metadata'].get('source', 'Image Database'),
280
+ 'method': 'vision',
281
+ 'classification': img.get('classification', 'unknown'),
282
+ 'similarity_score': img.get('similarity_score', 0.0),
283
+ 'image_id': img.get('image_id')
284
+ })
285
+
286
+ logger.info(f"Vision query completed. Generated {len(citations)} citations.")
287
+ return answer, citations
288
+
289
+ except Exception as e:
290
+ logger.error(f"Error in vision query: {e}")
291
+ error_message = "I apologize, but I encountered an error while processing your vision-based question."
292
+ return error_message, []
293
+
294
+ def query_image_only(image_path: str, question: str = None) -> Tuple[str, List[Dict]]:
295
+ """
296
+ Analyze image without text retrieval (faster for simple image analysis).
297
+
298
+ Args:
299
+ image_path: Path to image file
300
+ question: Optional specific question about the image
301
+
302
+ Returns:
303
+ Tuple of (analysis, citations)
304
+ """
305
+ try:
306
+ retriever = get_retriever()
307
+
308
+ # Analyze image
309
+ analysis = retriever.analyze_image_safety(image_path, question)
310
+
311
+ # Classify image
312
+ classification = classify_image(image_path)
313
+
314
+ # Create citation for image analysis
315
+ citations = [{
316
+ 'rank': 1,
317
+ 'type': 'image_analysis',
318
+ 'source': f"Analysis of {image_path.split('/')[-1] if '/' in image_path else image_path.split('\\')[-1]}",
319
+ 'method': 'vision_only',
320
+ 'classification': classification,
321
+ 'score': 1.0
322
+ }]
323
+
324
+ return analysis, citations
325
+
326
+ except Exception as e:
327
+ logger.error(f"Error in image-only analysis: {e}")
328
+ return "Error analyzing image.", []
329
+
330
+ def query_with_details(question: str, image_path: Optional[str] = None,
331
+ top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict], List[Tuple]]:
332
+ """
333
+ Vision query function that returns detailed chunk information (for compatibility).
334
+
335
+ Returns:
336
+ Tuple of (answer, citations, chunks)
337
+ """
338
+ answer, citations = query(question, image_path, top_k)
339
+
340
+ # Convert citations to chunk format for backward compatibility
341
+ chunks = []
342
+ for citation in citations:
343
+ if citation['type'] == 'image_analysis':
344
+ chunks.append((
345
+ f"Image Analysis ({citation['classification']})",
346
+ citation['score'],
347
+ "Analysis of uploaded image for safety compliance",
348
+ citation['source']
349
+ ))
350
+ elif citation['type'] == 'similar_image':
351
+ chunks.append((
352
+ f"Similar Image (Score: {citation.get('similarity_score', 0):.3f})",
353
+ citation.get('similarity_score', 0),
354
+ f"Similar image classified as {citation['classification']}",
355
+ citation['source']
356
+ ))
357
+ else:
358
+ chunks.append((
359
+ f"Text Reference {citation['rank']}",
360
+ citation.get('score', 0.5),
361
+ citation.get('text', 'Referenced document'),
362
+ citation['source']
363
+ ))
364
+
365
+ return answer, citations, chunks
366
+
367
+ if __name__ == "__main__":
368
+ # Test the vision system (requires an actual image file)
369
+ import sys
370
+
371
+ if len(sys.argv) > 1:
372
+ test_image_path = sys.argv[1]
373
+ test_question = "What safety issues can you identify in this image?"
374
+
375
+ print("Testing vision retrieval system...")
376
+ print(f"Image: {test_image_path}")
377
+ print(f"Question: {test_question}")
378
+ print("-" * 50)
379
+
380
+ try:
381
+ answer, citations = query(test_question, test_image_path)
382
+
383
+ print("Answer:")
384
+ print(answer)
385
+ print(f"\nCitations ({len(citations)}):")
386
+ for citation in citations:
387
+ print(f"- {citation['source']} (Type: {citation.get('type', 'unknown')})")
388
+
389
+ except Exception as e:
390
+ print(f"Error during testing: {e}")
391
+ else:
392
+ print("To test vision system, provide an image path as argument:")
393
+ print("python query_vision.py /path/to/image.jpg")
realtime_server.py ADDED
@@ -0,0 +1,402 @@
1
+ """
2
+ FastAPI server for OpenAI Realtime API integration with RAG system.
3
+ Provides endpoints for session management and RAG tool calls.
4
+
5
+ Directory structure:
6
+ /data/ # Original PDFs, HTML
7
+ /embeddings/ # FAISS, Chroma, DPR vector stores
8
+ /graph/ # Graph database files
9
+ /metadata/ # Image metadata (SQLite or MongoDB)
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import os
15
+ import time
16
+ from typing import Dict, Any, Optional
17
+ from fastapi import FastAPI, HTTPException, Request, Response, status
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from fastapi.responses import JSONResponse
20
+ from fastapi.exceptions import RequestValidationError
21
+ from starlette.exceptions import HTTPException as StarletteHTTPException
22
+ from pydantic import BaseModel
23
+ import uvicorn
24
+ from openai import OpenAI
25
+
26
+ # Import all query modules
27
+ from query_graph import query as graph_query
28
+ from query_vanilla import query as vanilla_query
29
+ from query_dpr import query as dpr_query
30
+ from query_bm25 import query as bm25_query
31
+ from query_context import query as context_query
32
+ from query_vision import query as vision_query
33
+
34
+ from config import OPENAI_API_KEY, OPENAI_CHAT_MODEL, OPENAI_REALTIME_MODEL, REALTIME_VOICE, REALTIME_INSTRUCTIONS, DEFAULT_METHOD
35
+ from analytics_db import log_query
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # Initialize FastAPI app
40
+ app = FastAPI(title="SIGHT Realtime API Server", version="1.0.0")
41
+
42
+ # CORS middleware for frontend integration
43
+ app.add_middleware(
44
+ CORSMiddleware,
45
+ allow_origins=["*"], # In production, restrict to your domain
46
+ allow_credentials=True,
47
+ allow_methods=["*"],
48
+ allow_headers=["*"],
49
+ )
50
+
51
+ @app.middleware("http")
52
+ async def log_requests(request: Request, call_next):
53
+ """Log all incoming requests for debugging."""
54
+ logger.info(f"Incoming request: {request.method} {request.url}")
55
+ try:
56
+ response = await call_next(request)
57
+ logger.info(f"Response status: {response.status_code}")
58
+ return response
59
+ except Exception as e:
60
+ logger.error(f"Request processing error: {e}")
61
+ return JSONResponse(
62
+ content={"error": "Internal server error"},
63
+ status_code=500
64
+ )
65
+
66
+ # Exception handlers
67
+ @app.exception_handler(RequestValidationError)
68
+ async def validation_exception_handler(request: Request, exc: RequestValidationError):
69
+ logger.warning(f"Validation error for {request.url}: {exc}")
70
+ return JSONResponse(
71
+ status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
72
+ content={"error": "Invalid request format", "details": str(exc)}
73
+ )
74
+
75
+ @app.exception_handler(StarletteHTTPException)
76
+ async def http_exception_handler(request: Request, exc: StarletteHTTPException):
77
+ logger.warning(f"HTTP error for {request.url}: {exc.status_code} - {exc.detail}")
78
+ return JSONResponse(
79
+ status_code=exc.status_code,
80
+ content={"error": exc.detail}
81
+ )
82
+
83
+ @app.exception_handler(Exception)
84
+ async def general_exception_handler(request: Request, exc: Exception):
85
+ logger.error(f"Unhandled error for {request.url}: {exc}")
86
+ return JSONResponse(
87
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
88
+ content={"error": "Internal server error"}
89
+ )
90
+
91
+ # Initialize OpenAI client
92
+ client = OpenAI(api_key=OPENAI_API_KEY)
93
+
94
+ # Query method dispatch
95
+ QUERY_DISPATCH = {
96
+ 'graph': graph_query,
97
+ 'vanilla': vanilla_query,
98
+ 'dpr': dpr_query,
99
+ 'bm25': bm25_query,
100
+ 'context': context_query,
101
+ 'vision': vision_query
102
+ }
103
+
104
+ # Use configuration from config.py with environment variable overrides
105
+ REALTIME_MODEL = os.getenv("REALTIME_MODEL", OPENAI_REALTIME_MODEL)
106
+ VOICE = os.getenv("REALTIME_VOICE", REALTIME_VOICE)
107
+ INSTRUCTIONS = os.getenv("REALTIME_INSTRUCTIONS", REALTIME_INSTRUCTIONS)
108
+
109
+ # Pydantic models for request/response
110
+ class SessionRequest(BaseModel):
111
+ """Request model for creating ephemeral sessions."""
112
+ model: Optional[str] = None  # when omitted, falls back to REALTIME_MODEL from config (env-overridable)
113
+ instructions: Optional[str] = None
114
+ voice: Optional[str] = None
115
+
116
+ class RAGRequest(BaseModel):
117
+ """Request model for RAG queries."""
118
+ query: str
119
+ method: str = "graph"
120
+ top_k: int = 5
121
+ image_path: Optional[str] = None
122
+
123
+ class RAGResponse(BaseModel):
124
+ """Response model for RAG queries."""
125
+ answer: str
126
+ citations: list
127
+ method: str
128
+ citations_html: Optional[str] = None
129
+
130
+ @app.post("/session")
131
+ async def create_ephemeral_session(request: SessionRequest) -> JSONResponse:
132
+ """
133
+ Create an ephemeral session token for OpenAI Realtime API.
134
+ This token will be used by the frontend WebRTC client.
135
+ """
136
+ try:
137
+ logger.info(f"Creating ephemeral session with model: {request.model or REALTIME_MODEL}")
138
+
139
+ # Create ephemeral token using direct HTTP call to OpenAI API
140
+ # Since the Python SDK doesn't support realtime sessions yet
141
+ import requests
142
+
143
+ session_data = {
144
+ "model": request.model or REALTIME_MODEL,
145
+ "voice": request.voice or VOICE,
146
+ "modalities": ["audio", "text"],
147
+ "instructions": request.instructions or INSTRUCTIONS,
148
+ }
149
+
150
+ headers = {
151
+ "Authorization": f"Bearer {OPENAI_API_KEY}",
152
+ "Content-Type": "application/json"
153
+ }
154
+
155
+ # Make direct HTTP request to OpenAI's realtime sessions endpoint
156
+ response = requests.post(
157
+ "https://api.openai.com/v1/realtime/sessions",
158
+ json=session_data,
159
+ headers=headers,
160
+ timeout=30
161
+ )
162
+
163
+ if response.status_code == 200:
164
+ session_result = response.json()
165
+
166
+ response_data = {
167
+ "client_secret": session_result.get("client_secret", {}).get("value") or session_result.get("client_secret"),
168
+ "model": request.model or REALTIME_MODEL,
169
+ "session_id": session_result.get("id")
170
+ }
171
+
172
+ logger.info("Ephemeral session created successfully")
173
+ return JSONResponse(content=response_data, status_code=200)
174
+ else:
175
+ logger.error(f"OpenAI API error: {response.status_code} - {response.text}")
176
+ return JSONResponse(
177
+ content={"error": f"OpenAI API error: {response.status_code} - {response.text}"},
178
+ status_code=response.status_code
179
+ )
180
+
181
+ except requests.exceptions.RequestException as e:
182
+ logger.error(f"Network error creating ephemeral session: {e}")
183
+ return JSONResponse(
184
+ content={"error": f"Network error: {str(e)}"},
185
+ status_code=500
186
+ )
187
+ except Exception as e:
188
+ logger.error(f"Error creating ephemeral session: {e}")
189
+ return JSONResponse(
190
+ content={"error": f"Session creation failed: {str(e)}"},
191
+ status_code=500
192
+ )
193
+
194
+ @app.post("/rag", response_model=RAGResponse)
195
+ async def rag_query(request: RAGRequest) -> RAGResponse:
196
+ """
197
+ Handle RAG queries from the realtime interface.
198
+ This endpoint is called by the JavaScript frontend when the model
199
+ requests the ask_rag function.
200
+ """
201
+ try:
202
+ logger.info(f"RAG query: {request.query} using method: {request.method}")
203
+
204
+ # Validate and default method if needed
205
+ method = request.method
206
+ if method not in QUERY_DISPATCH:
207
+ logger.warning(f"Invalid method '{method}', using default '{DEFAULT_METHOD}'")
208
+ method = DEFAULT_METHOD
209
+
210
+ # Get the appropriate query function
211
+ query_func = QUERY_DISPATCH[method]
212
+
213
+ # Execute the query
214
+ start_time = time.time()
215
+
216
+ answer, citations = query_func(
217
+ question=request.query,
218
+ image_path=request.image_path,
219
+ top_k=request.top_k
220
+ )
221
+ response_time = (time.time() - start_time) * 1000 # Convert to ms
222
+
223
+ # Format citations for HTML display (optional)
224
+ citations_html = format_citations_html(citations, method)
225
+
226
+ # Log to analytics database (mark as voice interaction)
227
+ try:
228
+ # Generate unique session ID for each voice interaction
229
+ import uuid
230
+ voice_session_id = f"voice_{uuid.uuid4().hex[:8]}"
231
+
232
+ log_query(
233
+ user_query=request.query,
234
+ method=method,
235
+ answer=answer,
236
+ citations=citations,
237
+ response_time=response_time,
238
+ image_path=request.image_path,
239
+ top_k=request.top_k,
240
+ session_id=voice_session_id,
241
+ additional_settings={'voice_interaction': True, 'interaction_type': 'speech_to_speech'}
242
+ )
243
+ logger.info(f"Voice interaction logged: {request.query[:50]}...")
244
+ except Exception as log_error:
245
+ logger.error(f"Failed to log voice query: {log_error}")
246
+
247
+ logger.info(f"RAG query completed: {len(answer)} chars, {len(citations)} citations")
248
+
249
+ return RAGResponse(
250
+ answer=answer,
251
+ citations=citations,
252
+ method=method,
253
+ citations_html=citations_html
254
+ )
255
+
256
+ except Exception as e:
257
+ logger.error(f"Error processing RAG query: {e}")
258
+ raise HTTPException(status_code=500, detail=f"RAG query failed: {str(e)}")
259
+
260
+ def format_citations_html(citations: list, method: str) -> str:
261
+ """Format citations as HTML for display."""
262
+ if not citations:
263
+ return "<p><em>No citations available</em></p>"
264
+
265
+ html_parts = ["<div style='margin-top: 1em;'><strong>Sources:</strong><ul>"]
266
+
267
+ for citation in citations:
268
+ if isinstance(citation, dict) and 'source' in citation:
269
+ source = citation['source']
270
+ cite_type = citation.get('type', 'unknown')
271
+
272
+ # Build citation text based on type
273
+ if cite_type == 'pdf':
274
+ cite_text = f"📄 {source} (PDF)"
275
+ elif cite_type == 'html':
276
+ url = citation.get('url', '')
277
+ if url:
278
+ cite_text = f"🌐 <a href='{url}' target='_blank'>{source}</a> (Web)"
279
+ else:
280
+ cite_text = f"🌐 {source} (Web)"
281
+ elif cite_type == 'image':
282
+ page = citation.get('page', 'N/A')
283
+ cite_text = f"🖼️ {source} (Image, page {page})"
284
+ else:
285
+ cite_text = f"📚 {source}"
286
+
287
+ # Add scores if available
288
+ scores = []
289
+ if 'relevance_score' in citation:
290
+ scores.append(f"relevance: {citation['relevance_score']:.3f}")
291
+ if 'score' in citation:
292
+ scores.append(f"score: {citation['score']:.3f}")
293
+
294
+ if scores:
295
+ cite_text += f" <small>({', '.join(scores)})</small>"
296
+
297
+ html_parts.append(f"<li>{cite_text}</li>")
298
+ elif isinstance(citation, (list, tuple)) and len(citation) >= 4:
299
+ # Handle legacy citation format (header, score, text, source)
300
+ header, score, text, source = citation[:4]
301
+ cite_text = f"📚 {source} <small>(score: {score:.3f})</small>"
302
+ html_parts.append(f"<li>{cite_text}</li>")
303
+
304
+ html_parts.append("</ul></div>")
305
+ return "".join(html_parts)
306
+
307
+ @app.get("/")
308
+ async def root():
309
+ """Root endpoint to prevent invalid HTTP request warnings."""
310
+ return {
311
+ "service": "SIGHT Realtime API Server",
312
+ "version": "1.0.0",
313
+ "status": "running",
314
+ "endpoints": {
315
+ "session": "POST /session - Create realtime session",
316
+ "rag": "POST /rag - Query RAG system",
317
+ "health": "GET /health - Health check",
318
+ "methods": "GET /methods - List available RAG methods"
319
+ }
320
+ }
321
+
322
+ @app.get("/health")
323
+ async def health_check():
324
+ """Health check endpoint."""
325
+ return {"status": "healthy", "service": "SIGHT Realtime API Server"}
326
+
327
+ @app.get("/methods")
328
+ async def list_methods():
329
+ """List available RAG methods."""
330
+ return {
331
+ "methods": list(QUERY_DISPATCH.keys()),
332
+ "descriptions": {
333
+ 'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
334
+ 'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
335
+ 'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
336
+ 'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
337
+ 'context': "Context stuffing with full document loading and heuristic selection",
338
+ 'vision': "Vision-based search using GPT-5 Vision for image analysis"
339
+ }
340
+ }
341
+
342
+ @app.options("/{full_path:path}")
343
+ async def options_handler(request: Request, response: Response):
344
+ """Handle CORS preflight requests."""
345
+ response.headers["Access-Control-Allow-Origin"] = "*"
346
+ response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
347
+ response.headers["Access-Control-Allow-Headers"] = "*"
348
+ return response
349
+
350
+ if __name__ == "__main__":
351
+ import argparse
352
+
353
+ # Parse command line arguments
354
+ parser = argparse.ArgumentParser(description="SIGHT Realtime API Server")
355
+ parser.add_argument("--https", action="store_true", help="Enable HTTPS with self-signed certificate")
356
+ parser.add_argument("--port", type=int, default=5050, help="Port to run the server on")
357
+ parser.add_argument("--host", default="0.0.0.0", help="Host to bind the server to")
358
+ args = parser.parse_args()
359
+
360
+ # Configure logging
361
+ logging.basicConfig(
362
+ level=logging.INFO,
363
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
364
+ )
365
+
366
+ # Suppress uvicorn access logs for cleaner output
367
+ uvicorn_logger = logging.getLogger("uvicorn.access")
368
+ uvicorn_logger.setLevel(logging.WARNING)
369
+
370
+ # Prepare uvicorn configuration
371
+ uvicorn_config = {
372
+ "app": "realtime_server:app",
373
+ "host": args.host,
374
+ "port": args.port,
375
+ "reload": True,
376
+ "log_level": "warning",
377
+ "access_log": False
378
+ }
379
+
380
+ # Add SSL configuration if HTTPS is requested
381
+ if args.https:
382
+ logger.info("Starting server with HTTPS (self-signed certificate)")
383
+ logger.warning("⚠️ Self-signed certificate will show security warnings in browser")
384
+ logger.info("For production, use a proper SSL certificate from a CA")
385
+
386
+ # Note: You would need to generate SSL certificates
387
+ # For development, you can create self-signed certificates:
388
+ # openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -days 365 -nodes
389
+ uvicorn_config.update({
390
+ "ssl_keyfile": "key.pem",
391
+ "ssl_certfile": "cert.pem"
392
+ })
393
+
394
+ print(f"🔒 Starting HTTPS server on https://{args.host}:{args.port}")
395
+ print("📝 To generate self-signed certificates, run:")
396
+ print(" openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -days 365 -nodes")
397
+ else:
398
+ print(f"🌐 Starting HTTP server on http://{args.host}:{args.port}")
399
+ print("⚠️ HTTP only works for localhost. Use --https for production deployment.")
400
+
401
+ # Run the server
402
+ uvicorn.run(**uvicorn_config)
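
Editor's note, not part of the commit: the server above only needs its /session and /rag endpoints to be exercised by a frontend or a quick smoke test. A minimal client-side sketch, assuming the server runs locally on the argparse default port 5050 (the deployed port may differ) and using an illustrative question:

import requests

BASE_URL = "http://localhost:5050"  # argparse default; adjust to the deployed port

# Mint an ephemeral Realtime session token for the WebRTC client (all fields optional).
session = requests.post(f"{BASE_URL}/session", json={}, timeout=30).json()
print(session.get("session_id"), "token present:", bool(session.get("client_secret")))

# Query the RAG system directly, the same call the ask_rag tool triggers.
rag = requests.post(
    f"{BASE_URL}/rag",
    json={"query": "What guarding is required for a press brake?", "method": "graph", "top_k": 5},
    timeout=120,
).json()
print(rag["answer"][:200])
print(len(rag["citations"]), "citations")
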
requirements.txt CHANGED
@@ -1,10 +1,63 @@
1
- python-dotenv==1.1.1
2
- pymupdf4llm==0.0.27
3
- beautifulsoup4==4.13.4
4
- requests==2.32.4
5
- pandas==2.2.3
6
- openai==1.97.1
7
- networkx==3.5
8
- numpy==2.3.1
9
- scikit-learn==1.7.1
10
- streamlit==1.47.0
1
+ # Core dependencies (updated versions)
2
+ python-dotenv>=1.1.1
3
+ pymupdf4llm>=0.0.27
4
+ beautifulsoup4>=4.13.4
5
+ requests>=2.32.4
6
+ pandas>=2.2.3
7
+ openai>=1.99.9
8
+ networkx>=3.5
9
+ numpy>=2.3.1
10
+ scikit-learn>=1.7.1
11
+ streamlit>=1.47.0
12
+
13
+ # FastAPI and realtime API dependencies
14
+ fastapi>=0.104.0 # For realtime API server
15
+ uvicorn[standard]>=0.24.0 # ASGI server for FastAPI
16
+ pydantic>=2.4.0 # Data validation and settings management
17
+
18
+ # Document processing
19
+ pymupdf>=1.24.0 # For PDF processing and image extraction
20
+ Pillow>=10.0.0 # For image processing
21
+ lxml>=5.0.0 # For HTML parsing
22
+ html5lib>=1.1 # Alternative HTML parser
23
+
24
+ # Vector stores and search
25
+ faiss-cpu>=1.8.0 # For vector similarity search (use faiss-gpu if CUDA available)
26
+ chromadb>=0.5.0 # Alternative vector database
27
+ rank-bm25>=0.2.2 # For BM25 keyword search
28
+
29
+ # Language models and embeddings
30
+ sentence-transformers>=3.0.0 # For DPR and cross-encoder
31
+ transformers>=4.40.0 # Required by sentence-transformers
32
+ torch>=2.0.0 # For neural models (CPU version)
33
+ # For GPU support, install separately:
34
+ # pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
35
+ ftfy>=6.1.1 # Text preprocessing for CLIP
36
+ regex>=2023.0.0 # Text processing
37
+ # For CLIP (optional - enable if needed):
38
+ # git+https://github.com/openai/CLIP.git
39
+
40
+ # Token counting and management
41
+ tiktoken>=0.7.0 # For OpenAI token counting
42
+
43
+ # Database (optional)
44
+ # pymongo>=4.0.0 # Uncomment if using MongoDB for metadata
45
+
46
+ # Development and debugging
47
+ tqdm>=4.65.0 # Progress bars
48
+ ipython>=8.0.0 # For interactive debugging
49
+ jupyter>=1.0.0 # For notebook development
50
+
51
+ # Data visualization (optional)
52
+ matplotlib>=3.7.0 # For plotting
53
+ seaborn>=0.12.0 # Statistical visualization
54
+ plotly>=5.15.0 # Interactive plots
55
+
56
+ # Optional advanced features (uncomment if needed)
57
+ # langchain>=0.2.11 # For advanced RAG patterns
58
+ # langchain-openai>=0.1.20 # OpenAI integration for LangChain
59
+ # llama-index>=0.10.51 # Alternative RAG framework
60
+
61
+ # Additional utility packages
62
+ colorama>=0.4.6 # Colored console output
63
+ rich>=13.0.0 # Rich text and beautiful formatting in terminal
utils.py ADDED
@@ -0,0 +1,679 @@
1
+ """
2
+ Utility functions for the Multi-Method RAG System.
3
+
4
+ Directory Layout:
5
+ /data/ # Original PDFs, HTML
6
+ /embeddings/ # FAISS, Chroma, DPR vector stores
7
+ /graph/ # Graph database files
8
+ /metadata/ # Image metadata (SQLite or MongoDB)
9
+ """
10
+
11
+ import os
12
+ import json
13
+ import pickle
14
+ import sqlite3
15
+ import base64
16
+ from pathlib import Path
17
+ from typing import List, Dict, Tuple, Optional, Any, Union
18
+ from dataclasses import dataclass
19
+ import logging
20
+
21
+ import pymupdf4llm
22
+ import pymupdf
23
+ import numpy as np
24
+ import pandas as pd
25
+ from PIL import Image
26
+ import requests
27
+ from bs4 import BeautifulSoup
28
+
29
+ # Vector stores and search
30
+ import faiss
31
+ import chromadb
32
+ from rank_bm25 import BM25Okapi
33
+ import networkx as nx
34
+
35
+ # ML models
36
+ from openai import OpenAI
37
+ from sentence_transformers import SentenceTransformer, CrossEncoder
38
+ import torch
39
+ # import clip
40
+
41
+ # Text processing
42
+ from sklearn.feature_extraction.text import TfidfVectorizer
43
+ import tiktoken
44
+
45
+ from config import *
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ @dataclass
50
+ class DocumentChunk:
51
+ """Data structure for document chunks."""
52
+ text: str
53
+ metadata: Dict[str, Any]
54
+ chunk_id: str
55
+ embedding: Optional[np.ndarray] = None
56
+
57
+ @dataclass
58
+ class ImageData:
59
+ """Data structure for image metadata."""
60
+ image_path: str
61
+ image_id: str
62
+ classification: Optional[str] = None
63
+ embedding: Optional[np.ndarray] = None
64
+ metadata: Optional[Dict[str, Any]] = None
65
+
66
+ class DocumentLoader:
67
+ """Load and extract text from various document formats."""
68
+
69
+ def __init__(self):
70
+ self.client = OpenAI(api_key=OPENAI_API_KEY)
71
+ validate_api_key()
72
+
73
+ def load_pdf_documents(self, pdf_paths: List[Union[str, Path]]) -> List[Dict[str, Any]]:
74
+ """Load text from PDF files using pymupdf4llm."""
75
+ documents = []
76
+
77
+ for pdf_path in pdf_paths:
78
+ try:
79
+ pdf_path = Path(pdf_path)
80
+ logger.info(f"Loading PDF: {pdf_path}")
81
+
82
+ # Extract text using pymupdf4llm
83
+ text = pymupdf4llm.to_markdown(str(pdf_path))
84
+
85
+ # Extract images if present
86
+ images = self._extract_pdf_images(pdf_path)
87
+
88
+ doc = {
89
+ 'text': text,
90
+ 'source': str(pdf_path.name),
91
+ 'path': str(pdf_path),
92
+ 'type': 'pdf',
93
+ 'images': images,
94
+ 'metadata': {
95
+ 'file_size': pdf_path.stat().st_size,
96
+ 'modified': pdf_path.stat().st_mtime
97
+ }
98
+ }
99
+ documents.append(doc)
100
+
101
+ except Exception as e:
102
+ logger.error(f"Error loading PDF {pdf_path}: {e}")
103
+ continue
104
+
105
+ return documents
106
+
107
+ def _extract_pdf_images(self, pdf_path: Path) -> List[Dict[str, Any]]:
108
+ """Extract images from PDF using pymupdf."""
109
+ images = []
110
+
111
+ try:
112
+ doc = pymupdf.open(str(pdf_path))
113
+
114
+ for page_num in range(len(doc)):
115
+ page = doc[page_num]
116
+ image_list = page.get_images(full=True)
117
+
118
+ for img_index, img in enumerate(image_list):
119
+ try:
120
+ # Extract image
121
+ xref = img[0]
122
+ pix = pymupdf.Pixmap(doc, xref)
123
+
124
+ # Skip if pixmap is invalid or has no colorspace
125
+ if not pix or pix.colorspace is None:
126
+ if pix:
127
+ pix = None
128
+ continue
129
+
130
+ # Only process images with valid color channels
131
+ if pix.n - pix.alpha <= 4: # GRAY, RGB, or CMYK (CMYK converted to RGB below)
132
+ image_id = f"{pdf_path.stem}_p{page_num}_img{img_index}"
133
+ image_path = IMAGES_DIR / f"{image_id}.png"
134
+
135
+ # Convert to RGB if grayscale or other formats
136
+ if pix.n == 1: # Grayscale
137
+ rgb_pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
138
+ pix = None # Clean up original
139
+ pix = rgb_pix
140
+ elif pix.n == 4 and pix.alpha == 0: # CMYK
141
+ rgb_pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
142
+ pix = None # Clean up original
143
+ pix = rgb_pix
144
+
145
+ # Save image
146
+ pix.save(str(image_path))
147
+
148
+ images.append({
149
+ 'image_id': image_id,
150
+ 'image_path': str(image_path),
151
+ 'page': page_num,
152
+ 'source': str(pdf_path.name)
153
+ })
154
+
155
+ pix = None
156
+
157
+ except Exception as e:
158
+ logger.warning(f"Error extracting image {img_index} from page {page_num}: {e}")
159
+ if 'pix' in locals() and pix:
160
+ pix = None
161
+ continue
162
+
163
+ doc.close()
164
+
165
+ except Exception as e:
166
+ logger.error(f"Error extracting images from {pdf_path}: {e}")
167
+
168
+ return images
169
+
170
+ def load_html_documents(self, html_sources: List[Dict[str, str]]) -> List[Dict[str, Any]]:
171
+ """Load text from HTML sources."""
172
+ documents = []
173
+
174
+ for source in html_sources:
175
+ try:
176
+ logger.info(f"Loading HTML: {source.get('title', source['url'])}")
177
+
178
+ # Fetch HTML content
179
+ response = requests.get(source['url'], timeout=30)
180
+ response.raise_for_status()
181
+
182
+ # Parse with BeautifulSoup
183
+ soup = BeautifulSoup(response.text, 'html.parser')
184
+
185
+ # Extract text
186
+ text = soup.get_text(separator=' ', strip=True)
187
+
188
+ doc = {
189
+ 'text': text,
190
+ 'source': source.get('title', source['url']),
191
+ 'path': source['url'],
192
+ 'type': 'html',
193
+ 'images': [],
194
+ 'metadata': {
195
+ 'url': source['url'],
196
+ 'title': source.get('title', ''),
197
+ 'year': source.get('year', ''),
198
+ 'category': source.get('category', ''),
199
+ 'format': source.get('format', 'HTML')
200
+ }
201
+ }
202
+ documents.append(doc)
203
+
204
+ except Exception as e:
205
+ logger.error(f"Error loading HTML {source['url']}: {e}")
206
+ continue
207
+
208
+ return documents
209
+
210
+ def load_text_documents(self, data_dir: Path = DATA_DIR) -> List[Dict[str, Any]]:
211
+ """Load all supported document types from data directory."""
212
+ documents = []
213
+
214
+ # Load PDFs
215
+ pdf_files = list(data_dir.glob("*.pdf"))
216
+ if pdf_files:
217
+ documents.extend(self.load_pdf_documents(pdf_files))
218
+
219
+ # Load HTML sources (from config)
220
+ if DEFAULT_HTML_SOURCES:
221
+ documents.extend(self.load_html_documents(DEFAULT_HTML_SOURCES))
222
+
223
+ logger.info(f"Loaded {len(documents)} documents total")
224
+ return documents
225
+
226
+ class TextPreprocessor:
227
+ """Preprocess text for different retrieval methods."""
228
+
229
+ def __init__(self):
230
+ self.encoding = tiktoken.get_encoding("cl100k_base")
231
+
232
+ def chunk_text_by_tokens(self, text: str, chunk_size: int = CHUNK_SIZE,
233
+ overlap: int = CHUNK_OVERLAP) -> List[str]:
234
+ """Split text into chunks by token count."""
235
+ tokens = self.encoding.encode(text)
236
+ chunks = []
237
+
238
+ start = 0
239
+ while start < len(tokens):
240
+ end = start + chunk_size
241
+ chunk_tokens = tokens[start:end]
242
+ chunk_text = self.encoding.decode(chunk_tokens)
243
+ chunks.append(chunk_text)
244
+ start = end - overlap
245
+
246
+ return chunks
247
+
248
+ def chunk_text_by_sections(self, text: str, method: str = "vanilla") -> List[str]:
249
+ """Split text by sections based on method requirements."""
250
+ if method in ["vanilla", "dpr"]:
251
+ return self.chunk_text_by_tokens(text)
252
+ elif method == "bm25":
253
+ # BM25 works better with paragraph-level chunks
254
+ paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
255
+ return paragraphs
256
+ elif method == "graph":
257
+ # Graph method uses larger sections
258
+ return self.chunk_text_by_tokens(text, chunk_size=CHUNK_SIZE*2)
259
+ elif method == "context_stuffing":
260
+ # Context stuffing uses full documents
261
+ return [text]
262
+ else:
263
+ return self.chunk_text_by_tokens(text)
264
+
265
+ def preprocess_for_method(self, documents: List[Dict[str, Any]],
266
+ method: str) -> List[DocumentChunk]:
267
+ """Preprocess documents for specific retrieval method."""
268
+ chunks = []
269
+
270
+ for doc in documents:
271
+ text_chunks = self.chunk_text_by_sections(doc['text'], method)
272
+
273
+ for i, chunk_text in enumerate(text_chunks):
274
+ chunk_id = f"{doc['source']}_{method}_chunk_{i}"
275
+
276
+ chunk = DocumentChunk(
277
+ text=chunk_text,
278
+ metadata={
279
+ 'source': doc['source'],
280
+ 'path': doc['path'],
281
+ 'type': doc['type'],
282
+ 'chunk_index': i,
283
+ 'method': method,
284
+ **doc.get('metadata', {})
285
+ },
286
+ chunk_id=chunk_id
287
+ )
288
+ chunks.append(chunk)
289
+
290
+ logger.info(f"Created {len(chunks)} chunks for method '{method}'")
291
+ return chunks
292
+
293
+ class EmbeddingGenerator:
294
+ """Generate embeddings using various models."""
295
+
296
+ def __init__(self):
297
+ self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
298
+ self.sentence_transformer = None
299
+ # self.clip_model = None
300
+ # self.clip_preprocess = None
301
+
302
+ def _get_sentence_transformer(self):
303
+ """Lazy loading of sentence transformer."""
304
+ if self.sentence_transformer is None:
305
+ self.sentence_transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
306
+ if DEVICE == "cuda":
307
+ self.sentence_transformer = self.sentence_transformer.to(DEVICE)
308
+ return self.sentence_transformer
309
+
310
+ # def _get_clip_model(self):
311
+ # """Lazy loading of CLIP model."""
312
+ # if self.clip_model is None:
313
+ # self.clip_model, self.clip_preprocess = clip.load(CLIP_MODEL, device=DEVICE)
314
+ # return self.clip_model, self.clip_preprocess
315
+
316
+ def embed_text_openai(self, texts: List[str]) -> np.ndarray:
317
+ """Generate embeddings using OpenAI API."""
318
+ embeddings = []
319
+
320
+ # Process in batches
321
+ for i in range(0, len(texts), EMBEDDING_BATCH_SIZE):
322
+ batch = texts[i:i + EMBEDDING_BATCH_SIZE]
323
+
324
+ try:
325
+ response = self.openai_client.embeddings.create(
326
+ model=OPENAI_EMBEDDING_MODEL,
327
+ input=batch
328
+ )
329
+
330
+ batch_embeddings = [data.embedding for data in response.data]
331
+ embeddings.extend(batch_embeddings)
332
+
333
+ except Exception as e:
334
+ logger.error(f"Error generating OpenAI embeddings: {e}")
335
+ raise
336
+
337
+ return np.array(embeddings)
338
+
339
+ def embed_text_sentence_transformer(self, texts: List[str]) -> np.ndarray:
340
+ """Generate embeddings using sentence transformers."""
341
+ model = self._get_sentence_transformer()
342
+
343
+ try:
344
+ embeddings = model.encode(texts, convert_to_numpy=True,
345
+ show_progress_bar=True, batch_size=32)
346
+ return embeddings
347
+
348
+ except Exception as e:
349
+ logger.error(f"Error generating sentence transformer embeddings: {e}")
350
+ raise
351
+
352
+ def embed_image_clip(self, image_paths: List[str]) -> np.ndarray:
353
+ """Generate image embeddings using CLIP."""
354
+ # model, preprocess = self._get_clip_model()
355
+ # embeddings = []
356
+
357
+ # for image_path in image_paths:
358
+ # try:
359
+ # image = preprocess(Image.open(image_path)).unsqueeze(0).to(DEVICE)
360
+ #
361
+ # with torch.no_grad():
362
+ # image_features = model.encode_image(image)
363
+ # image_features /= image_features.norm(dim=-1, keepdim=True)
364
+ #
365
+ # embeddings.append(image_features.cpu().numpy().flatten())
366
+ #
367
+ # except Exception as e:
368
+ # logger.error(f"Error embedding image {image_path}: {e}")
369
+ # continue
370
+
371
+ # return np.array(embeddings) if embeddings else np.array([])
372
+
373
+ # Placeholder for CLIP embeddings
374
+ logger.warning("CLIP embeddings not implemented - returning dummy embeddings")
375
+ return np.random.rand(len(image_paths), 512)
376
+
377
+ class VectorStoreManager:
378
+ """Manage vector stores for different methods."""
379
+
380
+ def __init__(self):
381
+ self.embedding_generator = EmbeddingGenerator()
382
+
383
+ def build_faiss_index(self, chunks: List[DocumentChunk], method: str = "vanilla") -> Tuple[Any, List[Dict]]:
384
+ """Build FAISS index for vanilla or DPR method."""
385
+
386
+ # Generate embeddings
387
+ texts = [chunk.text for chunk in chunks]
388
+
389
+ if method == "vanilla":
390
+ embeddings = self.embedding_generator.embed_text_openai(texts)
391
+ elif method == "dpr":
392
+ embeddings = self.embedding_generator.embed_text_sentence_transformer(texts)
393
+ else:
394
+ raise ValueError(f"Unsupported method for FAISS: {method}")
395
+
396
+ # Build FAISS index
397
+ dimension = embeddings.shape[1]
398
+ index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity
399
+
400
+ # Ensure embeddings are float32 and normalize for cosine similarity
401
+ embeddings = embeddings.astype(np.float32)
402
+ faiss.normalize_L2(embeddings)
403
+ index.add(embeddings)
404
+
405
+ # Store chunk metadata
406
+ metadata = []
407
+ for i, chunk in enumerate(chunks):
408
+ metadata.append({
409
+ 'chunk_id': chunk.chunk_id,
410
+ 'text': chunk.text,
411
+ 'metadata': chunk.metadata,
412
+ 'embedding': embeddings[i].tolist()
413
+ })
414
+
415
+ logger.info(f"Built FAISS index with {index.ntotal} vectors for method '{method}'")
416
+ return index, metadata
417
+
418
+ def build_chroma_index(self, chunks: List[DocumentChunk], method: str = "vanilla") -> Any:
419
+ """Build Chroma vector database."""
420
+
421
+ # Initialize Chroma client
422
+ chroma_client = chromadb.PersistentClient(path=str(CHROMA_PATH / method))
423
+ collection = chroma_client.get_or_create_collection(
424
+ name=f"{method}_collection",
425
+ metadata={"method": method}
426
+ )
427
+
428
+ # Prepare data for Chroma
429
+ texts = [chunk.text for chunk in chunks]
430
+ ids = [chunk.chunk_id for chunk in chunks]
431
+ metadatas = [chunk.metadata for chunk in chunks]
432
+
433
+ # Add to collection (Chroma handles embeddings internally)
434
+ collection.add(
435
+ documents=texts,
436
+ ids=ids,
437
+ metadatas=metadatas
438
+ )
439
+
440
+ logger.info(f"Built Chroma collection with {collection.count()} documents for method '{method}'")
441
+ return collection
442
+
443
+ def build_bm25_index(self, chunks: List[DocumentChunk]) -> BM25Okapi:
444
+ """Build BM25 index for keyword search."""
445
+
446
+ # Tokenize texts
447
+ tokenized_corpus = []
448
+ for chunk in chunks:
449
+ tokens = chunk.text.lower().split()
450
+ tokenized_corpus.append(tokens)
451
+
452
+ # Build BM25 index
453
+ bm25 = BM25Okapi(tokenized_corpus, k1=BM25_K1, b=BM25_B)
454
+
455
+ logger.info(f"Built BM25 index with {len(tokenized_corpus)} documents")
456
+ return bm25
457
+
458
+ def build_graph_index(self, chunks: List[DocumentChunk]) -> nx.Graph:
459
+ """Build NetworkX graph for graph-based retrieval."""
460
+
461
+ # Create graph
462
+ G = nx.Graph()
463
+
464
+ # Generate embeddings for similarity calculation
465
+ texts = [chunk.text for chunk in chunks]
466
+ embeddings = self.embedding_generator.embed_text_openai(texts)
467
+
468
+ # Add nodes (convert embeddings to lists for GML serialization)
469
+ for i, chunk in enumerate(chunks):
470
+ G.add_node(chunk.chunk_id,
471
+ text=chunk.text,
472
+ metadata=chunk.metadata,
473
+ embedding=embeddings[i].tolist()) # Convert to list for serialization
474
+
475
+ # Add edges based on similarity
476
+ threshold = 0.7 # Similarity threshold
477
+ for i in range(len(chunks)):
478
+ for j in range(i + 1, len(chunks)):
479
+ # Calculate cosine similarity
480
+ sim = np.dot(embeddings[i], embeddings[j]) / (
481
+ np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j])
482
+ )
483
+
484
+ if sim > threshold:
485
+ G.add_edge(chunks[i].chunk_id, chunks[j].chunk_id,
486
+ weight=float(sim))
487
+
488
+ logger.info(f"Built graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
489
+ return G
490
+
491
+ def save_index(self, index: Any, metadata: Any, method: str):
492
+ """Save index and metadata to disk."""
493
+
494
+ if method == "vanilla":
495
+ faiss.write_index(index, str(VANILLA_FAISS_INDEX))
496
+ with open(VANILLA_METADATA, 'wb') as f:
497
+ pickle.dump(metadata, f)
498
+
499
+ elif method == "dpr":
500
+ faiss.write_index(index, str(DPR_FAISS_INDEX))
501
+ with open(DPR_METADATA, 'wb') as f:
502
+ pickle.dump(metadata, f)
503
+
504
+ elif method == "bm25":
505
+ with open(BM25_INDEX, 'wb') as f:
506
+ pickle.dump({'index': index, 'texts': metadata}, f)
507
+
508
+ elif method == "context_stuffing":
509
+ with open(CONTEXT_DOCS, 'wb') as f:
510
+ pickle.dump(metadata, f)
511
+
512
+ elif method == "graph":
513
+ nx.write_gml(index, str(GRAPH_FILE))
514
+
515
+ logger.info(f"Saved {method} index to disk")
516
+
517
+ class ImageProcessor:
518
+ """Process and classify images."""
519
+
520
+ def __init__(self):
521
+ self.embedding_generator = EmbeddingGenerator()
522
+ self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
523
+ self._init_database()
524
+
525
+ def _init_database(self):
526
+ """Initialize SQLite database for image metadata."""
527
+ conn = sqlite3.connect(IMAGES_DB)
528
+ cursor = conn.cursor()
529
+
530
+ cursor.execute('''
531
+ CREATE TABLE IF NOT EXISTS images (
532
+ image_id TEXT PRIMARY KEY,
533
+ image_path TEXT NOT NULL,
534
+ classification TEXT,
535
+ metadata TEXT,
536
+ embedding BLOB,
537
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
538
+ )
539
+ ''')
540
+
541
+ conn.commit()
542
+ conn.close()
543
+
544
+ def classify_image(self, image_path: str) -> str:
545
+ """Classify image using GPT-5 Vision."""
546
+ try:
547
+ # Convert image to base64
548
+ with open(image_path, "rb") as image_file:
549
+ image_b64 = base64.b64encode(image_file.read()).decode()
550
+
551
+ messages = [{
552
+ "role": "user",
553
+ "content": [
554
+ {"type": "text", "text": "Classify this image in 1-2 words (e.g., 'machine guard', 'press brake', 'conveyor belt', 'safety sign')."},
555
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}", "detail": "low"}}
556
+ ]
557
+ }]
558
+
559
+ # For GPT-5 vision, temperature must be default (1.0)
560
+ response = self.openai_client.chat.completions.create(
561
+ model=OPENAI_CHAT_MODEL,
562
+ messages=messages,
563
+ max_completion_tokens=50
564
+ )
565
+
566
+ return response.choices[0].message.content.strip()
567
+
568
+ except Exception as e:
569
+ logger.error(f"Error classifying image {image_path}: {e}")
570
+ return "unknown"
571
+
572
+ def should_filter_image(self, image_path: str) -> tuple[bool, str]:
573
+ """
574
+ Check if image should be filtered out based on height and black image criteria.
575
+
576
+ Args:
577
+ image_path: Path to the image file
578
+
579
+ Returns:
580
+ Tuple of (should_filter: bool, reason: str)
581
+ """
582
+ try:
583
+ from PIL import Image
584
+ import numpy as np
585
+
586
+ # Open and analyze the image
587
+ with Image.open(image_path) as img:
588
+ # Convert to RGB if needed
589
+ if img.mode != 'RGB':
590
+ img = img.convert('RGB')
591
+
592
+ width, height = img.size
593
+
594
+ # Filter 1: Height less than 40 pixels
595
+ if height < 40:
596
+ return True, f"height too small ({height}px)"
597
+
598
+ # Filter 2: Check if image is mostly black
599
+ img_array = np.array(img)
600
+ mean_brightness = np.mean(img_array)
601
+
602
+ # If mean brightness is very low (mostly black)
603
+ if mean_brightness < 10: # Adjust threshold as needed
604
+ return True, "mostly black image"
605
+
606
+ except Exception as e:
607
+ logger.warning(f"Error analyzing image {image_path}: {e}")
608
+ # If we can't analyze it, don't filter it out
609
+ return False, "analysis failed"
610
+
611
+ return False, "passed all filters"
612
+
613
+ def store_image_metadata(self, image_data: ImageData):
614
+ """Store image metadata in database."""
615
+ conn = sqlite3.connect(IMAGES_DB)
616
+ cursor = conn.cursor()
617
+
618
+ # Serialize metadata and embedding
619
+ metadata_json = json.dumps(image_data.metadata) if image_data.metadata else None
620
+ embedding_blob = image_data.embedding.tobytes() if image_data.embedding is not None else None
621
+
622
+ cursor.execute('''
623
+ INSERT OR REPLACE INTO images
624
+ (image_id, image_path, classification, metadata, embedding)
625
+ VALUES (?, ?, ?, ?, ?)
626
+ ''', (image_data.image_id, image_data.image_path,
627
+ image_data.classification, metadata_json, embedding_blob))
628
+
629
+ conn.commit()
630
+ conn.close()
631
+
632
+ def get_image_metadata(self, image_id: str) -> Optional[ImageData]:
633
+ """Retrieve image metadata from database."""
634
+ conn = sqlite3.connect(IMAGES_DB)
635
+ cursor = conn.cursor()
636
+
637
+ cursor.execute('''
638
+ SELECT image_id, image_path, classification, metadata, embedding
639
+ FROM images WHERE image_id = ?
640
+ ''', (image_id,))
641
+
642
+ row = cursor.fetchone()
643
+ conn.close()
644
+
645
+ if row:
646
+ image_id, image_path, classification, metadata_json, embedding_blob = row
647
+
648
+ metadata = json.loads(metadata_json) if metadata_json else None
649
+ embedding = np.frombuffer(embedding_blob, dtype=np.float32) if embedding_blob else None
650
+
651
+ return ImageData(
652
+ image_path=image_path,
653
+ image_id=image_id,
654
+ classification=classification,
655
+ embedding=embedding,
656
+ metadata=metadata
657
+ )
658
+
659
+ return None
660
+
661
+ def load_text_documents() -> List[Dict[str, Any]]:
662
+ """Convenience function to load all text documents."""
663
+ loader = DocumentLoader()
664
+ return loader.load_text_documents()
665
+
666
+ def embed_image_clip(image_paths: List[str]) -> np.ndarray:
667
+ """Convenience function to embed images with CLIP."""
668
+ generator = EmbeddingGenerator()
669
+ return generator.embed_image_clip(image_paths)
670
+
671
+ def store_image_metadata(image_data: ImageData):
672
+ """Convenience function to store image metadata."""
673
+ processor = ImageProcessor()
674
+ processor.store_image_metadata(image_data)
675
+
676
+ def classify_image(image_path: str) -> str:
677
+ """Convenience function to classify an image."""
678
+ processor = ImageProcessor()
679
+ return processor.classify_image(image_path)
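
Editor's note, not part of the commit: the classes above compose into a straightforward offline indexing pass. A minimal sketch for the vanilla FAISS method (preprocess.py in this commit presumably orchestrates something similar; paths and settings come from config.py):

from utils import DocumentLoader, TextPreprocessor, VectorStoreManager

# Load PDFs from the data directory plus the configured HTML sources.
docs = DocumentLoader().load_text_documents()

# Chunk by tokens for the vanilla method.
chunks = TextPreprocessor().preprocess_for_method(docs, "vanilla")

# Embed with OpenAI, build the FAISS index, and persist it to disk.
store = VectorStoreManager()
index, metadata = store.build_faiss_index(chunks, method="vanilla")
store.save_index(index, metadata, "vanilla")
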