Spaces:
Paused
Paused
| """ | |
| Central configuration file for the Multi-Method RAG System. | |
| All shared parameters and settings are defined here. | |
| """ | |
| import os | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| # Load environment variables | |
| load_dotenv(override=True) | |
| # ==================== Versioning and Date ==================== | |
| DATE = "August 13, 2025" | |
| VERSION = "2.0.1" | |
| # ==================== API Configuration ==================== | |
| OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") | |
| OPENAI_CHAT_MODEL = "gpt-5-chat-latest" # This is the non-reasoning model for gpt-5 so it has no latency | |
| OPENAI_EMBEDDING_MODEL = "text-embedding-3-large" # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002 | |
| # ==================== Realtime API Configuration ==================== | |
| # OpenAI Realtime API settings for speech-to-speech functionality | |
| OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview" # Realtime model for speech-to-speech | |
| REALTIME_VOICE = "alloy" # Available voices: alloy, echo, fable, onyx, nova, shimmer | |
| REALTIME_INSTRUCTIONS = ( | |
| "You are a knowledgeable safety expert speaking naturally in conversation. " | |
| "VOICE BEHAVIOR: " | |
| "- Speak like a confident safety professional talking to a colleague " | |
| "- Acknowledge what you heard: 'You're asking about [topic]...' " | |
| "- Use natural speech with appropriate pauses and emphasis " | |
| "- Sound authoritative and knowledgeable - you ARE the expert " | |
| "- Never mention document names, page numbers, or citation details when speaking " | |
| "- Just state the facts naturally as if you know them from your expertise " | |
| "RESPONSE PROCESS: " | |
| "1. Briefly acknowledge the question: 'You're asking about [topic]...' " | |
| "2. Call ask_rag to get the accurate information " | |
| "3. Speak the information naturally as YOUR expertise, not as 'according to document X' " | |
| "4. Organize complex topics: 'There are three key requirements here...' " | |
| "5. Be thorough but conversational - like explaining to a colleague " | |
| "CITATION RULE: " | |
| "NEVER mention specific documents, sources, or page numbers in speech. " | |
| "Just state the information confidently as if it's your professional knowledge. " | |
| "For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' " | |
| "IMPORTANT: Always use ask_rag for safety questions to get accurate information, " | |
| "but speak the results as your own expertise, not as citations." | |
| ) | |
| # ==================== Model Parameters ==================== | |
| # Generation parameters | |
| DEFAULT_TEMPERATURE = 0 # Range: 0.0-1.0 (0=deterministic, 1=creative) | |
| DEFAULT_MAX_TOKENS = 5000 # Maximum tokens in response | |
| DEFAULT_TOP_K = 5 # Number of chunks to retrieve by default | |
| DEFAULT_TOP_P = 1.0 # Nucleus sampling parameter | |
| # Context window management | |
| MAX_CONTEXT_TOKENS = 7500 # Maximum context for models with 8k window | |
| CHUNK_SIZE = 2000 # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens) | |
| CHUNK_OVERLAP = 200 # Token overlap between chunks | |
| # ==================== Embedding Models ==================== | |
| # Sentence Transformers models | |
| SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2' # For DPR | |
| CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # For re-ranking | |
| # CLIP model | |
| CLIP_MODEL = "ViT-L/14" # Options: ViT-B/32, ViT-L/14, RN50 | |
| # ==================== Search Parameters ==================== | |
| # BM25 parameters | |
| BM25_K1 = 1.5 # Term frequency saturation parameter | |
| BM25_B = 0.75 # Length normalization parameter | |
| # Hybrid search | |
| DEFAULT_HYBRID_ALPHA = 0.5 # Weight for BM25 (1-alpha for semantic) | |
| # Re-ranking | |
| RERANK_MULTIPLIER = 2 # Retrieve this many times top_k for re-ranking | |
| MIN_RELEVANCE_SCORE = 0.3 # Minimum score threshold | |
| # ==================== Directory Structure ==================== | |
| # Project directories | |
| PROJECT_ROOT = Path(__file__).parent | |
| DATA_DIR = PROJECT_ROOT / "data" | |
| EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings" | |
| GRAPH_DIR = PROJECT_ROOT / "graph" | |
| METADATA_DIR = PROJECT_ROOT / "metadata" | |
| IMAGES_DIR = DATA_DIR / "images" | |
| # File paths | |
| VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index" | |
| VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl" | |
| DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index" | |
| DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl" | |
| BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl" | |
| CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl" | |
| GRAPH_FILE = GRAPH_DIR / "graph.gml" | |
| IMAGES_DB = METADATA_DIR / "images.db" | |
| CHROMA_PATH = EMBEDDINGS_DIR / "chroma" | |
| # ==================== Batch Processing ==================== | |
| EMBEDDING_BATCH_SIZE = 100 # Batch size for OpenAI embeddings | |
| PROCESSING_BATCH_SIZE = 50 # Documents to process at once | |
| # ==================== UI Configuration ==================== | |
| # Streamlit settings | |
| MAX_CHAT_HISTORY = 5 # Maximum chat messages to keep | |
| EXAMPLE_QUESTIONS = [ | |
| "What are general machine guarding requirements?", | |
| "How do I perform lockout/tagout?", | |
| "What safety measures are needed for robotic systems?", | |
| "Explain the difference between guards and devices in machine safety.", | |
| "What are the OSHA requirements for emergency stops?", | |
| ] | |
| # Default method | |
| DEFAULT_METHOD = "graph" | |
| # Method descriptions for UI | |
| METHOD_DESCRIPTIONS = { | |
| 'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval", | |
| 'vanilla': "Standard vector search with FAISS and OpenAI embeddings", | |
| 'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking", | |
| 'bm25': "BM25 keyword search with neural re-ranking for exact term matching", | |
| 'context': "Context stuffing with full document loading and heuristic selection", | |
| 'vision': "Vision-based search using GPT-5 Vision for image analysis and classification" | |
| } | |
| # ==================== Document Processing ==================== | |
| # Document types | |
| SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html'] | |
| IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif'] | |
| # Text splitting | |
| MARKDOWN_HEADER_LEVEL = 3 # Split by this header level (###) | |
| MAX_SECTIONS_PER_DOC = 500 # Maximum sections to extract from a document | |
| # ==================== Logging ==================== | |
| LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") # DEBUG, INFO, WARNING, ERROR | |
| LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" | |
| # ==================== Performance ==================== | |
| # Device configuration | |
| import torch | |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" | |
| NUM_WORKERS = 4 # Parallel processing workers | |
| # Cache settings | |
| ENABLE_CACHE = True | |
| CACHE_TTL = 3600 # Cache time-to-live in seconds | |
| # ==================== Safety & Validation ==================== | |
| # Input validation | |
| MAX_QUESTION_LENGTH = 1000 # Maximum characters in a question | |
| MAX_IMAGE_SIZE_MB = 10 # Maximum image file size | |
| # Rate limiting (if needed) | |
| RATE_LIMIT_ENABLED = False | |
| MAX_QUERIES_PER_MINUTE = 60 | |
| # ==================== Default HTML Sources ==================== | |
| DEFAULT_HTML_SOURCES = [ | |
| { | |
| "title": "NIOSH Robotics in the Workplace – Safety Overview", | |
| "url": "https://www.cdc.gov/niosh/robotics/about/", | |
| "source": "NIOSH", | |
| "year": 2024, | |
| "category": "Technical Guide", | |
| "format": "HTML" | |
| } | |
| ] | |
| # ==================== Helper Functions ==================== | |
| def ensure_directories(): | |
| """Create all required directories if they don't exist.""" | |
| for directory in [DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR]: | |
| directory.mkdir(parents=True, exist_ok=True) | |
| def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int: | |
| """Get the context length for a given model.""" | |
| context_lengths = { | |
| "gpt-5": 128000, | |
| "gpt-4o-mini": 8192, | |
| "gpt-4o": 128000, | |
| } | |
| return context_lengths.get(model_name, 4096) | |
| def validate_api_key(): | |
| """Check if OpenAI API key is set.""" | |
| if not OPENAI_API_KEY: | |
| raise ValueError( | |
| "OpenAI API key not found. Please set OPENAI_API_KEY in .env file." | |
| ) | |
| return True | |
| # ==================== System Info ==================== | |
| def print_config(): | |
| """Print current configuration for debugging.""" | |
| print("="*50) | |
| print("RAG System Configuration") | |
| print("="*50) | |
| print(f"OpenAI Model: {OPENAI_CHAT_MODEL}") | |
| print(f"Embedding Model: {OPENAI_EMBEDDING_MODEL}") | |
| print(f"Device: {DEVICE}") | |
| print(f"Default Temperature: {DEFAULT_TEMPERATURE}") | |
| print(f"Default Top-K: {DEFAULT_TOP_K}") | |
| print(f"Chunk Size: {CHUNK_SIZE}") | |
| print(f"Project Root: {PROJECT_ROOT}") | |
| print("="*50) | |
| # Ensure directories exist on import | |
| ensure_directories() |