File size: 8,829 Bytes
ef821d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
"""

Central configuration file for the Multi-Method RAG System.

All shared parameters and settings are defined here.

"""

import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv(override=True)

# ==================== Versioning and Date ====================
DATE = "August 13, 2025"
VERSION = "2.0.1"


# ==================== API Configuration ====================
# Read from the environment / .env (loaded above); checked by validate_api_key().
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_CHAT_MODEL = "gpt-5-chat-latest"  # This is the non-reasoning model for gpt-5 so it has no latency
OPENAI_EMBEDDING_MODEL = "text-embedding-3-large"  # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002

# ==================== Realtime API Configuration ====================
# OpenAI Realtime API settings for speech-to-speech functionality
OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview"  # Realtime model for speech-to-speech
REALTIME_VOICE = "alloy"  # Available voices: alloy, echo, fable, onyx, nova, shimmer
# System prompt for the realtime voice agent.  NOTE: the pieces below are
# adjacent string literals (no commas), so Python concatenates them into a
# single str at compile time -- this is one string, not a tuple.  The
# trailing space on each literal keeps words from running together.
REALTIME_INSTRUCTIONS = (
    "You are a knowledgeable safety expert speaking naturally in conversation. "

    "VOICE BEHAVIOR: "
    "- Speak like a confident safety professional talking to a colleague "
    "- Acknowledge what you heard: 'You're asking about [topic]...' "
    "- Use natural speech with appropriate pauses and emphasis "
    "- Sound authoritative and knowledgeable - you ARE the expert "
    "- Never mention document names, page numbers, or citation details when speaking "
    "- Just state the facts naturally as if you know them from your expertise "

    "RESPONSE PROCESS: "
    "1. Briefly acknowledge the question: 'You're asking about [topic]...' "
    "2. Call ask_rag to get the accurate information "
    "3. Speak the information naturally as YOUR expertise, not as 'according to document X' "
    "4. Organize complex topics: 'There are three key requirements here...' "
    "5. Be thorough but conversational - like explaining to a colleague "

    "CITATION RULE: "
    "NEVER mention specific documents, sources, or page numbers in speech. "
    "Just state the information confidently as if it's your professional knowledge. "
    "For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' "

    "IMPORTANT: Always use ask_rag for safety questions to get accurate information, "
    "but speak the results as your own expertise, not as citations."
)

# ==================== Model Parameters ====================
# Generation parameters
DEFAULT_TEMPERATURE = 0  # Range: 0.0-1.0 (0=deterministic, 1=creative)
DEFAULT_MAX_TOKENS = 5000  # Maximum tokens in response
DEFAULT_TOP_K = 5  # Number of chunks to retrieve by default
DEFAULT_TOP_P = 1.0  # Nucleus sampling parameter

# Context window management
MAX_CONTEXT_TOKENS = 7500  # Maximum context for models with 8k window
CHUNK_SIZE = 2000  # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens)
CHUNK_OVERLAP = 200  # Token overlap between chunks

# ==================== Embedding Models ====================
# Sentence Transformers models
SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'  # For DPR
CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'  # For re-ranking

# CLIP model
CLIP_MODEL = "ViT-L/14"  # Options: ViT-B/32, ViT-L/14, RN50

# ==================== Search Parameters ====================
# BM25 parameters (standard Okapi BM25 tuning knobs)
BM25_K1 = 1.5  # Term frequency saturation parameter
BM25_B = 0.75  # Length normalization parameter

# Hybrid search
DEFAULT_HYBRID_ALPHA = 0.5  # Weight for BM25 (1-alpha for semantic)

# Re-ranking
RERANK_MULTIPLIER = 2  # Retrieve this many times top_k for re-ranking
MIN_RELEVANCE_SCORE = 0.3  # Minimum score threshold

# ==================== Directory Structure ====================
# Project directories -- all resolved relative to this config file's location,
# so the layout is stable regardless of the process's working directory.
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings"
GRAPH_DIR = PROJECT_ROOT / "graph"
METADATA_DIR = PROJECT_ROOT / "metadata"
IMAGES_DIR = DATA_DIR / "images"

# File paths -- one index/metadata pair per retrieval method.
VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index"
VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl"
DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index"
DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl"
BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl"
CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl"
GRAPH_FILE = GRAPH_DIR / "graph.gml"
IMAGES_DB = METADATA_DIR / "images.db"
CHROMA_PATH = EMBEDDINGS_DIR / "chroma"

# ==================== Batch Processing ====================
EMBEDDING_BATCH_SIZE = 100  # Batch size for OpenAI embeddings
PROCESSING_BATCH_SIZE = 50  # Documents to process at once

# ==================== UI Configuration ====================
# Streamlit settings
MAX_CHAT_HISTORY = 5  # Maximum chat messages to keep
EXAMPLE_QUESTIONS = [
    "What are general machine guarding requirements?",
    "How do I perform lockout/tagout?",
    "What safety measures are needed for robotic systems?",
    "Explain the difference between guards and devices in machine safety.",
    "What are the OSHA requirements for emergency stops?",
]

# Default retrieval method selected in the UI; must be a key of METHOD_DESCRIPTIONS.
DEFAULT_METHOD = "graph"

# Method descriptions for UI
METHOD_DESCRIPTIONS = {
    'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
    'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
    'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
    'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
    'context': "Context stuffing with full document loading and heuristic selection",
    'vision': "Vision-based search using GPT-5 Vision for image analysis and classification"
}

# ==================== Document Processing ====================
# Document types
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html']
IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif']

# Text splitting
MARKDOWN_HEADER_LEVEL = 3  # Split by this header level (###)
MAX_SECTIONS_PER_DOC = 500  # Maximum sections to extract from a document

# ==================== Logging ====================
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")  # DEBUG, INFO, WARNING, ERROR
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# ==================== Performance ====================
# Device configuration
# NOTE(review): torch is imported here, mid-file, rather than in the import
# block at the top of the module -- presumably to keep it next to its only
# use; consider moving it to the top-level imports.
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when available
NUM_WORKERS = 4  # Parallel processing workers

# Cache settings
ENABLE_CACHE = True
CACHE_TTL = 3600  # Cache time-to-live in seconds

# ==================== Safety & Validation ====================
# Input validation
MAX_QUESTION_LENGTH = 1000  # Maximum characters in a question
MAX_IMAGE_SIZE_MB = 10  # Maximum image file size

# Rate limiting (if needed)
RATE_LIMIT_ENABLED = False
MAX_QUERIES_PER_MINUTE = 60

# ==================== Default HTML Sources ====================
# Seed list of HTML documents to ingest when none are configured elsewhere.
DEFAULT_HTML_SOURCES = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "format": "HTML"
    }
]

# ==================== Helper Functions ====================
def ensure_directories():
    """Create every directory the system writes to, if it is missing.

    Safe to call repeatedly: existing directories are left untouched.
    """
    required = (DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR)
    for path in required:
        path.mkdir(parents=True, exist_ok=True)

def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int:
    """Return the context-window size, in tokens, for *model_name*.

    Defaults to the configured chat model.  Unknown models fall back to a
    conservative 4096.
    """
    context_lengths = {
        "gpt-5": 128000,
        # Fix: the configured OPENAI_CHAT_MODEL was missing from this table,
        # so the default call silently fell back to 4096 tokens.
        "gpt-5-chat-latest": 128000,
        "gpt-4o-mini": 8192,
        "gpt-4o": 128000,
    }
    return context_lengths.get(model_name, 4096)

def validate_api_key():
    """Return True when an OpenAI API key is configured.

    Raises:
        ValueError: if OPENAI_API_KEY is unset or empty.
    """
    if OPENAI_API_KEY:
        return True
    raise ValueError(
        "OpenAI API key not found. Please set OPENAI_API_KEY in .env file."
    )

# ==================== System Info ====================
def print_config():
    """Print the key configuration values to stdout (debugging aid)."""
    divider = "=" * 50
    settings = [
        ("OpenAI Model", OPENAI_CHAT_MODEL),
        ("Embedding Model", OPENAI_EMBEDDING_MODEL),
        ("Device", DEVICE),
        ("Default Temperature", DEFAULT_TEMPERATURE),
        ("Default Top-K", DEFAULT_TOP_K),
        ("Chunk Size", CHUNK_SIZE),
        ("Project Root", PROJECT_ROOT),
    ]
    print(divider)
    print("RAG System Configuration")
    print(divider)
    for label, value in settings:
        print(f"{label}: {value}")
    print(divider)

# Ensure directories exist on import: merely importing this config module
# creates the data/embeddings/graph/metadata/images directories as a side effect.
ensure_directories()