File size: 8,829 Bytes
ef821d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
"""

Central configuration file for the Multi-Method RAG System.

All shared parameters and settings are defined here.

"""

import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv(override=True)

# ==================== Versioning and Date ====================
DATE = "August 13, 2025"
VERSION = "2.0.1"


# ==================== API Configuration ====================
# Read from the environment / .env (loaded above); checked by validate_api_key().
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_CHAT_MODEL = "gpt-5-chat-latest"  # This is the non-reasoning model for gpt-5 so it has no latency
OPENAI_EMBEDDING_MODEL = "text-embedding-3-large"  # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002

# ==================== Realtime API Configuration ====================
# OpenAI Realtime API settings for speech-to-speech functionality
OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview"  # Realtime model for speech-to-speech
REALTIME_VOICE = "alloy"  # Available voices: alloy, echo, fable, onyx, nova, shimmer
# System prompt for the realtime voice agent.  NOTE: the pieces below are
# adjacent string literals (no commas), so Python concatenates them into a
# single str at compile time -- this is one string, not a tuple.  The
# trailing space on each literal keeps words from running together.
REALTIME_INSTRUCTIONS = (
    "You are a knowledgeable safety expert speaking naturally in conversation. "

    "VOICE BEHAVIOR: "
    "- Speak like a confident safety professional talking to a colleague "
    "- Acknowledge what you heard: 'You're asking about [topic]...' "
    "- Use natural speech with appropriate pauses and emphasis "
    "- Sound authoritative and knowledgeable - you ARE the expert "
    "- Never mention document names, page numbers, or citation details when speaking "
    "- Just state the facts naturally as if you know them from your expertise "

    "RESPONSE PROCESS: "
    "1. Briefly acknowledge the question: 'You're asking about [topic]...' "
    "2. Call ask_rag to get the accurate information "
    "3. Speak the information naturally as YOUR expertise, not as 'according to document X' "
    "4. Organize complex topics: 'There are three key requirements here...' "
    "5. Be thorough but conversational - like explaining to a colleague "

    "CITATION RULE: "
    "NEVER mention specific documents, sources, or page numbers in speech. "
    "Just state the information confidently as if it's your professional knowledge. "
    "For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' "

    "IMPORTANT: Always use ask_rag for safety questions to get accurate information, "
    "but speak the results as your own expertise, not as citations."
)

# ==================== Model Parameters ====================
# Generation parameters
DEFAULT_TEMPERATURE = 0  # Range: 0.0-1.0 (0=deterministic, 1=creative)
DEFAULT_MAX_TOKENS = 5000  # Maximum tokens in response
DEFAULT_TOP_K = 5  # Number of chunks to retrieve by default
DEFAULT_TOP_P = 1.0  # Nucleus sampling parameter

# Context window management
MAX_CONTEXT_TOKENS = 7500  # Maximum context for models with 8k window
CHUNK_SIZE = 2000  # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens)
CHUNK_OVERLAP = 200  # Token overlap between chunks

# ==================== Embedding Models ====================
# Sentence Transformers models
SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'  # For DPR
CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'  # For re-ranking

# CLIP model
CLIP_MODEL = "ViT-L/14"  # Options: ViT-B/32, ViT-L/14, RN50

# ==================== Search Parameters ====================
# BM25 parameters (standard Okapi BM25 tuning knobs)
BM25_K1 = 1.5  # Term frequency saturation parameter
BM25_B = 0.75  # Length normalization parameter

# Hybrid search
DEFAULT_HYBRID_ALPHA = 0.5  # Weight for BM25 (1-alpha for semantic)

# Re-ranking
RERANK_MULTIPLIER = 2  # Retrieve this many times top_k for re-ranking
MIN_RELEVANCE_SCORE = 0.3  # Minimum score threshold

# ==================== Directory Structure ====================
# Project directories -- all resolved relative to this config file's location,
# so the layout is stable regardless of the process's working directory.
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings"
GRAPH_DIR = PROJECT_ROOT / "graph"
METADATA_DIR = PROJECT_ROOT / "metadata"
IMAGES_DIR = DATA_DIR / "images"

# File paths -- one index/metadata pair per retrieval method.
VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index"
VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl"
DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index"
DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl"
BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl"
CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl"
GRAPH_FILE = GRAPH_DIR / "graph.gml"
IMAGES_DB = METADATA_DIR / "images.db"
CHROMA_PATH = EMBEDDINGS_DIR / "chroma"

# ==================== Batch Processing ====================
EMBEDDING_BATCH_SIZE = 100  # Batch size for OpenAI embeddings
PROCESSING_BATCH_SIZE = 50  # Documents to process at once

# ==================== UI Configuration ====================
# Streamlit settings
MAX_CHAT_HISTORY = 5  # Maximum chat messages to keep
EXAMPLE_QUESTIONS = [
    "What are general machine guarding requirements?",
    "How do I perform lockout/tagout?",
    "What safety measures are needed for robotic systems?",
    "Explain the difference between guards and devices in machine safety.",
    "What are the OSHA requirements for emergency stops?",
]

# Default retrieval method selected in the UI; must be a key of METHOD_DESCRIPTIONS.
DEFAULT_METHOD = "graph"

# Method descriptions for UI
METHOD_DESCRIPTIONS = {
    'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
    'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
    'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
    'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
    'context': "Context stuffing with full document loading and heuristic selection",
    'vision': "Vision-based search using GPT-5 Vision for image analysis and classification"
}

# ==================== Document Processing ====================
# Document types
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html']
IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif']

# Text splitting
MARKDOWN_HEADER_LEVEL = 3  # Split by this header level (###)
MAX_SECTIONS_PER_DOC = 500  # Maximum sections to extract from a document

# ==================== Logging ====================
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")  # DEBUG, INFO, WARNING, ERROR
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# ==================== Performance ====================
# Device configuration
# NOTE(review): torch is imported here, mid-file, rather than in the import
# block at the top of the module -- presumably to keep it next to its only
# use; consider moving it to the top-level imports.
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when available
NUM_WORKERS = 4  # Parallel processing workers

# Cache settings
ENABLE_CACHE = True
CACHE_TTL = 3600  # Cache time-to-live in seconds

# ==================== Safety & Validation ====================
# Input validation
MAX_QUESTION_LENGTH = 1000  # Maximum characters in a question
MAX_IMAGE_SIZE_MB = 10  # Maximum image file size

# Rate limiting (if needed)
RATE_LIMIT_ENABLED = False
MAX_QUERIES_PER_MINUTE = 60

# ==================== Default HTML Sources ====================
# Seed list of HTML documents to ingest when none are configured elsewhere.
DEFAULT_HTML_SOURCES = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "format": "HTML"
    }
]

# ==================== Helper Functions ====================
def ensure_directories():
    """Create every directory the system writes to, if it is missing.

    Safe to call repeatedly: existing directories are left untouched.
    """
    required = (DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR)
    for path in required:
        path.mkdir(parents=True, exist_ok=True)

def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int:
    """Return the context-window size, in tokens, for *model_name*.

    Defaults to the configured chat model.  Unknown models fall back to a
    conservative 4096.
    """
    context_lengths = {
        "gpt-5": 128000,
        # Fix: the configured OPENAI_CHAT_MODEL was missing from this table,
        # so the default call silently fell back to 4096 tokens.
        "gpt-5-chat-latest": 128000,
        "gpt-4o-mini": 8192,
        "gpt-4o": 128000,
    }
    return context_lengths.get(model_name, 4096)

def validate_api_key():
    """Return True when an OpenAI API key is configured.

    Raises:
        ValueError: if OPENAI_API_KEY is unset or empty.
    """
    if OPENAI_API_KEY:
        return True
    raise ValueError(
        "OpenAI API key not found. Please set OPENAI_API_KEY in .env file."
    )

# ==================== System Info ====================
def print_config():
    """Print the key configuration values to stdout (debugging aid)."""
    divider = "=" * 50
    settings = [
        ("OpenAI Model", OPENAI_CHAT_MODEL),
        ("Embedding Model", OPENAI_EMBEDDING_MODEL),
        ("Device", DEVICE),
        ("Default Temperature", DEFAULT_TEMPERATURE),
        ("Default Top-K", DEFAULT_TOP_K),
        ("Chunk Size", CHUNK_SIZE),
        ("Project Root", PROJECT_ROOT),
    ]
    print(divider)
    print("RAG System Configuration")
    print(divider)
    for label, value in settings:
        print(f"{label}: {value}")
    print(divider)

# Ensure directories exist on import: merely importing this config module
# creates the data/embeddings/graph/metadata/images directories as a side effect.
ensure_directories()