"""
Central configuration file for the Multi-Method RAG System.
All shared parameters and settings are defined here.
"""
import os
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables
load_dotenv(override=True)
# ==================== Versioning and Date ====================
DATE = "August 13, 2025"
VERSION = "2.0.1"
# ==================== API Configuration ====================
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_CHAT_MODEL = "gpt-5-chat-latest"  # Non-reasoning GPT-5 chat variant, so responses start without reasoning latency
OPENAI_EMBEDDING_MODEL = "text-embedding-3-large" # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002
# ==================== Realtime API Configuration ====================
# OpenAI Realtime API settings for speech-to-speech functionality
OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview" # Realtime model for speech-to-speech
REALTIME_VOICE = "alloy" # Available voices: alloy, echo, fable, onyx, nova, shimmer
REALTIME_INSTRUCTIONS = (
    "You are a knowledgeable safety expert speaking naturally in conversation. "
    "VOICE BEHAVIOR: "
    "- Speak like a confident safety professional talking to a colleague "
    "- Acknowledge what you heard: 'You're asking about [topic]...' "
    "- Use natural speech with appropriate pauses and emphasis "
    "- Sound authoritative and knowledgeable - you ARE the expert "
    "- Never mention document names, page numbers, or citation details when speaking "
    "- Just state the facts naturally as if you know them from your expertise "
    "RESPONSE PROCESS: "
    "1. Briefly acknowledge the question: 'You're asking about [topic]...' "
    "2. Call ask_rag to get the accurate information "
    "3. Speak the information naturally as YOUR expertise, not as 'according to document X' "
    "4. Organize complex topics: 'There are three key requirements here...' "
    "5. Be thorough but conversational - like explaining to a colleague "
    "CITATION RULE: "
    "NEVER mention specific documents, sources, or page numbers in speech. "
    "Just state the information confidently as if it's your professional knowledge. "
    "For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' "
    "IMPORTANT: Always use ask_rag for safety questions to get accurate information, "
    "but speak the results as your own expertise, not as citations."
)
# ==================== Model Parameters ====================
# Generation parameters
DEFAULT_TEMPERATURE = 0 # Range: 0.0-1.0 (0=deterministic, 1=creative)
DEFAULT_MAX_TOKENS = 5000 # Maximum tokens in response
DEFAULT_TOP_K = 5 # Number of chunks to retrieve by default
DEFAULT_TOP_P = 1.0 # Nucleus sampling parameter
# Context window management
MAX_CONTEXT_TOKENS = 7500 # Maximum context for models with 8k window
CHUNK_SIZE = 2000 # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens)
CHUNK_OVERLAP = 200 # Token overlap between chunks
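
# Worked example (illustration only, not used by the pipeline): with
# CHUNK_SIZE=2000 and CHUNK_OVERLAP=200, consecutive chunks start
# CHUNK_SIZE - CHUNK_OVERLAP = 1800 tokens apart, so neighbors share a
# 200-token window. A minimal sketch of that slicing:
def _example_chunk_token_ids(token_ids, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Overlapping token windows (hypothetical helper; real logic lives in TextPreprocessor)."""
    step = size - overlap
    chunks = []
    for start in range(0, len(token_ids), step):
        chunks.append(token_ids[start:start + size])
        if start + size >= len(token_ids):
            break  # final window already covers the tail
    return chunks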
# ==================== Embedding Models ====================
# Sentence Transformers models
SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2' # For DPR
CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # For re-ranking
# CLIP model
CLIP_MODEL = "ViT-L/14" # Options: ViT-B/32, ViT-L/14, RN50
# ==================== Search Parameters ====================
# BM25 parameters
BM25_K1 = 1.5 # Term frequency saturation parameter
BM25_B = 0.75 # Length normalization parameter
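
# How these enter Okapi BM25 (sketch, for reference): a single term contributes
#   idf * tf * (K1 + 1) / (tf + K1 * (1 - B + B * doc_len / avg_doc_len))
# Larger K1 lets repeated terms keep adding score; B=1 fully normalizes by
# document length while B=0 ignores it. Hypothetical helper, not used at runtime:
def _example_bm25_term_score(tf, idf, doc_len, avg_doc_len, k1=BM25_K1, b=BM25_B):
    """Okapi BM25 contribution of one term in one document (illustration only)."""
    length_norm = k1 * (1.0 - b + b * doc_len / avg_doc_len)
    return idf * tf * (k1 + 1.0) / (tf + length_norm)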
# Hybrid search
DEFAULT_HYBRID_ALPHA = 0.5 # Weight for BM25 (1-alpha for semantic)
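
# Illustrative mixing rule (hypothetical helper): assuming both scores are
# normalized to [0, 1], alpha=0.5 weights keyword and semantic evidence equally.
def _example_hybrid_score(bm25_score, semantic_score, alpha=DEFAULT_HYBRID_ALPHA):
    """alpha * BM25 + (1 - alpha) * semantic (illustration only)."""
    return alpha * bm25_score + (1.0 - alpha) * semantic_score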
# Re-ranking
RERANK_MULTIPLIER = 2 # Retrieve top_k * RERANK_MULTIPLIER candidates before re-ranking
MIN_RELEVANCE_SCORE = 0.3 # Minimum score threshold
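
# Illustrative re-ranking flow (hypothetical helper): over-retrieve
# RERANK_MULTIPLIER * top_k candidates, score each (text, score) pair with a
# cross-encoder, drop anything below MIN_RELEVANCE_SCORE, keep the best top_k.
def _example_rerank(scored_candidates, top_k=DEFAULT_TOP_K):
    """scored_candidates: list of (text, score) pairs (illustration only)."""
    kept = [pair for pair in scored_candidates if pair[1] >= MIN_RELEVANCE_SCORE]
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in kept[:top_k]]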
# ==================== Directory Structure ====================
# Project directories
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings"
GRAPH_DIR = PROJECT_ROOT / "graph"
METADATA_DIR = PROJECT_ROOT / "metadata"
IMAGES_DIR = DATA_DIR / "images"
# File paths
VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index"
VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl"
DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index"
DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl"
BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl"
CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl"
GRAPH_FILE = GRAPH_DIR / "graph.gml"
IMAGES_DB = METADATA_DIR / "images.db"
CHROMA_PATH = EMBEDDINGS_DIR / "chroma"
# ==================== Batch Processing ====================
EMBEDDING_BATCH_SIZE = 100 # Batch size for OpenAI embeddings
PROCESSING_BATCH_SIZE = 50 # Documents to process at once
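
# Illustrative batching (hypothetical helper): slicing inputs into groups of
# EMBEDDING_BATCH_SIZE keeps each embeddings request within API limits.
def _example_batched(items, batch_size=EMBEDDING_BATCH_SIZE):
    """Yield consecutive slices of at most batch_size items (illustration only)."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]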
# ==================== UI Configuration ====================
# Streamlit settings
MAX_CHAT_HISTORY = 5 # Maximum chat messages to keep
EXAMPLE_QUESTIONS = [
    "What are general machine guarding requirements?",
    "How do I perform lockout/tagout?",
    "What safety measures are needed for robotic systems?",
    "Explain the difference between guards and devices in machine safety.",
    "What are the OSHA requirements for emergency stops?",
]
# Default method
DEFAULT_METHOD = "graph"
# Method descriptions for UI
METHOD_DESCRIPTIONS = {
    'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
    'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
    'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
    'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
    'context': "Context stuffing with full document loading and heuristic selection",
    'vision': "Vision-based search using GPT-5 Vision for image analysis and classification"
}
# ==================== Document Processing ====================
# Document types
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html']
IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif']
# Text splitting
MARKDOWN_HEADER_LEVEL = 3 # Split by this header level (###)
MAX_SECTIONS_PER_DOC = 500 # Maximum sections to extract from a document
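
# Example (illustration): with MARKDOWN_HEADER_LEVEL = 3, a document containing
# "### Guarding" and "### Lockout/Tagout" splits into two sections at those
# headings, up to MAX_SECTIONS_PER_DOC sections per document.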
# ==================== Logging ====================
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") # DEBUG, INFO, WARNING, ERROR
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# ==================== Performance ====================
# Device configuration
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_WORKERS = 4 # Parallel processing workers
# Cache settings
ENABLE_CACHE = True
CACHE_TTL = 3600 # Cache time-to-live in seconds
# ==================== Safety & Validation ====================
# Input validation
MAX_QUESTION_LENGTH = 1000 # Maximum characters in a question
MAX_IMAGE_SIZE_MB = 10 # Maximum image file size
# Rate limiting (if needed)
RATE_LIMIT_ENABLED = False
MAX_QUERIES_PER_MINUTE = 60
# ==================== Default HTML Sources ====================
DEFAULT_HTML_SOURCES = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "format": "HTML"
    }
]
# ==================== Helper Functions ====================
def ensure_directories():
    """Create all required directories if they don't exist."""
    for directory in [DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR]:
        directory.mkdir(parents=True, exist_ok=True)
def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int:
    """Get the context length for a given model."""
    context_lengths = {
        "gpt-5": 128000,
        "gpt-4o-mini": 8192,
        "gpt-4o": 128000,
    }
    return context_lengths.get(model_name, 4096)
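
# Example: get_model_context_length("gpt-4o") -> 128000. Note the configured
# default "gpt-5-chat-latest" has no entry above, so it currently falls back to
# the conservative 4096 default; add an entry if a larger window should apply.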
def validate_api_key():
    """Check if OpenAI API key is set."""
    if not OPENAI_API_KEY:
        raise ValueError(
            "OpenAI API key not found. Please set OPENAI_API_KEY in .env file."
        )
    return True
# ==================== System Info ====================
def print_config():
    """Print current configuration for debugging."""
    print("="*50)
    print("RAG System Configuration")
    print("="*50)
    print(f"OpenAI Model: {OPENAI_CHAT_MODEL}")
    print(f"Embedding Model: {OPENAI_EMBEDDING_MODEL}")
    print(f"Device: {DEVICE}")
    print(f"Default Temperature: {DEFAULT_TEMPERATURE}")
    print(f"Default Top-K: {DEFAULT_TOP_K}")
    print(f"Chunk Size: {CHUNK_SIZE}")
    print(f"Project Root: {PROJECT_ROOT}")
    print("="*50)
# Ensure directories exist on import
ensure_directories()