# sight_chat/config.py
# NOTE: the original upload header listed "version 2.0.0", but the VERSION
# constant below is "2.0.1" — the constant is authoritative.
"""
Central configuration file for the Multi-Method RAG System.
All shared parameters and settings are defined here.
"""
import os
from pathlib import Path

import torch
from dotenv import load_dotenv
# Load environment variables
load_dotenv(override=True)
# ==================== Versioning and Date ====================
DATE = "August 13, 2025"  # Human-readable release date displayed in the UI
VERSION = "2.0.1"  # Application version string
# ==================== API Configuration ====================
# Key is loaded from the environment / .env file; presence is checked by
# validate_api_key() before use.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_CHAT_MODEL = "gpt-5-chat-latest" # This is the non-reasoning model for gpt-5 so it has no latency
OPENAI_EMBEDDING_MODEL = "text-embedding-3-large" # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002
# ==================== Realtime API Configuration ====================
# OpenAI Realtime API settings for speech-to-speech functionality
OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview" # Realtime model for speech-to-speech
REALTIME_VOICE = "alloy" # Available voices: alloy, echo, fable, onyx, nova, shimmer
# System prompt for realtime voice sessions. It instructs the voice agent to
# fetch facts via the ask_rag tool but to speak them as first-person expertise,
# deliberately suppressing document/page citations in spoken answers.
REALTIME_INSTRUCTIONS = (
"You are a knowledgeable safety expert speaking naturally in conversation. "
"VOICE BEHAVIOR: "
"- Speak like a confident safety professional talking to a colleague "
"- Acknowledge what you heard: 'You're asking about [topic]...' "
"- Use natural speech with appropriate pauses and emphasis "
"- Sound authoritative and knowledgeable - you ARE the expert "
"- Never mention document names, page numbers, or citation details when speaking "
"- Just state the facts naturally as if you know them from your expertise "
"RESPONSE PROCESS: "
"1. Briefly acknowledge the question: 'You're asking about [topic]...' "
"2. Call ask_rag to get the accurate information "
"3. Speak the information naturally as YOUR expertise, not as 'according to document X' "
"4. Organize complex topics: 'There are three key requirements here...' "
"5. Be thorough but conversational - like explaining to a colleague "
"CITATION RULE: "
"NEVER mention specific documents, sources, or page numbers in speech. "
"Just state the information confidently as if it's your professional knowledge. "
"For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' "
"IMPORTANT: Always use ask_rag for safety questions to get accurate information, "
"but speak the results as your own expertise, not as citations."
)
# ==================== Model Parameters ====================
# Generation parameters
DEFAULT_TEMPERATURE = 0 # Range: 0.0-1.0 (0=deterministic, 1=creative)
DEFAULT_MAX_TOKENS = 5000 # Maximum tokens in response
DEFAULT_TOP_K = 5 # Number of chunks to retrieve by default
DEFAULT_TOP_P = 1.0 # Nucleus sampling parameter (1.0 = no nucleus truncation)
# Context window management
MAX_CONTEXT_TOKENS = 7500 # Maximum context for models with 8k window
CHUNK_SIZE = 2000 # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens)
CHUNK_OVERLAP = 200 # Token overlap between chunks (must stay well below CHUNK_SIZE)
# ==================== Embedding Models ====================
# Sentence Transformers models
SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2' # Bi-encoder used for DPR retrieval
CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # For re-ranking retrieved chunks
# CLIP model
CLIP_MODEL = "ViT-L/14" # Options: ViT-B/32, ViT-L/14, RN50
# ==================== Search Parameters ====================
# BM25 parameters (standard Okapi BM25 tuning knobs)
BM25_K1 = 1.5 # Term frequency saturation parameter
BM25_B = 0.75 # Length normalization parameter
# Hybrid search
DEFAULT_HYBRID_ALPHA = 0.5 # Weight for BM25 (1-alpha for semantic)
# Re-ranking
RERANK_MULTIPLIER = 2 # Retrieve this many times top_k for re-ranking
MIN_RELEVANCE_SCORE = 0.3 # Minimum score threshold
# ==================== Directory Structure ====================
# Project directories (all resolved relative to this config file's location;
# created on import by ensure_directories())
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings"
GRAPH_DIR = PROJECT_ROOT / "graph"
METADATA_DIR = PROJECT_ROOT / "metadata"
IMAGES_DIR = DATA_DIR / "images"
# File paths — one index/metadata pair per retrieval method
VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index"
VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl"
DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index"
DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl"
BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl"
CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl"
GRAPH_FILE = GRAPH_DIR / "graph.gml"
IMAGES_DB = METADATA_DIR / "images.db"
CHROMA_PATH = EMBEDDINGS_DIR / "chroma"
# ==================== Batch Processing ====================
EMBEDDING_BATCH_SIZE = 100 # Batch size for OpenAI embedding API requests
PROCESSING_BATCH_SIZE = 50 # Documents to process at once
# ==================== UI Configuration ====================
# Streamlit settings
MAX_CHAT_HISTORY = 5 # Maximum chat messages to keep in session history
# Canned questions offered in the UI as starting points
EXAMPLE_QUESTIONS = [
"What are general machine guarding requirements?",
"How do I perform lockout/tagout?",
"What safety measures are needed for robotic systems?",
"Explain the difference between guards and devices in machine safety.",
"What are the OSHA requirements for emergency stops?",
]
# Default retrieval method selected in the UI (must be a key of METHOD_DESCRIPTIONS)
DEFAULT_METHOD = "graph"
# Method descriptions for UI
METHOD_DESCRIPTIONS = {
'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
'context': "Context stuffing with full document loading and heuristic selection",
'vision': "Vision-based search using GPT-5 Vision for image analysis and classification"
}
# ==================== Document Processing ====================
# Document types accepted for ingestion
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html']
IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif']
# Text splitting
MARKDOWN_HEADER_LEVEL = 3 # Split by this header level (###)
MAX_SECTIONS_PER_DOC = 500 # Maximum sections to extract from a document
# ==================== Logging ====================
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") # DEBUG, INFO, WARNING, ERROR
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# ==================== Performance ====================
# Device configuration. torch is imported with the other dependencies at the
# top of the file (PEP 8: imports belong at module top, not mid-file).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Prefer GPU when present
NUM_WORKERS = 4 # Parallel processing workers
# Cache settings
ENABLE_CACHE = True # Toggle result caching
CACHE_TTL = 3600 # Cache time-to-live in seconds
# ==================== Safety & Validation ====================
# Input validation limits applied to user-supplied questions/uploads
MAX_QUESTION_LENGTH = 1000 # Maximum characters in a question
MAX_IMAGE_SIZE_MB = 10 # Maximum image file size
# Rate limiting (disabled by default; flip RATE_LIMIT_ENABLED to activate)
RATE_LIMIT_ENABLED = False
MAX_QUERIES_PER_MINUTE = 60
# ==================== Default HTML Sources ====================
# Seed list of HTML documents ingested when no other sources are configured
DEFAULT_HTML_SOURCES = [
{
"title": "NIOSH Robotics in the Workplace – Safety Overview",
"url": "https://www.cdc.gov/niosh/robotics/about/",
"source": "NIOSH",
"year": 2024,
"category": "Technical Guide",
"format": "HTML"
}
]
# ==================== Helper Functions ====================
def ensure_directories():
    """Make sure every directory the RAG pipeline writes to exists.

    Idempotent: directories that already exist are left untouched, and
    missing parent directories are created as needed.
    """
    required = (DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR)
    for path in required:
        path.mkdir(parents=True, exist_ok=True)
def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int:
    """Return the context-window length (in tokens) for *model_name*.

    Unknown models fall back to a conservative 4096 tokens.

    Fix: the configured default model "gpt-5-chat-latest" was missing from
    the table, so the system's own default silently got the 4096-token
    fallback instead of the 128k window the gpt-5 family is mapped to.
    """
    context_lengths = {
        "gpt-5": 128000,
        "gpt-5-chat-latest": 128000,  # default OPENAI_CHAT_MODEL; gpt-5 family
        "gpt-4o-mini": 8192,
        "gpt-4o": 128000,
    }
    return context_lengths.get(model_name, 4096)
def validate_api_key():
    """Ensure an OpenAI API key is configured.

    Returns:
        True when OPENAI_API_KEY holds a non-empty value.

    Raises:
        ValueError: when the key is missing or empty.
    """
    if OPENAI_API_KEY:
        return True
    raise ValueError(
        "OpenAI API key not found. Please set OPENAI_API_KEY in .env file."
    )
# ==================== System Info ====================
def print_config():
    """Print the active configuration values to stdout for debugging."""
    divider = "=" * 50
    settings = [
        f"OpenAI Model: {OPENAI_CHAT_MODEL}",
        f"Embedding Model: {OPENAI_EMBEDDING_MODEL}",
        f"Device: {DEVICE}",
        f"Default Temperature: {DEFAULT_TEMPERATURE}",
        f"Default Top-K: {DEFAULT_TOP_K}",
        f"Chunk Size: {CHUNK_SIZE}",
        f"Project Root: {PROJECT_ROOT}",
    ]
    print(divider)
    print("RAG System Configuration")
    print(divider)
    for line in settings:
        print(line)
    print(divider)
# Ensure directories exist on import (module-level side effect: creates the
# data/embeddings/graph/metadata/images directories if missing)
ensure_directories()