| """ | |
| Vanilla vector search using FAISS index and OpenAI embeddings. | |
| """ | |
| import numpy as np | |
| import faiss | |
| from typing import Tuple, List, Optional | |
| from openai import OpenAI | |
| import pickle | |
| import logging | |
| from config import * | |
| from utils import EmbeddingGenerator, classify_image | |
| logger = logging.getLogger(__name__) | |
| # Initialize OpenAI client | |
| client = OpenAI(api_key=OPENAI_API_KEY) | |
| # Global variables for lazy loading | |
| _index = None | |
| _texts = None | |
| _metadata = None | |
def _load_vanilla_index():
    """Lazy load vanilla FAISS index and metadata."""
    global _index, _texts, _metadata
    if _index is None:
        try:
            if VANILLA_FAISS_INDEX.exists() and VANILLA_METADATA.exists():
                logger.info("Loading vanilla FAISS index...")
                # Load FAISS index
                _index = faiss.read_index(str(VANILLA_FAISS_INDEX))
                # Load metadata
                with open(VANILLA_METADATA, 'rb') as f:
                    data = pickle.load(f)
                if isinstance(data, list):
                    # New format with metadata list
                    _texts = [item['text'] for item in data]
                    _metadata = [item['metadata'] for item in data]
                else:
                    # Old format with dict
                    _texts = data.get('texts', [])
                    _metadata = data.get('metadata', [])
                logger.info(f"✓ Loaded vanilla index with {len(_texts)} documents")
            else:
                logger.warning("Vanilla index not found. Run preprocess.py first.")
                _index = None
                _texts = []
                _metadata = []
        except Exception as e:
            logger.error(f"Error loading vanilla index: {e}")
            _index = None
            _texts = []
            _metadata = []

def query(question: str, image_path: Optional[str] = None, top_k: Optional[int] = None) -> Tuple[str, List[dict]]:
    """
    Query using vanilla vector search.

    Args:
        question: User's question
        image_path: Optional path to an image (for multimodal queries)
        top_k: Number of relevant chunks to retrieve (defaults to DEFAULT_TOP_K)

    Returns:
        Tuple of (answer, citations)
    """
    if top_k is None:
        top_k = DEFAULT_TOP_K
    # Load index if not already loaded
    _load_vanilla_index()
    if _index is None or len(_texts) == 0:
        return "Index not loaded. Please run preprocess.py first.", []

    # Generate query embedding using embedding generator
    embedding_gen = EmbeddingGenerator()
    query_embedding = embedding_gen.embed_text_openai([question])

    # Normalize for cosine similarity
    query_embedding = query_embedding.astype(np.float32)
    faiss.normalize_L2(query_embedding)

    # Search the index
    distances, indices = _index.search(query_embedding, top_k)

    # Collect retrieved chunks and citations
    retrieved_chunks = []
    citations = []
    sources_seen = set()
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS pads missing results with index -1, so require a valid index
        # as well as a score above the relevance threshold.
        if 0 <= idx < len(_texts) and distance > MIN_RELEVANCE_SCORE:
            chunk_text = _texts[idx]
            chunk_meta = _metadata[idx]
            retrieved_chunks.append({
                'text': chunk_text,
                'score': float(distance),
                'metadata': chunk_meta
            })
            # Build citation (one per source)
            if chunk_meta['source'] not in sources_seen:
                citation = {
                    'source': chunk_meta['source'],
                    'type': chunk_meta['type'],
                    'relevance_score': round(float(distance), 3)
                }
                if chunk_meta['type'] == 'pdf':
                    citation['path'] = chunk_meta['path']
                else:  # HTML
                    citation['url'] = chunk_meta.get('url', '')
                citations.append(citation)
                sources_seen.add(chunk_meta['source'])
    # Handle image if provided
    image_context = ""
    if image_path:
        try:
            classification = classify_image(image_path)
            image_context = f"\n\n[Image Context: The provided image appears to be a {classification}.]"
        except Exception as e:
            logger.error(f"Error processing image: {e}")

    # Build context for the prompt
    context = "\n\n---\n\n".join([chunk['text'] for chunk in retrieved_chunks])
    if not context:
        return "No relevant documents found for your query.", []

    # Generate answer using OpenAI
    prompt = f"""Use the following context to answer the question:

{context}{image_context}

Question: {question}

Please provide a comprehensive answer based on the context provided. If the context doesn't contain enough information, say so."""

    # For GPT-5, temperature must be default (1.0)
    response = client.chat.completions.create(
        model=OPENAI_CHAT_MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety. Always cite your sources when providing information."},
            {"role": "user", "content": prompt}
        ],
        max_completion_tokens=DEFAULT_MAX_TOKENS
    )
    answer = response.choices[0].message.content
    return answer, citations

def query_with_feedback(question: str, feedback_scores: Optional[List[float]] = None, top_k: int = 5) -> Tuple[str, List[dict]]:
    """
    Query with relevance feedback to refine results.

    Args:
        question: User's question
        feedback_scores: Optional relevance scores for previous results
        top_k: Number of relevant chunks to retrieve

    Returns:
        Tuple of (answer, citations)
    """
    # For now, this just delegates to the regular query and ignores feedback_scores.
    # TODO: Implement Rocchio algorithm or similar for relevance feedback
    # (see the sketch below).
    return query(question, top_k=top_k)

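# --- Illustrative sketch (not part of the original module) ---
# The TODO above names the Rocchio algorithm. A minimal sketch, assuming the
# caller can supply embeddings of previously retrieved chunks split into
# relevant and non-relevant sets, might look like the helper below. The name
# `_rocchio_refine` and the default weights are assumptions for illustration
# only; they are not defined anywhere else in this codebase.
def _rocchio_refine(query_embedding: np.ndarray,
                    relevant: List[np.ndarray],
                    non_relevant: List[np.ndarray],
                    alpha: float = 1.0,
                    beta: float = 0.75,
                    gamma: float = 0.15) -> np.ndarray:
    """Return a Rocchio-refined query vector: alpha*q + beta*mean(rel) - gamma*mean(non_rel)."""
    refined = alpha * np.asarray(query_embedding, dtype=np.float32)
    if relevant:
        refined = refined + beta * np.mean(relevant, axis=0)
    if non_relevant:
        refined = refined - gamma * np.mean(non_relevant, axis=0)
    # Re-normalize so the refined vector stays compatible with the
    # cosine-similarity (normalized inner product) search used above.
    norm = np.linalg.norm(refined)
    if norm > 0:
        refined = refined / norm
    return refined.astype(np.float32)
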
if __name__ == "__main__":
    # Test the vanilla query
    test_questions = [
        "What are general machine guarding requirements?",
        "How do I perform lockout/tagout procedures?",
        "What safety measures are needed for robotic systems?"
    ]
    for q in test_questions:
        print(f"\nQuestion: {q}")
        answer, citations = query(q)
        print(f"Answer: {answer[:200]}...")
        print(f"Citations: {[c['source'] for c in citations]}")
        print("-" * 50)