| """ | |
| Vision-based query module using GPT-5 Vision. | |
| Supports multimodal queries combining text and images. | |
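
Example usage (the image path is illustrative):
    answer, citations = query(
        "What safety hazards are visible?",
        image_path="photos/worksite.jpg",
    )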
| """ | |
import base64
import json
import logging
import os
import sqlite3
from typing import List, Tuple, Optional, Dict, Any

from openai import OpenAI

from config import *
from utils import ImageProcessor, classify_image

logger = logging.getLogger(__name__)


class VisionRetriever:
    """Vision-based retrieval using GPT-5 Vision for image analysis and classification."""

    def __init__(self):
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        self.image_processor = ImageProcessor()

    def get_similar_images(self, query_image_path: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Find similar images in the database based on classification similarity."""
        try:
            # Classification-based similarity: the query image is labeled via
            # GPT-5 Vision (classify_image) and matched against stored labels,
            # rather than compared by embeddings.
            query_classification = classify_image(query_image_path)

            # Query the database for images with a similar classification label.
            conn = sqlite3.connect(IMAGES_DB)
            cursor = conn.cursor()
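            # Assumed schema for the images table, inferred from this query:
            #   images(image_id, image_path, classification, metadata, created_at)
            # where metadata stores a JSON-encoded dict.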
            cursor.execute("""
                SELECT image_id, image_path, classification, metadata
                FROM images
                WHERE classification LIKE ?
                ORDER BY created_at DESC
                LIMIT ?
            """, (f"%{query_classification}%", top_k))
            results = cursor.fetchall()
            conn.close()
            similar_images = []
            for row in results:
                image_id, image_path, classification, metadata_json = row
                metadata = json.loads(metadata_json) if metadata_json else {}
                similar_images.append({
                    'image_id': image_id,
                    'image_path': image_path,
                    'classification': classification,
                    'metadata': metadata,
                    'similarity_score': 0.8  # Fixed placeholder: a label match, not a computed distance
                })

            logger.info(f"Found {len(similar_images)} similar images for query")
            return similar_images
        except Exception as e:
            logger.error(f"Error finding similar images: {e}")
            return []

    def analyze_image_safety(self, image_path: str, question: Optional[str] = None) -> str:
        """Analyze an image for safety concerns using GPT-5 Vision."""
        try:
            # Encode the image as base64 for inline transmission.
            with open(image_path, "rb") as image_file:
                image_b64 = base64.b64encode(image_file.read()).decode()
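            # Note: the data URL used below hardcodes an image/jpeg MIME type,
            # so non-JPEG inputs are sent with a mismatched label.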

            # Build the analysis prompt.
            if question:
                analysis_prompt = (
                    f"Analyze this image in the context of the following question: {question}\n\n"
                    "Please provide a detailed safety analysis covering:\n"
                    "1. What equipment, machinery, or workplace elements are visible\n"
                    "2. Any potential safety hazards or compliance issues\n"
                    "3. Relevant OSHA standards or regulations that may apply\n"
                    "4. Recommendations for safety improvements\n"
                    "5. How this relates to the specific question asked"
                )
            else:
                analysis_prompt = (
                    "Analyze this image for occupational safety and health concerns. Provide:\n"
                    "1. Description of what's shown in the image\n"
                    "2. Identification of potential safety hazards\n"
                    "3. Relevant OSHA standards or safety regulations\n"
                    "4. Recommendations for improving safety"
                )
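
            # Chat Completions multimodal message format: the image travels
            # inline as a base64 data URL; "detail": "high" requests
            # full-resolution analysis.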
            messages = [{
                "role": "user",
                "content": [
                    {"type": "text", "text": analysis_prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}", "detail": "high"}}
                ]
            }]

            # For GPT-5 vision, temperature must stay at its default (1.0) and
            # reasoning parameters are not supported.
            response = self.client.chat.completions.create(
                model=OPENAI_CHAT_MODEL,
                messages=messages,
                max_completion_tokens=DEFAULT_MAX_TOKENS
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            logger.error(f"Error analyzing image: {e}")
            return f"I encountered an error while analyzing the image: {e}"

    def retrieve_relevant_text(self, image_classification: str, question: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Retrieve text documents relevant to the image classification and question."""
        # Delegates to the vanilla text retriever with a query enriched by the
        # image classification.
        try:
            from query_vanilla import query as vanilla_query

            # Combine the image classification with the original question.
            enhanced_question = f"safety requirements for {image_classification} {question}"
            _, text_citations = vanilla_query(enhanced_question, top_k=top_k)
            return text_citations
        except Exception as e:
            logger.error(f"Error retrieving relevant text: {e}")
            return []

    def generate_multimodal_answer(self, question: str, image_analysis: str,
                                   text_citations: List[Dict], similar_images: List[Dict]) -> str:
        """Generate an answer combining image analysis and text retrieval."""
        try:
            # Build context from text citations.
            text_context = ""
            if text_citations:
                text_parts = []
                for i, citation in enumerate(text_citations, 1):
                    if 'text' in citation:
                        text_parts.append(f"[Text Source {i}] {citation['source']}: {citation['text'][:500]}...")
                    else:
                        text_parts.append(f"[Text Source {i}] {citation['source']}")
                text_context = "\n\n".join(text_parts)

            # Build context from similar images.
            image_context = ""
            if similar_images:
                image_parts = []
                for img in similar_images[:3]:  # Limit to the top 3
                    source = img['metadata'].get('source', 'Unknown')
                    classification = img.get('classification', 'unknown')
                    image_parts.append(f"Similar image from {source}: classified as {classification}")
                image_context = "\n".join(image_parts)

            # Compose the prompt.
            system_message = (
                "You are an expert in occupational safety and health. "
                "You have been provided with an image analysis, relevant text documents, "
                "and information about similar images in the database. "
                "Provide a comprehensive answer that integrates all this information."
            )
| user_message = f"""Question: {question} | |
| Image Analysis: | |
| {image_analysis} | |
| Relevant Text Documentation: | |
| {text_context} | |
| Similar Images Context: | |
| {image_context} | |
| Please provide a comprehensive answer that: | |
| 1. Addresses the specific question asked | |
| 2. Incorporates insights from the image analysis | |
| 3. References relevant regulatory information from the text sources | |
| 4. Notes any connections to similar cases or images | |
| 5. Provides actionable recommendations based on safety standards""" | |

            # For GPT-5, temperature must stay at its default (1.0) and
            # reasoning parameters are not supported.
            response = self.client.chat.completions.create(
                model=OPENAI_CHAT_MODEL,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                max_completion_tokens=DEFAULT_MAX_TOKENS * 2  # Allow a longer, comprehensive response
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            logger.error(f"Error generating multimodal answer: {e}")
            return "I apologize, but I encountered an error while generating the comprehensive answer."


# Global retriever instance
_retriever: Optional[VisionRetriever] = None


def get_retriever() -> VisionRetriever:
    """Get or create the global vision retriever instance."""
    global _retriever
    if _retriever is None:
        _retriever = VisionRetriever()
    return _retriever


def query(question: str, image_path: Optional[str] = None, top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict]]:
    """
    Main vision-based query function with a unified signature.

    Args:
        question: User question
        image_path: Path to the image file (required for vision queries)
        top_k: Number of relevant results to retrieve

    Returns:
        Tuple of (answer, citations)
    """
    if not image_path:
        return "Vision queries require an image. Please provide an image file.", []

    try:
        retriever = get_retriever()

        # Step 1: Analyze the provided image
        logger.info(f"Analyzing image: {image_path}")
        image_analysis = retriever.analyze_image_safety(image_path, question)

        # Step 2: Classify the image
        image_classification = classify_image(image_path)

        # Step 3: Find similar images
        similar_images = retriever.get_similar_images(image_path, top_k=3)

        # Step 4: Retrieve relevant text documents
        text_citations = retriever.retrieve_relevant_text(image_classification, question, top_k)

        # Step 5: Generate the comprehensive multimodal answer
        answer = retriever.generate_multimodal_answer(
            question, image_analysis, text_citations, similar_images
        )

        # Step 6: Prepare citations
        citations = []
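        # Every citation dict carries 'rank', 'source', and 'method'; the
        # remaining keys vary by citation type.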

        # Add the image analysis as the primary citation
        citations.append({
            'rank': 1,
            'type': 'image_analysis',
            'source': f"Analysis of {os.path.basename(image_path)}",
            'method': 'vision',
            'classification': image_classification,
            'score': 1.0
        })

        # Add text citations
        for i, citation in enumerate(text_citations, 2):
            citation_copy = citation.copy()
            citation_copy['rank'] = i
            citation_copy['method'] = 'vision_text'
            citations.append(citation_copy)

        # Add similar images
        for img in similar_images:
            citations.append({
                'rank': len(citations) + 1,
                'type': 'similar_image',
                'source': img['metadata'].get('source', 'Image Database'),
                'method': 'vision',
                'classification': img.get('classification', 'unknown'),
                'similarity_score': img.get('similarity_score', 0.0),
                'image_id': img.get('image_id')
            })

        logger.info(f"Vision query completed. Generated {len(citations)} citations.")
        return answer, citations
    except Exception as e:
        logger.error(f"Error in vision query: {e}")
        return "I apologize, but I encountered an error while processing your vision-based question.", []


def query_image_only(image_path: str, question: Optional[str] = None) -> Tuple[str, List[Dict]]:
    """
    Analyze an image without text retrieval (faster for simple image analysis).

    Args:
        image_path: Path to the image file
        question: Optional specific question about the image

    Returns:
        Tuple of (analysis, citations)
    """
    try:
        retriever = get_retriever()

        # Analyze and classify the image
        analysis = retriever.analyze_image_safety(image_path, question)
        classification = classify_image(image_path)

        # Create a citation for the image analysis
        citations = [{
            'rank': 1,
            'type': 'image_analysis',
            'source': f"Analysis of {os.path.basename(image_path)}",
            'method': 'vision_only',
            'classification': classification,
            'score': 1.0
        }]
        return analysis, citations
    except Exception as e:
        logger.error(f"Error in image-only analysis: {e}")
        return "Error analyzing image.", []


def query_with_details(question: str, image_path: Optional[str] = None,
                       top_k: int = DEFAULT_TOP_K) -> Tuple[str, List[Dict], List[Tuple]]:
    """
    Vision query that also returns detailed chunk information (for compatibility).

    Returns:
        Tuple of (answer, citations, chunks)
    """
    answer, citations = query(question, image_path, top_k)

    # Convert citations to the chunk format for backward compatibility.
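    # Each chunk is a (title, score, text, source) tuple, assumed to mirror the
    # shape returned by the text-only query modules.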
    chunks = []
    for citation in citations:
        if citation.get('type') == 'image_analysis':
            chunks.append((
                f"Image Analysis ({citation['classification']})",
                citation['score'],
                "Analysis of uploaded image for safety compliance",
                citation['source']
            ))
        elif citation.get('type') == 'similar_image':
            chunks.append((
                f"Similar Image (Score: {citation.get('similarity_score', 0):.3f})",
                citation.get('similarity_score', 0),
                f"Similar image classified as {citation['classification']}",
                citation['source']
            ))
        else:
            # Text citations from the vanilla retriever may lack a 'type' key.
            chunks.append((
                f"Text Reference {citation['rank']}",
                citation.get('score', 0.5),
                citation.get('text', 'Referenced document'),
                citation['source']
            ))
    return answer, citations, chunks


if __name__ == "__main__":
    # Test the vision system (requires an actual image file)
    import sys

    if len(sys.argv) > 1:
        test_image_path = sys.argv[1]
        test_question = "What safety issues can you identify in this image?"

        print("Testing vision retrieval system...")
        print(f"Image: {test_image_path}")
        print(f"Question: {test_question}")
        print("-" * 50)

        try:
            answer, citations = query(test_question, test_image_path)
            print("Answer:")
            print(answer)
            print(f"\nCitations ({len(citations)}):")
            for citation in citations:
                print(f"- {citation['source']} (Type: {citation.get('type', 'unknown')})")
        except Exception as e:
            print(f"Error during testing: {e}")
    else:
        print("To test the vision system, provide an image path as an argument:")
        print("python query_vision.py /path/to/image.jpg")