Update app.py

app.py CHANGED
@@ -38,18 +38,14 @@ except ImportError: PIL_TESSERACT_AVAILABLE = False; print("WARNING: Pillow or P
 try: import whisper; WHISPER_AVAILABLE = True
 except ImportError: WHISPER_AVAILABLE = False; print("WARNING: OpenAI Whisper not found, Audio Transcription tool will be disabled.")
 
-# --- google-genai SDK ---
-from google import genai as google_genai_sdk
-from google.
-# For FileState enum later
-from google.ai import generativelanguage as glm
-# --- End google-genai SDK ---
-
+# Google GenAI (Used by LangChain integration AND direct client)
+from google.genai.types import HarmCategory, HarmBlockThreshold # CORRECTED IMPORT
+from google.ai import generativelanguage as glm # For FileState enum
 
 # LangChain
 from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
 from langchain.prompts import PromptTemplate
-from langchain.tools import BaseTool, tool as lc_tool_decorator
+from langchain.tools import BaseTool, tool as lc_tool_decorator # Use langchain.tools.tool
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain.agents import AgentExecutor, create_react_agent
 from langchain_community.tools import DuckDuckGoSearchRun
@@ -75,7 +71,9 @@ TOOLS: List[BaseTool] = []
 LLM_INSTANCE: Optional[ChatGoogleGenerativeAI] = None
 LANGGRAPH_MEMORY_SAVER: Optional[Any] = None
 
-
+# google-genai Client SDK
+from google import genai as google_genai_sdk
+google_genai_client: Optional[google_genai_sdk.Client] = None
 
 try:
     from langgraph.graph import StateGraph, END
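The added block above declares the google-genai client as a module-level `Optional` and constructs it only once the API key is known (the initialization hunk appears further down). A minimal sketch of that deferred-construction pattern, assuming `google-genai` is installed and `GOOGLE_API_KEY` is exported; the `get_client` helper is illustrative, not part of the commit:

```python
import os
from typing import Optional

from google import genai as google_genai_sdk

# Module-level handle; stays None until credentials are available.
google_genai_client: Optional[google_genai_sdk.Client] = None

def get_client() -> google_genai_sdk.Client:
    """Illustrative helper: build the client on first use, so merely
    importing the module never requires credentials."""
    global google_genai_client
    if google_genai_client is None:
        api_key = os.environ["GOOGLE_API_KEY"]  # fail fast if unset
        google_genai_client = google_genai_sdk.Client(api_key=api_key)
    return google_genai_client
```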
@@ -105,7 +103,7 @@ try:
         LG_StateGraph, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = (None,) * 5
         print(f"WARNING: No suitable LangGraph tool executor (ToolNode/ToolExecutor) found. LangGraph agent will be disabled.")
 
-except ImportError as e:
+except ImportError as e:
     LANGGRAPH_FLAVOR_AVAILABLE = False
     LG_StateGraph, LG_ToolExecutor_Class, LG_END, LG_ToolInvocation, add_messages, MemorySaver_Class = (None,) * 6
     print(f"WARNING: Core LangGraph components (StateGraph, END) not found or import error: {e}. LangGraph agent will be disabled.")
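This hunk only touches the `except ImportError` arm, but the surrounding block is the version-tolerance logic: newer LangGraph releases ship `ToolNode`, older ones `ToolExecutor`, and the agent is disabled if neither import succeeds. A rough sketch of that fallback pattern; the exact probe order in app.py is not fully visible in this diff, so treat the ordering here as an assumption:

```python
# Probe for a usable LangGraph tool-executor class, newest API first.
LG_ToolExecutor_Class = None
try:
    from langgraph.graph import StateGraph, END  # core API; must exist
    try:
        from langgraph.prebuilt import ToolNode as LG_ToolExecutor_Class
    except ImportError:
        try:
            from langgraph.prebuilt import ToolExecutor as LG_ToolExecutor_Class
        except ImportError:
            LG_ToolExecutor_Class = None  # no executor found: agent disabled
    LANGGRAPH_FLAVOR_AVAILABLE = LG_ToolExecutor_Class is not None
except ImportError as e:
    LANGGRAPH_FLAVOR_AVAILABLE = False
    print(f"WARNING: Core LangGraph components not found: {e}")
```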
@@ -134,7 +132,7 @@ logger = logging.getLogger(__name__)
 # --- Initialize google-genai Client SDK ---
 if GOOGLE_API_KEY:
     try:
-        google_genai_client = google_genai_sdk.Client(api_key=GOOGLE_API_KEY)
+        google_genai_client = google_genai_sdk.Client(api_key=GOOGLE_API_KEY)
         logger.info("google-genai SDK Client initialized successfully.")
     except Exception as e:
         logger.error(f"Failed to initialize google-genai SDK Client: {e}")
@@ -263,10 +261,10 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
         logger.error(f"Download error for {file_url_to_try}: {e}", exc_info=True); return f"Error: {str(e)[:100]}"
 
 # --- Tool Function Definitions ---
-
-@lc_tool_decorator
+# Corrected: Removed 'description' from @lc_tool_decorator, use docstring
+@lc_tool_decorator
 def read_pdf_tool(action_input_json_str: str) -> str:
-
+    """Reads text content from a PDF file. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns extracted text."""
     if not PYPDF2_AVAILABLE: return "Error: PyPDF2 not installed."
     try: data = json.loads(action_input_json_str); file_id, task_id = data.get("file_identifier"), data.get("task_id")
     except Exception as e: return f"Error parsing JSON for read_pdf_tool: {e}. Input: {action_input_json_str}"
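The change here is the commit's main refactor of the tool definitions: the `description=` keyword is dropped and the description moves into the function docstring, which LangChain's `tool` decorator picks up automatically. A toy example of that behavior; `echo_tool` is hypothetical:

```python
from langchain.tools import tool as lc_tool_decorator

@lc_tool_decorator
def echo_tool(payload: str) -> str:
    """Echoes the payload back unchanged. Input: any string."""
    return payload

# The decorator derives both fields from the function itself.
print(echo_tool.name)         # -> echo_tool
print(echo_tool.description)  # -> the docstring above
```

Note that a bare `@tool` raises at import time if the function has no docstring, which is why every rewritten tool in this commit gains one.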
@@ -286,10 +284,9 @@ def read_pdf_tool(action_input_json_str: str) -> str:
         return text_content[:40000]
     except Exception as e: return f"Error reading PDF '{path}': {e}"
 
-
-@lc_tool_decorator(description=OCR_IMAGE_TOOL_DESC)
+@lc_tool_decorator
 def ocr_image_tool(action_input_json_str: str) -> str:
-
+    """Extracts text from an image using OCR. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns extracted text."""
     if not PIL_TESSERACT_AVAILABLE: return "Error: Pillow/Pytesseract not installed."
     try: data = json.loads(action_input_json_str); file_id, task_id = data.get("file_identifier"), data.get("task_id")
     except Exception as e: return f"Error parsing JSON for ocr_image_tool: {e}. Input: {action_input_json_str}"
@@ -299,10 +296,9 @@ def ocr_image_tool(action_input_json_str: str) -> str:
     try: return pytesseract.image_to_string(Image.open(path))[:40000]
     except Exception as e: return f"Error OCR'ing '{path}': {e}"
 
-
-@lc_tool_decorator(description=TRANSCRIBE_AUDIO_TOOL_DESC)
+@lc_tool_decorator
 def transcribe_audio_tool(action_input_json_str: str) -> str:
-
+    """Transcribes speech from an audio file (or YouTube URL) to text. Input: JSON '{\"file_identifier\": \"FILENAME_OR_URL_OR_YOUTUBE_URL\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\"}'. Returns transcript."""
     global WHISPER_MODEL
     if not WHISPER_AVAILABLE: return "Error: Whisper not installed."
     try: data = json.loads(action_input_json_str); file_id, task_id = data.get("file_identifier"), data.get("task_id")
@@ -316,48 +312,37 @@ def transcribe_audio_tool(action_input_json_str: str) -> str:
     try: result = WHISPER_MODEL.transcribe(path, fp16=False); return result["text"][:40000] # type: ignore
     except Exception as e: logger.error(f"Whisper error on '{path}': {e}", exc_info=True); return f"Error transcribing '{path}': {e}"
 
-DIRECT_MULTIMODAL_GEMINI_TOOL_DESC = (
-    "Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-2.0-flash-exp) "
-    "for tasks like image description, Q&A about the image, or text generation based on the image. "
-    "Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction.\", \"task_id\": \"TASK_ID\" (optional)}'. "
-    "Returns the model's text response."
-)
-@lc_tool_decorator(description=DIRECT_MULTIMODAL_GEMINI_TOOL_DESC)
+@lc_tool_decorator
 def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
-
+    """Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-2.0-flash-exp) for tasks like image description, Q&A about the image, or text generation based on the image. Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. Returns the model's text response."""
     global google_genai_client
     if not google_genai_client: return "Error: google-genai SDK client not initialized."
-    if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available."
+    if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing."
     try:
         data = json.loads(action_input_json_str)
         file_identifier = data.get("file_identifier")
         text_prompt = data.get("text_prompt", "Describe this image.")
         task_id = data.get("task_id")
         if not file_identifier: return "Error: 'file_identifier' for image missing."
-        logger.info(f"Direct Multimodal Tool:
+        logger.info(f"Direct Multimodal Tool: Processing image '{file_identifier}' with prompt '{text_prompt}'")
         local_image_path = _download_file(file_identifier, task_id)
-        if local_image_path.startswith("Error:"): return f"Error downloading for Direct
+        if local_image_path.startswith("Error:"): return f"Error downloading image for Direct Multimodal Tool: {local_image_path}"
         try:
             pil_image = Image.open(local_image_path)
-        except Exception as e_img_open: return f"Error opening image {local_image_path}: {str(e_img_open)}"
-
-        # For the client SDK, model names often don't need "models/" prefix if it's a tuned model or specific ID.
-        # If it's a base model, "models/" is usually required. Let's assume GEMINI_FLASH_MULTIMODAL_MODEL_NAME is a direct ID.
-        # However, to be safe with client.models.generate_content, using "models/" is more standard.
+        except Exception as e_img_open: return f"Error opening image file {local_image_path}: {str(e_img_open)}"
+
         model_id_for_client = f"models/{GEMINI_FLASH_MULTIMODAL_MODEL_NAME}" if not GEMINI_FLASH_MULTIMODAL_MODEL_NAME.startswith("models/") else GEMINI_FLASH_MULTIMODAL_MODEL_NAME
-
         response = google_genai_client.models.generate_content(
-            model=model_id_for_client,
-            contents=[pil_image, text_prompt]
+            model=model_id_for_client, contents=[pil_image, text_prompt]
         )
         logger.info(f"Direct Multimodal Tool: Response received from {model_id_for_client}.")
         return response.text[:40000]
-    except json.JSONDecodeError as e_json_mm: return f"Error parsing JSON for Direct
+    except json.JSONDecodeError as e_json_mm: return f"Error parsing JSON input for Direct Multimodal Tool: {str(e_json_mm)}. Input: {action_input_json_str}"
     except Exception as e_tool_mm:
         logger.error(f"Error in direct_multimodal_gemini_tool: {e_tool_mm}", exc_info=True)
         return f"Error executing Direct Multimodal Tool: {str(e_tool_mm)}"
 
-# --- Agent Prompts
+# --- Agent Prompts ---
 LANGGRAPH_PROMPT_TEMPLATE_STR = """You are a highly intelligent agent for the GAIA benchmark.
 Your goal is to provide an EXACT MATCH final answer. No conversational text, explanations, or markdown unless explicitly part of the answer.
 TOOLS:
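The rewritten tool body feeds a PIL image plus a text prompt straight into `client.models.generate_content`, after normalizing the `models/` prefix onto the model id. A self-contained sketch of that call path, assuming the google-genai client accepts a `PIL.Image` in `contents`; the API key and image path are placeholders:

```python
from google import genai
from PIL import Image

client = genai.Client(api_key="YOUR_API_KEY")  # placeholder credential
pil_image = Image.open("example.png")          # placeholder local image

response = client.models.generate_content(
    model="models/gemini-2.0-flash-exp",       # model named in the tool docstring
    contents=[pil_image, "Describe this image."],
)
print(response.text[:500])  # the tool itself truncates to 40000 chars
```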
@@ -396,7 +381,8 @@ def initialize_agent_and_tools(force_reinit=False):
     logger.info("Initializing agent and tools...")
     if not GOOGLE_API_KEY: raise ValueError("GOOGLE_API_KEY not set for LangChain LLM.")
 
-    #
+    # Corrected safety_settings format for ChatGoogleGenerativeAI
+    # Using INTEGER VALUES for HarmCategory keys and HarmBlockThreshold enum members for values.
     llm_safety_settings_corrected_final = {
         HarmCategory.HARM_CATEGORY_HARASSMENT.value: HarmBlockThreshold.BLOCK_NONE.value,
         HarmCategory.HARM_CATEGORY_HATE_SPEECH.value: HarmBlockThreshold.BLOCK_NONE.value,
@@ -409,7 +395,7 @@
         model=GEMINI_MODEL_NAME,
         google_api_key=GOOGLE_API_KEY,
         temperature=0.0,
-        safety_settings=llm_safety_settings_corrected_final,
+        safety_settings=llm_safety_settings_corrected_final,
         timeout=120,
         convert_system_message_to_human=True
     )
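Taken together, the two safety-settings hunks build the mapping from the enums' `.value` payloads and hand it to `ChatGoogleGenerativeAI`. A sketch mirroring the diff; the accepted key/value format (enum members vs. raw values) has shifted across `langchain-google-genai` releases, so this mirrors the commit rather than prescribing the one correct form:

```python
from google.genai.types import HarmCategory, HarmBlockThreshold
from langchain_google_genai import ChatGoogleGenerativeAI

# Same shape as llm_safety_settings_corrected_final in the diff.
safety_settings = {
    HarmCategory.HARM_CATEGORY_HARASSMENT.value: HarmBlockThreshold.BLOCK_NONE.value,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH.value: HarmBlockThreshold.BLOCK_NONE.value,
}

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",         # placeholder; app.py uses GEMINI_MODEL_NAME
    google_api_key="YOUR_API_KEY",  # placeholder credential
    temperature=0.0,
    safety_settings=safety_settings,
    timeout=120,
    convert_system_message_to_human=True,
)
```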
@@ -640,7 +626,7 @@ with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !
     demo.load(update_ui_on_load_fn_within_context, [], [agent_status_display, missing_secrets_display])
 
 if __name__ == "__main__":
-    logger.info(f"Application starting up (v7 -
+    logger.info(f"Application starting up (v7 - Final SafetySettings Fix)...")
     if not PYPDF2_AVAILABLE: logger.warning("PyPDF2 (PDF tool) NOT AVAILABLE.")
     if not PIL_TESSERACT_AVAILABLE: logger.warning("Pillow/Pytesseract (OCR tool) NOT AVAILABLE.")
     if not WHISPER_AVAILABLE: logger.warning("Whisper (Audio tool) NOT AVAILABLE.")
@@ -668,4 +654,4 @@
 
     logger.info(f"Space ID: {os.getenv('SPACE_ID', 'Not Set')}")
     logger.info("Gradio Interface launching...")
-    demo.queue().launch(debug=os.getenv("GRADIO_DEBUG","false").lower()=="true", share=False, max_threads=20)
+    demo.queue().launch(debug=os.getenv("GRADIO_DEBUG","false").lower()=="true", share=False, max_threads=20)