jesusvilela committed on
Commit 12a98dc · verified · 1 Parent(s): 170baad

Update app.py

Files changed (1)
  1. app.py +21 -26
app.py CHANGED
@@ -61,14 +61,14 @@ if TYPE_CHECKING:
  LANGGRAPH_FLAVOR_AVAILABLE = False
  LG_StateGraph: Optional[Type[Any]] = None
  LG_ToolExecutor_Class: Optional[Type[Any]] = None
- LG_END: Optional[Any] = None
  LG_ToolInvocation: Optional[Type[Any]] = None
  add_messages: Optional[Any] = None
  MemorySaver_Class: Optional[Type[Any]] = None

  AGENT_INSTANCE: Optional[Union[AgentExecutor, Any]] = None
  TOOLS: List[BaseTool] = []
- LLM_INSTANCE: Optional[ChatGoogleGenerativeAI] = None
  LANGGRAPH_MEMORY_SAVER: Optional[Any] = None

  # google-genai Client SDK
@@ -126,8 +126,8 @@ except ImportError as e:

  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
- GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-05-06"
- GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-2.0-flash-exp"
  SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
  MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
  LOCAL_FILE_STORE_PATH = "./Data"
@@ -255,7 +255,8 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
  name_without_ext, current_ext = os.path.splitext(effective_save_path)
  if not current_ext:
  content_type_header = r.headers.get('content-type', '')
- content_type_val = content_type_header.split(';').strip() if content_type_header else ''
  if content_type_val:
  guessed_ext = mimetypes.guess_extension(content_type_val)
  if guessed_ext: effective_save_path += guessed_ext; logger.info(f"Added guessed ext: {guessed_ext}")
@@ -324,7 +325,7 @@ def transcribe_audio_tool(action_input_json_str: str) -> str:

  @lc_tool_decorator
  def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
- """Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-2.0-flash-exp) for tasks like image description, Q&A about the image, or text generation based on the image. Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. Returns the model's text response."""
  global google_genai_client
  if not google_genai_client: return "Error: google-genai SDK client not initialized."
  if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing." # Relies on PIL_TESSERACT_AVAILABLE for PIL
@@ -360,9 +361,9 @@ You have access to the following tools. Use them if necessary.
  {tools}
  TOOL USAGE:
  - To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
- - For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): `args` must contain 'file_identifier' (filename/URL) and 'task_id' (if GAIA file). For 'direct_multimodal_gemini_tool', also include 'text_prompt'.
  - 'web_search': `args` is like '{{"query": "search query"}}'.
- - 'python_repl': `args` is like '{{"command": "python code string"}}'. Use print() for output.
  RESPONSE FORMAT:
  Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
  Begin!
@@ -376,8 +377,7 @@ Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -
  Tool Inputs:
  - web_search: Your search query string.
  - python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
- - read_pdf_tool, ocr_image_tool, transcribe_audio_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
- - direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "IMAGE_FILENAME_OR_URL", "text_prompt": "Your prompt for the image.", "task_id": "TASK_ID_IF_GAIA_FILENAME"}}'.
  If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
  Begin!
  {input}
@@ -422,28 +422,27 @@ def initialize_agent_and_tools(force_reinit=False):
  try:
  logger.info(f"Attempting LangGraph init (Tool Executor type: {LG_ToolExecutor_Class.__name__ if LG_ToolExecutor_Class else 'None'})")
  _TypedDict = getattr(__import__('typing_extensions'), 'TypedDict', dict)
- class AgentState(_TypedDict): input: str; messages: Annotated[List[Any], add_messages]

  # System prompt template - this describes the agent's role and tools.
  # The {input} placeholder for the actual task will be filled by the HumanMessage.
- base_system_prompt_content_lg = LANGGRAPH_PROMPT_TEMPLATE_STR.split("{input}")[0].strip() + "\nTOOLS:\n{tools}\nRESPONSE FORMAT:\nFinal AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.\nBegin!"
-
  def agent_node(state: AgentState):
- current_task_query = state.get('input', '') # The specific question/task for this turn
-
  system_message_content = base_system_prompt_content_lg.format(
  tools="\n".join([f"- {t.name}: {t.description}" for t in TOOLS])
  )

  messages_for_llm = [SystemMessage(content=system_message_content)]
- messages_for_llm.extend(state.get('messages', [])) # Add history
- messages_for_llm.append(HumanMessage(content=current_task_query)) # Add current task as HumanMessage

  logger.debug(f"LangGraph agent_node - messages_for_llm: {messages_for_llm}")
- if not messages_for_llm[-1].content or not str(messages_for_llm[-1].content).strip():
- logger.error("LLM call would fail in agent_node: Last HumanMessage content is empty or invalid.")
- return {"messages": [AIMessage(content="[ERROR] Agent node: Current task input (HumanMessage) is empty.")]}

  bound_llm = LLM_INSTANCE.bind_tools(TOOLS)
  response = bound_llm.invoke(messages_for_llm)
@@ -478,9 +477,6 @@ def initialize_agent_and_tools(force_reinit=False):

  workflow_lg = LG_StateGraph(AgentState) # type: ignore
  workflow_lg.add_node("agent", agent_node)
- # If LG_ToolExecutor_Class is ToolNode, it can often be added directly as the node.
- # workflow_lg.add_node("tools", tool_executor_instance_lg)
- # For now, using the custom tool_node which wraps the executor instance.
  workflow_lg.add_node("tools", tool_node)
  workflow_lg.set_entry_point("agent")
  def should_continue_lg(state: AgentState): return "tools" if state['messages'][-1].tool_calls else LG_END
@@ -528,7 +524,8 @@ def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Opti
  try:
  if is_langgraph_agent_get:
  logger.debug(f"Using LangGraph agent for thread: {thread_id_to_use}")
- input_for_lg_get = {"input": prompt, "messages": []}
  logger.debug(f"Invoking LangGraph with input: {input_for_lg_get}")
  final_state_lg_get = AGENT_INSTANCE.invoke(input_for_lg_get, {"configurable": {"thread_id": thread_id_to_use}})
@@ -574,14 +571,12 @@ def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Opti
  return f"[ERROR] Agent execution failed: {str(e_agent_run_get)[:150]}"

  def construct_prompt_for_agent(q: Dict[str,Any]) -> str:
- # ... (Your original construct_prompt_for_agent logic - unchanged) ...
  tid,q_str=q.get("task_id","N/A"),q.get("question",""); files=q.get("files",[])
  files_info = ("\nFiles:\n"+"\n".join([f"- {f} (task_id:{tid})"for f in files])) if files else ""
  level = f"\nLevel:{q.get('level')}" if q.get('level') else ""
  return f"Task ID:{tid}{level}{files_info}\n\nQuestion:{q_str}"

  def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
- # ... (Your original run_and_submit_all logic - unchanged) ...
  global AGENT_INSTANCE
  space_id = os.getenv("SPACE_ID")
  username_for_submission = None
 
app.py UPDATED
@@ -61,14 +61,14 @@ if TYPE_CHECKING:
  LANGGRAPH_FLAVOR_AVAILABLE = False
  LG_StateGraph: Optional[Type[Any]] = None
  LG_ToolExecutor_Class: Optional[Type[Any]] = None
+ LG_END: Optional[Any] = None
  LG_ToolInvocation: Optional[Type[Any]] = None
  add_messages: Optional[Any] = None
  MemorySaver_Class: Optional[Type[Any]] = None

  AGENT_INSTANCE: Optional[Union[AgentExecutor, Any]] = None
  TOOLS: List[BaseTool] = []
+ LLM_INSTANCE: Optional[ChatGoogleGenerativeAI] = None
  LANGGRAPH_MEMORY_SAVER: Optional[Any] = None

  # google-genai Client SDK
 
@@ -126,8 +126,8 @@ except ImportError as e:

  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+ GEMINI_MODEL_NAME = "gemini-1.5-pro-preview-0514"
+ GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-1.5-flash-preview-0514"
  SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
  MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
  LOCAL_FILE_STORE_PATH = "./Data"
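
The diff pins new model names but does not show where they are consumed. A minimal sketch of the presumed wiring, assuming `langchain-google-genai` and a `GOOGLE_API_KEY` environment variable (both assumptions, not shown in this commit):

```python
# Sketch only: how a pinned model-name constant typically feeds the chat model.
import os
from langchain_google_genai import ChatGoogleGenerativeAI

GEMINI_MODEL_NAME = "gemini-1.5-pro-preview-0514"

llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL_NAME,
    google_api_key=os.environ.get("GOOGLE_API_KEY"),  # assumed env var name
    temperature=0.0,
)
```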
 
@@ -255,7 +255,8 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
  name_without_ext, current_ext = os.path.splitext(effective_save_path)
  if not current_ext:
  content_type_header = r.headers.get('content-type', '')
+ # FIX: Handle split correctly and take first part before stripping
+ content_type_val = content_type_header.split(';')[0].strip() if content_type_header else ''
  if content_type_val:
  guessed_ext = mimetypes.guess_extension(content_type_val)
  if guessed_ext: effective_save_path += guessed_ext; logger.info(f"Added guessed ext: {guessed_ext}")
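
To see why the fix was needed: `str.split(';')` returns a list, so the old `.split(';').strip()` raised `AttributeError`; indexing `[0]` first yields the bare MIME type. A standalone illustration (not app code):

```python
# Demonstrates the fixed content-type parsing with an example header value.
import mimetypes

content_type_header = "image/png; charset=binary"  # example header
content_type_val = content_type_header.split(';')[0].strip() if content_type_header else ''
print(content_type_val)                             # image/png
print(mimetypes.guess_extension(content_type_val))  # .png
```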
 
@@ -324,7 +325,7 @@ def transcribe_audio_tool(action_input_json_str: str) -> str:

  @lc_tool_decorator
  def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
+ """Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-1.5-flash-preview-0514) for tasks like image description, Q&A about the image, or text generation based on the image. Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. Returns the model's text response."""
  global google_genai_client
  if not google_genai_client: return "Error: google-genai SDK client not initialized."
  if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing." # Relies on PIL_TESSERACT_AVAILABLE for PIL
 
@@ -360,9 +361,9 @@ You have access to the following tools. Use them if necessary.
  {tools}
  TOOL USAGE:
  - To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
+ - For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): The `args` field must be a dictionary with a single key 'action_input_json_str' whose value is a JSON STRING. Example: {{"action_input_json_str": "{{\\"file_identifier\\": \\"file.pdf\\", \\"task_id\\": \\"123\\"}}"}}.
  - 'web_search': `args` is like '{{"query": "search query"}}'.
+ - 'python_repl': `args` is like '{{"query": "python code string"}}'. Use print() for output.
  RESPONSE FORMAT:
  Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
  Begin!
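
For clarity, here is a hypothetical well-formed tool call under the new contract (values mirror the prompt's own example): `args` carries a single key, `action_input_json_str`, whose value is itself a JSON string.

```python
# Build the nested JSON-string argument the new prompt requires.
import json

inner = json.dumps({"file_identifier": "file.pdf", "task_id": "123"})
tool_call = {
    "name": "read_pdf_tool",
    "args": {"action_input_json_str": inner},
    "id": "call_1",
}
print(tool_call["args"]["action_input_json_str"])
# {"file_identifier": "file.pdf", "task_id": "123"}
```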
 
@@ -376,8 +377,7 @@ Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -
  Tool Inputs:
  - web_search: Your search query string.
  - python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
+ - read_pdf_tool, ocr_image_tool, transcribe_audio_tool, direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
  If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
  Begin!
  {input}
 
@@ -422,28 +422,27 @@ def initialize_agent_and_tools(force_reinit=False):
  try:
  logger.info(f"Attempting LangGraph init (Tool Executor type: {LG_ToolExecutor_Class.__name__ if LG_ToolExecutor_Class else 'None'})")
  _TypedDict = getattr(__import__('typing_extensions'), 'TypedDict', dict)
+ # FIX: Remove 'input' key from state, only use 'messages' for conversational flow
+ class AgentState(_TypedDict):
+     messages: Annotated[List[Any], add_messages]

  # System prompt template - this describes the agent's role and tools.
  # The {input} placeholder for the actual task will be filled by the HumanMessage.
+ base_system_prompt_content_lg = LANGGRAPH_PROMPT_TEMPLATE_STR.split("{input}")[0].strip()

  def agent_node(state: AgentState):
  system_message_content = base_system_prompt_content_lg.format(
  tools="\n".join([f"- {t.name}: {t.description}" for t in TOOLS])
  )

+ # FIX: Construct message list from state, don't re-add original prompt
  messages_for_llm = [SystemMessage(content=system_message_content)]
+ messages_for_llm.extend(state['messages'])

  logger.debug(f"LangGraph agent_node - messages_for_llm: {messages_for_llm}")
+ if not messages_for_llm or not any(isinstance(m, (HumanMessage, ToolMessage)) for m in messages_for_llm):
+     logger.error("LLM call would fail in agent_node: No HumanMessage or ToolMessage found in history.")
+     return {"messages": [AIMessage(content="[ERROR] Agent node: No user input found in messages.")]}

  bound_llm = LLM_INSTANCE.bind_tools(TOOLS)
  response = bound_llm.invoke(messages_for_llm)
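
The key to this fix is the `add_messages` reducer: LangGraph merges whatever list a node returns under `"messages"` into the running history, so nodes return only their new messages instead of rebuilding (or re-appending) the original prompt. A minimal sketch, assuming `langgraph` and `langchain-core` are installed (illustrative, not app code):

```python
# Shows the reducer semantics behind the messages-only AgentState.
from langchain_core.messages import AIMessage, HumanMessage
from langgraph.graph.message import add_messages

history = [HumanMessage(content="Task ID:1\n\nQuestion: What is 2+2?")]
update = [AIMessage(content="4")]  # what a node would return as {"messages": update}

merged = add_messages(history, update)  # LangGraph applies this automatically
print([type(m).__name__ for m in merged])  # ['HumanMessage', 'AIMessage']
```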
 
@@ -478,9 +477,6 @@

  workflow_lg = LG_StateGraph(AgentState) # type: ignore
  workflow_lg.add_node("agent", agent_node)
  workflow_lg.add_node("tools", tool_node)
  workflow_lg.set_entry_point("agent")
  def should_continue_lg(state: AgentState): return "tools" if state['messages'][-1].tool_calls else LG_END
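
The hunk stops at `should_continue_lg`; the conditional edges, the tools-to-agent loop, and the compile call are not shown in this commit. A hedged, self-contained sketch of that wiring with stubbed nodes (the `add_edge("tools", "agent")` and `compile(checkpointer=...)` calls are assumptions):

```python
# Sketch of the graph wiring implied around this hunk; node bodies are stubs.
from typing import Annotated, Any, List
from typing_extensions import TypedDict
from langchain_core.messages import AIMessage
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from langgraph.checkpoint.memory import MemorySaver

class AgentState(TypedDict):
    messages: Annotated[List[Any], add_messages]

def agent_node(state: AgentState):   # stub for the diff's agent_node
    return {"messages": [AIMessage(content="done")]}

def tool_node(state: AgentState):    # stub for the diff's tool node
    return {"messages": []}

workflow = StateGraph(AgentState)
workflow.add_node("agent", agent_node)
workflow.add_node("tools", tool_node)
workflow.set_entry_point("agent")

def should_continue(state: AgentState):
    # Mirrors should_continue_lg: keep calling tools while the last
    # AIMessage carries tool_calls, otherwise end the run.
    return "tools" if state["messages"][-1].tool_calls else END

workflow.add_conditional_edges("agent", should_continue)
workflow.add_edge("tools", "agent")  # assumed: feed tool results back
graph = workflow.compile(checkpointer=MemorySaver())
```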
 
@@ -528,7 +524,8 @@ def get_agent_response(prompt: str, task_id: Optional[str]=None, thread_id: Opti
  try:
  if is_langgraph_agent_get:
  logger.debug(f"Using LangGraph agent for thread: {thread_id_to_use}")
+ # FIX: The input should be a list of messages for the 'add_messages' reducer.
+ input_for_lg_get = {"messages": [HumanMessage(content=prompt)]}
  logger.debug(f"Invoking LangGraph with input: {input_for_lg_get}")
  final_state_lg_get = AGENT_INSTANCE.invoke(input_for_lg_get, {"configurable": {"thread_id": thread_id_to_use}})
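
A usage sketch of the fixed input shape, continuing the compiled `graph` from the wiring sketch above (the thread id is an example value): the task arrives as a single `HumanMessage`, and `thread_id` scopes the checkpointed history per task.

```python
# Invoke the graph with the messages-only input the fix adopts.
from langchain_core.messages import HumanMessage

config = {"configurable": {"thread_id": "task-abc-123"}}  # example thread id
final_state = graph.invoke(
    {"messages": [HumanMessage(content="What is 2+2?")]}, config
)
print(final_state["messages"][-1].content)  # final AIMessage content
```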
 
 
@@ -574,14 +571,12 @@
  return f"[ERROR] Agent execution failed: {str(e_agent_run_get)[:150]}"

  def construct_prompt_for_agent(q: Dict[str,Any]) -> str:
  tid,q_str=q.get("task_id","N/A"),q.get("question",""); files=q.get("files",[])
  files_info = ("\nFiles:\n"+"\n".join([f"- {f} (task_id:{tid})"for f in files])) if files else ""
  level = f"\nLevel:{q.get('level')}" if q.get('level') else ""
  return f"Task ID:{tid}{level}{files_info}\n\nQuestion:{q_str}"

  def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
  global AGENT_INSTANCE
  space_id = os.getenv("SPACE_ID")
  username_for_submission = None
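
A worked example of `construct_prompt_for_agent` (function copied from the hunk above; the question record is hypothetical):

```python
from typing import Any, Dict

def construct_prompt_for_agent(q: Dict[str, Any]) -> str:
    tid, q_str = q.get("task_id", "N/A"), q.get("question", "")
    files = q.get("files", [])
    files_info = ("\nFiles:\n" + "\n".join([f"- {f} (task_id:{tid})" for f in files])) if files else ""
    level = f"\nLevel:{q.get('level')}" if q.get('level') else ""
    return f"Task ID:{tid}{level}{files_info}\n\nQuestion:{q_str}"

q = {"task_id": "abc-123", "question": "What is shown in the chart?",
     "files": ["chart.png"], "level": 1}  # hypothetical record
print(construct_prompt_for_agent(q))
# Task ID:abc-123
# Level:1
# Files:
# - chart.png (task_id:abc-123)
#
# Question:What is shown in the chart?
```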