"""Gradio front end that runs the GAIA agent on the course questions and submits the answers."""

import logging
import os
import re

import gradio as gr
import pandas as pd
import requests
from smolagents import tool as smol_tool

from agent import initialize_agent  # Import the agent initialization function

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Matches "FINAL ANSWER: <answer>" and captures the answer up to the first
# newline (or end of text). Compiled once because it is used for every task.
_FINAL_ANSWER_RE = re.compile(
    r'FINAL\s+ANSWER\s*:\s*(.+?)(?:\n|$)', re.IGNORECASE | re.DOTALL
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# --- Helper Functions ---
def tool(*args, **kwargs):
    """Decorator for registering a function as a tool (patched for docstring)."""
    return smol_tool(*args, **kwargs)


def extract_final_answer_from_response(response: str) -> str:
    """Extract the final answer from an agent response in GAIA format.

    The agent is expected to end its response with ``FINAL ANSWER: [answer]``.

    Args:
        response: Raw agent output. Falsy values yield ``""``; non-string
            values are converted with ``str()``.

    Returns:
        The text after ``FINAL ANSWER:`` with whitespace runs collapsed to a
        single space and trailing periods removed, or the stripped response
        when no such marker is present.
    """
    if not response:
        return ""

    # The agent wrapper should already return just the final answer;
    # this is a safety check in case the format isn't perfect.
    if isinstance(response, str):
        match = _FINAL_ANSWER_RE.search(response)
        if match:
            answer = re.sub(r'\s+', ' ', match.group(1).strip())
            return answer.rstrip('.')

    # No FINAL ANSWER marker found: return the response as is.
    return str(response).strip()


def _fetch_questions(api_url: str) -> list:
    """Fetch the evaluation questions from the API.

    Args:
        api_url: Base URL of the scoring service; ``/questions`` is appended.

    Returns:
        The decoded JSON payload of the questions endpoint.

    Raises:
        RuntimeError: On any network, HTTP or decoding failure, or when the
            payload is empty. The original exception is chained as the cause.
    """
    questions_url = f"{api_url}/questions"
    logger.info("Fetching questions from: %s", questions_url)
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            raise ValueError("Fetched questions list is empty or invalid format.")
        logger.info("Fetched %d questions.", len(questions_data))
        return questions_data
    # JSONDecodeError is a RequestException subclass, so it has to be handled
    # first or this more specific message would never be produced.
    except requests.exceptions.JSONDecodeError as e:
        raise RuntimeError(
            f"Error decoding JSON response from questions endpoint: {e}. "
            f"Response: {response.text[:500]}"
        ) from e
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Error fetching questions: {e}") from e
    except Exception as e:
        raise RuntimeError(
            f"An unexpected error occurred fetching questions: {e}"
        ) from e


def _run_agent_on_questions(agent, questions_data: list) -> tuple[list, list]:
    """Run the agent on each question and collect answers and logs.

    Items without a ``task_id`` or with a ``question`` of ``None`` are skipped.
    A failure on one question is recorded as an ``AGENT ERROR: ...`` answer so
    the remaining questions are still processed.

    Args:
        agent: Callable that takes the question text and returns a response.
        questions_data: Items exposing ``task_id`` and ``question`` via ``.get``.

    Returns:
        ``(answers_payload, results_log)``: the submission entries and the
        per-question rows shown in the results table.
    """
    results_log = []
    answers_payload = []

    logger.info("Running agent on %d questions...", len(questions_data))
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")

        if not task_id or question_text is None:
            logger.warning("Skipping item with missing task_id or question: %s", item)
            continue

        try:
            logger.info("Processing task %s: %s...", task_id, question_text[:100])

            # The agent is wrapped to return GAIA-compliant format
            raw_response = agent(question_text)

            # Extract the final answer (should already be clean from wrapper)
            submitted_answer = extract_final_answer_from_response(raw_response)

            # Log the full interaction for debugging
            logger.info("Task %s - Raw response: %s", task_id, raw_response)
            logger.info("Task %s - Final answer: %s", task_id, submitted_answer)
        except Exception as e:
            # Best effort: record the failure as this task's answer and go on.
            error_msg = f"AGENT ERROR: {e}"
            logger.error("Error running agent on task %s: %s", task_id, e)
            raw_response = submitted_answer = error_msg

        answers_payload.append({
            "task_id": task_id,
            "submitted_answer": submitted_answer,
        })
        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Raw Response": raw_response,
            "Final Answer": submitted_answer,
        })

    return answers_payload, results_log


def _submit_answers(api_url: str, username: str, agent_code_url: str, answers_payload: list) -> dict:
    """Submit the agent's answers to the evaluation API.

    Args:
        api_url: Base URL of the scoring service; ``/submit`` is appended.
        username: Hugging Face username (stripped before sending).
        agent_code_url: URL of the agent's code, sent as ``agent_code``.
        answers_payload: Entries with ``task_id`` and ``submitted_answer``.

    Returns:
        The decoded JSON body of the submission response.

    Raises:
        RuntimeError: On HTTP error status, timeout, network failure or any
            other error. The original exception is chained as the cause.
    """
    submit_url = f"{api_url}/submit"
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code_url,
        "answers": answers_payload,
    }
    logger.info(
        "Submitting %d answers for user '%s' to: %s",
        len(answers_payload), username, submit_url,
    )
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        else:
            # The error body may be valid JSON without being an object.
            if isinstance(error_json, dict):
                detail = error_json.get('detail', e.response.text)
            else:
                detail = e.response.text[:500]
            error_detail += f" Detail: {detail}"
        raise RuntimeError(f"Submission Failed: {error_detail}") from e
    # The exception must be bound here: the name from the previous handler is
    # not available in this one, so ``from e`` would otherwise fail.
    except requests.exceptions.Timeout as e:
        raise RuntimeError("Submission Failed: The request timed out.") from e
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Submission Failed: Network error - {e}") from e
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred during submission: {e}") from e


# --- Main Gradio Function ---
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Orchestrate fetching questions, running the agent and submitting answers.

    Args:
        profile: OAuth profile of the logged-in user, or ``None`` when nobody
            is logged in.

    Returns:
        ``(status_message, results)`` where ``results`` is a DataFrame of
        per-question rows or error details, or ``None`` when the run is
        refused before starting (no login, no username, no ``SPACE_ID``).
    """
    if not profile:
        logger.info("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    username = profile.username
    logger.info("User logged in: %s", username)
    if not username:
        return "Hugging Face username not found. Please ensure you are logged in.", None

    space_id = os.getenv("SPACE_ID")
    if not space_id:
        logger.error("SPACE_ID environment variable not found. Cannot determine agent_code URL.")
        return "Error: SPACE_ID not set. Cannot determine agent_code URL.", None
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main"

    status_message = ""
    results_df = pd.DataFrame()
    # Stays None until the agent run has finished, so the error path can tell
    # "failed before any results existed" from "failed after the run".
    results_log = None

    try:
        # 1. Instantiate Agent
        logger.info("Initializing agent...")
        agent = initialize_agent()
        if agent is None:
            raise RuntimeError("Agent initialization failed. Check agent.py for details.")
        logger.info("Agent initialized successfully.")

        # 2. Fetch Questions
        questions_data = _fetch_questions(DEFAULT_API_URL)

        # 3. Run Agent on Questions
        answers_payload, results_log = _run_agent_on_questions(agent, questions_data)
        if not answers_payload:
            status_message = "Agent did not produce any answers to submit."
            return status_message, pd.DataFrame(results_log)

        # 4. Submit Answers
        submission_result = _submit_answers(DEFAULT_API_URL, username, agent_code_url, answers_payload)

        status_message = (
            f"🎉 Submission Successful!\n"
            f"👤 User: {submission_result.get('username')}\n"
            f"📊 Overall Score: {submission_result.get('score', 'N/A')}% "
            f"({submission_result.get('correct_count', '?')}/{submission_result.get('total_attempted', '?')} correct)\n"
            f"💬 Message: {submission_result.get('message', 'No message received.')}\n"
            f"🔗 Agent Code: {agent_code_url}"
        )
        results_df = pd.DataFrame(results_log)

    except RuntimeError as e:
        status_message = f"❌ Operation Failed: {e}"
        logger.error(status_message)
        # Keep the per-question results when the failure came after the run.
        if results_log is not None:
            results_df = pd.DataFrame(results_log)
        else:
            results_df = pd.DataFrame([{"Status": "Error", "Details": str(e)}])
    except Exception as e:
        status_message = f"💥 Critical Error: {e}"
        logger.error(status_message)
        results_df = pd.DataFrame([{"Status": "Critical Error", "Details": str(e)}])

    return status_message, results_df


# --- Gradio Interface Definition ---
with gr.Blocks(title="GAIA Benchmark Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧠 GAIA Benchmark Evaluation Agent

    **Enhanced AI Agent for General AI Assistant (GAIA) Benchmark**
    """)

    gr.Markdown("""
    ## 📋 Instructions:

    1. **Setup**: Clone this Space and ensure your `.env` file contains:
       ```
       TOGETHER_API_KEY=your_together_api_key
       SERPAPI_API_KEY=your_serpapi_key
       ```
    2. **Login**: Use the button below to log in with your Hugging Face account
    3. **Run**: Click 'Run Evaluation & Submit' to process all GAIA questions
    4. **Wait**: The process may take several minutes depending on question complexity

    ---

    ### 🎯 GAIA Format Requirements:
    - **Numbers**: No commas, no units (unless specified)
    - **Strings**: No articles (a, an, the), no abbreviations
    - **Lists**: Comma-separated values following above rules

    ### 🔧 Agent Capabilities:
    - **Web Research**: Google Search, Wikipedia, webpage analysis
    - **Video Analysis**: YouTube transcript processing
    - **Mathematical Computing**: Python execution with scientific libraries
    - **Multi-step Reasoning**: Complex problem decomposition
    """)

    with gr.Row():
        gr.LoginButton(scale=1)
        run_button = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary", scale=2)

    status_output = gr.Textbox(
        label="📊 Evaluation Status & Results",
        lines=8,
        interactive=False,
        placeholder="Click 'Run Evaluation' to start the process...",
    )

    results_table = gr.DataFrame(
        label="📝 Detailed Question Results",
        wrap=True,
        interactive=False,
        column_widths=["10%", "40%", "25%", "25%"],
    )

    # No explicit inputs: run_and_submit_all takes only the OAuth profile.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table],
    )

    gr.Markdown("""
    ---
    ### 💡 Tips for Better Performance:
    - Ensure stable internet connection for web searches
    - Monitor the status output for real-time progress
    - Check the detailed results table for individual question analysis
    - The agent automatically formats answers according to GAIA requirements
    """)


if __name__ == "__main__":
    print("\n" + "="*70)
    print("🚀 GAIA BENCHMARK AGENT STARTING")
    print("="*70)

    # Check environment variables
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")
    together_key = os.getenv("TOGETHER_API_KEY")
    serpapi_key = os.getenv("SERPAPI_API_KEY")

    if space_host:
        print(f"✅ SPACE_HOST: {space_host}")
        print(f" 🌐 Runtime URL: https://{space_host}.hf.space")
    else:
        print("ℹ️ SPACE_HOST not found (local development)")

    if space_id:
        print(f"✅ SPACE_ID: {space_id}")
        print(f" 📂 Repo URL: https://huggingface.co/spaces/{space_id}")
    else:
        print("⚠️ SPACE_ID not found - submissions may fail")

    print("🔑 API Keys Status:")
    print(f" Together AI: {'✅ Set' if together_key else '❌ Missing'}")
    print(f" SerpAPI: {'✅ Set' if serpapi_key else '⚠️ Missing (optional)'}")

    print("="*70)
    print("🎯 Launching GAIA Benchmark Interface...")
    print("="*70 + "\n")

    demo.launch(debug=True, share=False)