import asyncio
import functools
import os
from typing import Any, Dict, List, Union

import requests

from tools.base import Tool
from tools.utils import ToolExecutionError, logger
from core.utils.ncbi_rate_limited import rate_limited_pubmed_search

ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
ESUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"


class PubMedSearchTool(Tool):
    """
    Tool for searching PubMed for articles and returning top results.

    The search step is delegated to ``rate_limited_pubmed_search``; the
    matching article IDs are then resolved to summaries through the NCBI
    E-utilities ESummary endpoint (``ESUMMARY_URL``).
    """

    def __init__(self) -> None:
        """
        Initialize the PubMedSearchTool with its name, description, and
        argument schema.
        """
        super().__init__()
        self.name = "search_pubmed"
        self.description = "Search PubMed for articles and return top results."
        self.args_schema = {
            "type": "object",
            "properties": {
                "q": {"type": "string", "description": "search query"},
                "max_results": {"type": "integer", "default": 5},
                "email": {
                    "type": "string",
                    "description": "user email for NCBI API",
                    # Evaluated once, when the tool is instantiated.
                    "default": os.getenv("NCBI_EMAIL", ""),
                },
            },
            "required": ["q"],
        }

    def openai_spec(self, legacy=False):
        """
        Return the tool description as a dict with ``name``, ``description``
        and ``parameters`` keys.

        Args:
            legacy: Accepted for interface compatibility; not used here.
        """
        return {
            "name": self.name,
            "description": self.description,
            "parameters": self.args_schema,
        }

    @staticmethod
    def _format_article(uid: str, item: Dict[str, Any]) -> Dict[str, Any]:
        """Build one result dict from an ESummary entry (may be empty)."""
        return {
            "uid": uid,
            "title": item.get("title"),
            "authors": [a.get("name") for a in item.get("authors", [])],
            "pubdate": item.get("pubdate"),
            "source": item.get("source"),
            "link": f"https://pubmed.ncbi.nlm.nih.gov/{uid}/",
        }

    async def run(
        self,
        q: str,
        max_results: int = 5,
        email: str = "",
    ) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Search PubMed for articles and return the top results.

        Args:
            q: The search query.
            max_results: The maximum number of results to request.
                Defaults to 5.
            email: Contact email sent with the ESummary request. When empty,
                the ``NCBI_EMAIL`` environment variable is used, and failing
                that a built-in default address.

        Returns:
            A list of article dicts with the keys ``uid``, ``title``,
            ``authors``, ``pubdate``, ``source`` and ``link``, in the order
            the IDs were returned by the search. An empty list is returned
            when the search yields no usable response or no IDs. (The return
            annotation still admits a dict for signature compatibility, but
            this implementation only returns lists.)

        Raises:
            ToolExecutionError: If any step fails; the original exception is
                attached as ``__cause__``.
        """
        try:
            # Resolve the contact email: argument -> environment -> default.
            email = email or os.getenv("NCBI_EMAIL", "")
            if not email:
                # Use a default academic email for research purposes
                email = "research@idweek.com"
                logger.info("Using default email for NCBI API access")

            api_key = os.getenv("NCBI_API_KEY")

            # The ID search goes through the shared rate-limited helper.
            # NOTE(review): `email` is not forwarded to this helper; it is
            # only sent with the ESummary request below.
            response_data = await rate_limited_pubmed_search(
                query=q,
                api_key=api_key,
                max_results=max_results,
            )
            if not response_data or "esearchresult" not in response_data:
                logger.warning(
                    "PubMedSearchTool: No valid response from rate-limited search"
                )
                return []

            idlist = response_data["esearchresult"].get("idlist", [])
            if not idlist:
                return []

            # Fetch summaries for the article IDs
            params_esummary = {
                "db": "pubmed",
                "id": ",".join(idlist),
                "retmode": "json",
                "tool": "IDweekAgent",
                "email": email,
            }
            if api_key:
                params_esummary["api_key"] = api_key

            # `requests` is blocking; run it in the default executor so the
            # event loop is not stalled for the duration of the HTTP call.
            loop = asyncio.get_running_loop()
            resp2 = await loop.run_in_executor(
                None,
                functools.partial(
                    requests.get,
                    ESUMMARY_URL,
                    params=params_esummary,
                    timeout=15,
                ),
            )
            resp2.raise_for_status()
            summary = resp2.json().get("result", {})

            # IDs missing from the summary payload still produce an entry,
            # with None/empty fields and a valid link.
            return [
                self._format_article(uid, summary.get(uid, {}))
                for uid in idlist
            ]
        except Exception as e:
            logger.error("PubMedSearchTool failed: %s", e, exc_info=True)
            raise ToolExecutionError(f"PubMedSearchTool failed: {e}") from e