Spaces:

Agents-MCP-Hackathon
/

BioMedNorm-MCP-Server

Sleeping

@@ -1,14 +1,82 @@
 ---
 title: BioMedNorm MCP Server
-emoji: 🚀
-colorFrom: pink
-colorTo: yellow
 sdk: gradio
-sdk_version: 5.33.1
 app_file: app.py
-pinned: false
 license: apache-2.0
-short_description: 'A MCP server for extracting and normalizing domain-specific '
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: BioMedNorm MCP Server
 sdk: gradio
+sdk_version: 5.33.0
 app_file: app.py
+pinned: true
 license: apache-2.0
+python_version: 3.13.3
+tags:
+- mcp-server-track
 ---
+# BioMedNorm MCP Server
+An MCP server for extracting and normalizing domain-specific entities from biomedical text. We leverage OpenAI LLMs to identify entities and match them to standardized terminology.
+## Installation
+This project uses `uv` from Astral for dependency management. Follow these steps to set up the project:
+### Clone the repository
+```bash
+git clone https://github.com/yourusername/entity-extraction-mcp
+cd entity-extraction-mcp
+```
+### Set up Python environment
+The project includes a .python-version file that specifies the required Python version. Make sure you have uv installed:
+```bash
+# Install uv if you don't have it already
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+### Install dependencies
+The project dependencies are defined in `pyproject.toml`. Install them with:
+```bash
+uv pip install -e .
+```
+### Set up environment variables
+The project **requires** an OpenAI API key, which should be stored in a .env file in the project root, e.g. `OPENAI_API_KEY=<your-api-key>`.
+## Running the application
+Run the application using `uv run`:
+```bash
+uv run app.py
+```
+This command ensures that:
+- All project dependencies are correctly installed
+- The environment variables from .env are loaded (the application calls `load_dotenv()` on startup)
+- The application runs in the proper environment
+After starting the server, you can access:
+- Web interface: `http://your-server:port`
+- MCP endpoint: `http://your-server:port/gradio_api/mcp/sse`
+## Using the Web Interface
+- Enter text in the input area
+- Select the entity type (Disease, Tissue, or Cell Type)
+- Click "Normalize"
+- View the normalized entities in the results area
+## Using as an MCP Tool
+The server exposes an MCP-compatible endpoint that can be used by AI agents. The tool accepts:
+- `paragraph`: Text to extract entities from
+- `target_entity`: Type of entity to extract ("Disease", "Tissue", or "Cell Type")
+and returns a list of normalized entities.

app.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""
+BioMed text normalization MCP server.
+"""
+# import time
+import asyncio
+from typing import Optional
+import gradio as gr
+from oaklib_utils import get_candidates
+from openai_utils import ask_openai
+# Prompt template for the entity-extraction (NER) step. It is filled in with str.format();
+# the placeholders are {entity} (the entity type to look for) and {text} (the input paragraph).
+NER_PROMPT = """
+  You are an expert annotator of biomedical text.
+  Annotate/Identify/Extract all {entity}s in this text: {text}
+  Instructions:
+  1. If no such entity or entities are found, then **return exactly**: Not Found
+  2. Extract only the entity. If only an abbreviation is present, expand it based on the
+  biomedical context in the given paragraph. For e.g., BA12 full form is Brodmann (1909) area 12.
+  3. Do not provide any additional information or formatting.
+  Do not guess or hallucinate if you are uncertain. This has high-stakes, so it's better to be safe
+  than sorry. This is very important, so you'd better be sure of your answer, OK?
+"""
+# Prompt template for the normalization (RAG) step. It is filled in with str.format();
+# the placeholders are {top_k_preds} (the candidate standard terms) and {entity} (the raw term).
+RAG_PROMPT = """
+  You are an expert normalizer of biomedical entities.
+  Given the following list of candidate standard terms: {top_k_preds},
+  find the single closest matching term for this unnormalized entity: {entity}.
+  Instructions:
+  1. **IMPORTANT:** Do **NOT** guess or hallucinate. Do **NOT** provide any term that
+  is not explicitly present in the list of standardized terms.
+  2. Do not overgeneralize unless no match is available.
+  3. Do not provide any additional information or formatting.
+  This has high-stakes, so it's better to be safe than sorry. This is very important, so you'd better
+  be sure of your answer, OK?
+"""
+async def extract_entities(paragraph: str, target_entity: str) -> Optional[list[str]]:
+  """
+  Extract entities of a specific type from a given paragraph.
+  Args:
+    paragraph (str): The paragraph from which entities are to be extracted.
+    target_entity (str): The type of entity to extract from the paragraph (e.g., 'disease', 'tissue').
+  Returns:
+    Optional[list[str]]: A list of extracted entities of the specified type, or
+    None if the model did not return a valid response.
+  """
+  prompt = NER_PROMPT.format(entity=target_entity, text=paragraph)
+  extracted_entities = await ask_openai(prompt, usage="ner")
+  return extracted_entities
+async def normalize_entities(raw_terms: list[str]) -> list[Optional[str]]:
+  """
+  Normalize a list of raw terms to the most appropriate standard terms from a list
+  of candidates.
+  This function is designed to process the output from extract_entities().
+  Args:
+    raw_terms (list[str]): List of unnormalized terms, typically from extract_entities().
+  Returns:
+    list[Optional[str]]: List of best matching standard terms in the same order as the
+    input terms. An entry may be None if normalization failed.
+  """
+  # Do normalization for each entity
+  async def process_single_entity(raw_term: str) -> Optional[str]:
+    # Generate candidates specifically for this entity
+    # If the oaklib function is not async, wrap it with run_in_executor
+    candidate_std_terms = await asyncio.to_thread(get_candidates, raw_term)
+    # Now use these entity-specific candidates for the OpenAI call
+    prompt = RAG_PROMPT.format(entity=raw_term, top_k_preds=candidate_std_terms)
+    result = await ask_openai(prompt, usage="rag")
+    return result
+  # Process all entities in parallel
+  tasks = [process_single_entity(entity) for entity in raw_terms]
+  normalized_entities = await asyncio.gather(*tasks)
+  return normalized_entities
+async def extract_and_normalize(
+  paragraph: str, target_entity: str
+) -> list[Optional[str]]:
+  """
+  Extract entities from a paragraph and normalize them in one operation.
+  Args:
+    paragraph: The paragraph from which to extract entities.
+    target_entity: The type of entity to extract and normalize.
+  Returns:
+    list[Optional[str]]: List of best matching standard terms in the same order as the
+    input terms. An entry may be None if normalization failed.
+  """
+  extracted_entities = await extract_entities(paragraph, target_entity)
+  if not extracted_entities or len(extracted_entities) == 0:
+    return []
+  result = await normalize_entities(extracted_entities)
+  return result
+# Build the Gradio Blocks UI. `demo` is the app object launched at the bottom of the file.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+  gr.Markdown("# Entity Extraction & Normalization")
+  gr.Markdown(
+    "Enter text and specify the entity type to extract and normalize entities."
+  )
+  # Input row: free-text paragraph on the left, entity-type selector on the right
+  with gr.Row():
+    with gr.Column(scale=3):
+      paragraph = gr.Textbox(
+        label="Text Input",
+        placeholder="Enter paragraph here...",
+        lines=5,
+        info="Enter biomedical text input for entity extraction.",
+      )
+    with gr.Column(scale=1):
+      target_entity = gr.Dropdown(
+        ["Disease", "Tissue", "Cell Type"],
+        label="Entity Type",
+        value="Disease",
+        info="Select the type of entity you want to extract and normalize from the text.",
+      )
+  normalize_btn = gr.Button("Normalize", variant="primary")
+  # Output area: the list returned by extract_and_normalize is rendered as JSON
+  with gr.Row():
+    with gr.Column():
+      output = gr.JSON(label="Normalized Entities")
+  # Status line used as a loading indicator (set and cleared by the click chain below)
+  with gr.Row():
+    status = gr.Markdown("")
+  # One example per entity type offered in the dropdown
+  with gr.Accordion("Example Inputs", open=False):
+    gr.Examples(
+      examples=[
+        ["The patient was diagnosed with diabetes and hypertension.", "Disease"],
+        [
+          "Samples of BA12 tissue, weighing approximately 50-100 mg each, were homogenized in nuclei extraction buffer.",
+          "Tissue",
+        ],
+        [
+          "Coupling scTCR-seq with scRNA-seq can reveal the relationship between clonotype and phenotype in T or B cell populations.",
+          "Cell Type",
+        ],
+      ],
+      inputs=[paragraph, target_entity],
+    )
+  # Set up the button click event as three chained steps:
+  # show a status message, run the extraction + normalization, then clear the status.
+  normalize_btn.click(
+    lambda: "Processing...",  # Show loading immediately
+    None,
+    status,
+    queue=False,
+  ).then(
+    extract_and_normalize,  # Async processing
+    [paragraph, target_entity],
+    output,
+  ).then(
+    lambda: "",  # Clear status
+    None,
+    status,
+  )
+# Launch only when run as a script. mcp_server=True is passed so the app is also served
+# as an MCP server (the README lists the endpoint under /gradio_api/mcp/sse).
+if __name__ == "__main__":
+  demo.launch(mcp_server=True)

oaklib_utils.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""
+Retrieve top k candidate standard terms for normalization using oaklib.
+"""
+# import argparse
+from oaklib import get_adapter
+from oaklib.datamodels.search import SearchConfiguration
+# Shared oaklib adapter, created once at import time and reused by get_candidates().
+# NOTE(review): the "ols:" selector presumably targets the Ontology Lookup Service — confirm.
+adapter = get_adapter("ols:")
+def get_candidates(term: str, top_k: int = 10) -> list[str]:
+  """
+  Get top k candidates for RAG.
+  """
+  # Set config for search (limit # terms returned)
+  cfg = SearchConfiguration(limit=top_k)
+  results = adapter.basic_search(term, config=cfg)
+  labels = list(adapter.labels(results))  # list of tuples of ids and labels
+  # print(f"## Query: {term} -> {labels}")
+  candidates = list(label for _, label in labels)
+  return candidates
+# def main():
+#   parser = argparse.ArgumentParser(
+#     description="Fetch top-K candidate passages for a given term (RAG)"
+#   )
+#   parser.add_argument(
+#     "term", type=str, help="The query term or prompt for which to retrieve candidates"
+#   )
+#   parser.add_argument(
+#     "-k",
+#     "--top_k",
+#     type=int,
+#     default=10,
+#     help="Number of top candidates to return (default: 10)",
+#   )
+#   args = parser.parse_args()
+#   # Call your function
+#   candidates = get_candidates(args.term, top_k=args.top_k)
+#   print(f"\nTerm: {args.term!r}")
+#   print(f"Top {args.top_k} candidates:")
+#   for i, cand in enumerate(candidates, start=1):
+#     print(f"  {i:2d}. {cand}")
+# if __name__ == "__main__":
+#   main()

openai_utils.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""
+Helper functions for structured OpenAI API calls using Pydantic models.
+Includes NER and RAG-specific prompting logic with retry and error handling.
+"""
+import os
+from typing import Literal, Optional, overload, Union
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+from pydantic import BaseModel, Field
+from tenacity import retry, retry_if_result, stop_after_attempt, wait_random_exponential
+from tqdm.auto import tqdm
+load_dotenv()  # take environment variables from .env
+# Fail at import time when the key is missing, rather than on the first API call
+api_key = os.getenv("OPENAI_API_KEY")
+if not api_key:
+  raise EnvironmentError("Missing OPENAI_API_KEY in environment.")
+# Single async client shared by every ask_openai() call
+client = AsyncOpenAI(api_key=api_key, timeout=120.0)
+# Structured-output schema for the NER call; ask_openai() returns its `answer` field.
+class NEROutput(BaseModel):
+  answer: list[str] = Field(..., description="List of extracted entities")
+# Structured-output schema for the RAG (normalization) call. ask_openai() returns only
+# `answer`; `reason` is requested from the model but is not passed back to callers.
+class RAGOutput(BaseModel):
+  answer: str = Field(..., description="Closest match to input term")
+  reason: str = Field(..., description="Why you chose the answer match to input term")
+def is_invalid_result(result: object) -> bool:
+  """Return True when result is None; used as the retry predicate for ask_openai()."""
+  return result is None
+@overload
+async def ask_openai(user_prompt: str, usage: Literal["ner"], model: str = ...) -> Optional[list[str]]: ...
+@overload
+async def ask_openai(user_prompt: str, usage: Literal["rag"], model: str = ...) -> Optional[str]: ...
+@retry(
+  retry=retry_if_result(is_invalid_result),
+  wait=wait_random_exponential(min=1, max=60),
+  stop=stop_after_attempt(6),
+)
+async def ask_openai(
+  user_prompt: str,
+  usage: Literal['ner', 'rag'],
+  model: str = "o4-mini-2025-04-16",
+) -> Optional[Union[list[str], str]]:
+  """
+  Function to interact with the OpenAI API.
+  """
+  if model in ["chatgpt-4o-latest", "o1-mini"]:
+    raise ValueError(f"Model {model} does not support structured outputs.")
+  response_format = NEROutput if usage == 'ner' else RAGOutput
+  try:
+    response = await client.responses.parse(
+      model=model,
+      input=[{"role": "user", "content": user_prompt}],
+      text_format=response_format,
+      # temperature=0.05,
+    )
+    response_obj = response.output_parsed
+    return response_obj.answer if response_obj else None
+  except Exception as e:
+    tqdm.write(f"❌ Unexpected error. Error: {e}")
+    raise

pyproject.toml ADDED Viewed

	@@ -0,0 +1,11 @@

+[project]
+name = "biomednorm-mcp-server"
+version = "0.1.0"
+description = "MCP server for extracting and normalizing biomedical entities"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "oaklib>=0.6.23",
+    "openai>=1.84.0",
+    "python-dotenv>=1.1.0",
+]

ruff.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ indent-width = 2
2	+ target-version = "py312"

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff