# Provenance: downloaded from a Hugging Face Space file page
# (uploader: RohanKarthikeyan, commit 046bc11, ~6.04 kB); the page chrome
# ("raw / history / blame") was stripped and replaced with this comment so
# the file is valid Python.
"""
BioMed text normalization MCP server.
"""
# import time
import asyncio
from typing import Optional
import gradio as gr
from oaklib_utils import get_candidates
from openai_utils import ask_openai
# Prompt template for named-entity recognition (NER).
# Placeholders: {entity} = target entity type, {text} = paragraph to annotate.
# The model is instructed to answer exactly "Not Found" when nothing matches,
# and to expand abbreviations using the surrounding biomedical context.
NER_PROMPT = """
You are an expert annotator of biomedical text.
Annotate/Identify/Extract all {entity}s in this text: {text}
Instructions:
1. If no such entity or entities are found, then **return exactly**: Not Found
2. Extract only the entity. If only an abbreviation is present, expand it based on the
biomedical context in the given paragraph. For e.g., BA12 full form is Brodmann (1909) area 12.
3. Do not provide any additional information or formatting.
Do not guess or hallucinate if you are uncertain. This has high-stakes, so it's better to be safe
than sorry. This is very important, so you'd better be sure of your answer, OK?
"""
# Prompt template for retrieval-augmented normalization (RAG).
# Placeholders: {top_k_preds} = candidate standard terms from get_candidates(),
# {entity} = the raw (unnormalized) term to map onto one of those candidates.
# The model is constrained to pick only from the supplied candidate list.
RAG_PROMPT = """
You are an expert normalizer of biomedical entities.
Given the following list of candidate standard terms: {top_k_preds},
find the single closest matching term for this unnormalized entity: {entity}.
Instructions:
1. **IMPORTANT:** Do **NOT** guess or hallucinate. Do **NOT** provide any term that
is not explicitly present in the list of standardized terms.
2. Do not overgeneralize unless no match is available.
3. Do not provide any additional information or formatting.
This has high-stakes, so it's better to be safe than sorry. This is very important, so you'd better
be sure of your answer, OK?
"""
async def extract_entities(paragraph: str, target_entity: str) -> Optional[list[str]]:
    """
    Identify all entities of a given type within a block of biomedical text.

    Args:
        paragraph (str): Text to scan for entities.
        target_entity (str): Entity category to look for (e.g., 'disease', 'tissue').

    Returns:
        Optional[list[str]]: The entities found, or None when the model did not
            produce a usable response.
    """
    # Fill the NER template and delegate the call to the OpenAI helper.
    ner_query = NER_PROMPT.format(entity=target_entity, text=paragraph)
    return await ask_openai(ner_query, usage="ner")
async def normalize_entities(raw_terms: list[str]) -> list[Optional[str]]:
    """
    Map each raw term to its closest standard term.

    For every term, candidate standard terms are retrieved first, and an LLM
    then selects the best match from those candidates. Designed to consume the
    output of extract_entities().

    Args:
        raw_terms (list[str]): Unnormalized terms, typically from extract_entities().

    Returns:
        list[Optional[str]]: Best-matching standard terms, in the same order as
            the input. An entry may be None if normalization failed.
    """

    async def _normalize_one(term: str) -> Optional[str]:
        # get_candidates is a blocking call; run it on a worker thread so the
        # event loop stays responsive.
        candidates = await asyncio.to_thread(get_candidates, term)
        # Build the RAG prompt from this term's own candidate list.
        rag_query = RAG_PROMPT.format(entity=term, top_k_preds=candidates)
        return await ask_openai(rag_query, usage="rag")

    # Fan out one task per term; gather preserves input order in its results.
    return await asyncio.gather(*(_normalize_one(term) for term in raw_terms))
async def extract_and_normalize(
    paragraph: str, target_entity: str
) -> list[Optional[str]]:
    """
    Extract entities from a paragraph and normalize them in one operation.

    Args:
        paragraph: The paragraph from which to extract entities.
        target_entity: The type of entity to extract and normalize.

    Returns:
        list[Optional[str]]: List of best matching standard terms in the same
            order as the extracted entities. An entry may be None if
            normalization failed.
    """
    extracted_entities = await extract_entities(paragraph, target_entity)
    # `not x` already covers both None and an empty list, so the extra
    # len(...) == 0 check was redundant.
    # NOTE(review): if ask_openai can return the literal string "Not Found"
    # (as the NER prompt instructs) instead of None/[], this guard will not
    # catch it — confirm the post-processing in openai_utils.
    if not extracted_entities:
        return []
    return await normalize_entities(extracted_entities)
# Create a visually appealing Gradio app
# UI layout: one text input + entity-type dropdown, a primary action button,
# a JSON panel for results, a transient status line, and canned examples.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Entity Extraction & Normalization")
    gr.Markdown(
        "Enter text and specify the entity type to extract and normalize entities."
    )
    with gr.Row():
        # Wide column: free-text paragraph input.
        with gr.Column(scale=3):
            paragraph = gr.Textbox(
                label="Text Input",
                placeholder="Enter paragraph here...",
                lines=5,
                info="Enter biomedical text input for entity extraction.",
            )
        # Narrow column: entity-type selector (closed set of three types).
        with gr.Column(scale=1):
            target_entity = gr.Dropdown(
                ["Disease", "Tissue", "Cell Type"],
                label="Entity Type",
                value="Disease",
                info="Select the type of entity you want to extract and normalize from the text.",
            )
    normalize_btn = gr.Button("Normalize", variant="primary")
    with gr.Row():
        with gr.Column():
            output = gr.JSON(label="Normalized Entities")
    # Add a loading indicator
    with gr.Row():
        status = gr.Markdown("")
    with gr.Accordion("Example Inputs", open=False):
        gr.Examples(
            examples=[
                ["The patient was diagnosed with diabetes and hypertension.", "Disease"],
                [
                    "Samples of BA12 tissue, weighing approximately 50-100 mg each, were homogenized in nuclei extraction buffer.",
                    "Tissue",
                ],
                [
                    "Coupling scTCR-seq with scRNA-seq can reveal the relationship between clonotype and phenotype in T or B cell populations.",
                    "Cell Type",
                ],
            ],
            inputs=[paragraph, target_entity],
        )
    # Set up the button click event
    # Three-step chain: (1) instantly show "Processing..." (queue=False skips
    # the queue so it appears immediately), (2) run the async pipeline and
    # render its result in the JSON panel, (3) clear the status line.
    normalize_btn.click(
        lambda: "Processing...",  # Show loading immediately
        None,
        status,
        queue=False,
    ).then(
        extract_and_normalize,  # Async processing
        [paragraph, target_entity],
        output,
    ).then(
        lambda: "",  # Clear status
        None,
        status,
    )
# Launch with MCP enabled so the async functions are also exposed as MCP tools.
if __name__ == "__main__":
    demo.launch(mcp_server=True)