|
|
""" |
|
|
BioMed text normalization MCP server: extracts biomedical entities from text and maps them to standard terms.
|
|
""" |
|
|
|
|
|
|
|
|
import asyncio |
|
|
from typing import Optional |
|
|
|
|
|
import gradio as gr |
|
|
|
|
|
from oaklib_utils import get_candidates |
|
|
from openai_utils import ask_openai |
|
|
|
|
|
# Prompt template for the entity-extraction (NER) step.
# Placeholders filled via str.format():
#   {entity} - the entity type to look for (e.g. "disease")
#   {text}   - the paragraph to annotate
NER_PROMPT = """
You are an expert annotator of biomedical text.

Annotate/Identify/Extract all {entity}s in this text: {text}

Instructions:
1. If no such entity or entities are found, then **return exactly**: Not Found
2. Extract only the entity. If only an abbreviation is present, expand it based on the
biomedical context in the given paragraph. For e.g., BA12 full form is Brodmann (1909) area 12.
3. Do not provide any additional information or formatting.

Do not guess or hallucinate if you are uncertain. This has high-stakes, so it's better to be safe
than sorry. This is very important, so you'd better be sure of your answer, OK?
"""
|
|
|
|
|
# Prompt template for the normalization step: the model must pick one term
# from a supplied candidate list.
# Placeholders filled via str.format():
#   {top_k_preds} - the candidate standard terms
#   {entity}      - the raw (unnormalized) entity to match
RAG_PROMPT = """
You are an expert normalizer of biomedical entities.

Given the following list of candidate standard terms: {top_k_preds},
find the single closest matching term for this unnormalized entity: {entity}.

Instructions:
1. **IMPORTANT:** Do **NOT** guess or hallucinate. Do **NOT** provide any term that
is not explicitly present in the list of standardized terms.
2. Do not overgeneralize unless no match is available.
3. Do not provide any additional information or formatting.

This has high-stakes, so it's better to be safe than sorry. This is very important, so you'd better
be sure of your answer, OK?
"""
|
|
|
|
|
|
|
|
async def extract_entities(paragraph: str, target_entity: str) -> Optional[list[str]]:
    """
    Extract entities of a specific type from a given paragraph.

    The paragraph and entity type are substituted into NER_PROMPT and the
    resulting prompt is sent to ask_openai() with usage="ner". Whatever that
    call returns is passed back to the caller unchanged.

    Args:
        paragraph (str): The paragraph from which entities are to be extracted.
        target_entity (str): The type of entity to extract from the paragraph (e.g., 'disease', 'tissue').

    Returns:
        Optional[list[str]]: A list of extracted entities of the specified type, or
        None if the model did not return a valid response.
    """
    prompt = NER_PROMPT.format(entity=target_entity, text=paragraph)

    # NOTE(review): NER_PROMPT instructs the model to answer "Not Found" when
    # nothing matches. Presumably ask_openai(usage="ner") turns that sentinel
    # into None or an empty list - confirm, otherwise "Not Found" could be
    # passed downstream as if it were an entity.
    return await ask_openai(prompt, usage="ner")
|
|
|
|
|
|
|
|
async def normalize_entities(raw_terms: list[str]) -> list[Optional[str]]:
    """
    Normalize a list of raw terms to the most appropriate standard terms from a list
    of candidates.

    This function is designed to process the output from extract_entities().
    Each term is handled independently and all terms are processed concurrently:
    candidates are fetched with get_candidates(), substituted into RAG_PROMPT
    together with the raw term, and the prompt is sent to ask_openai() with
    usage="rag".

    Args:
        raw_terms (list[str]): List of unnormalized terms, typically from extract_entities().

    Returns:
        list[Optional[str]]: List of best matching standard terms in the same order as the
        input terms. An entry may be None if normalization failed.
    """

    async def process_single_entity(raw_term: str) -> Optional[str]:
        # Run the candidate lookup in a worker thread so the event loop stays
        # free while it executes (presumably get_candidates is a synchronous,
        # potentially slow call - confirm).
        candidate_std_terms = await asyncio.to_thread(get_candidates, raw_term)

        # Ask the model to choose the closest term among the retrieved candidates.
        prompt = RAG_PROMPT.format(entity=raw_term, top_k_preds=candidate_std_terms)
        return await ask_openai(prompt, usage="rag")

    tasks = [process_single_entity(entity) for entity in raw_terms]

    # asyncio.gather returns results in the order the awaitables were passed,
    # so output position i corresponds to raw_terms[i]. With the default
    # return_exceptions=False, an exception raised for any single term
    # propagates to the caller instead of yielding None for that entry.
    normalized_entities = await asyncio.gather(*tasks)

    return normalized_entities
|
|
|
|
|
|
|
|
async def extract_and_normalize(
    paragraph: str, target_entity: str
) -> list[Optional[str]]:
    """
    Extract entities from a paragraph and normalize them in one operation.

    Calls extract_entities() and, if it produced anything, passes the result to
    normalize_entities().

    Args:
        paragraph: The paragraph from which to extract entities.
        target_entity: The type of entity to extract and normalize.

    Returns:
        list[Optional[str]]: List of best matching standard terms in the same order as the
        extracted entities. An entry may be None if normalization failed. An empty
        list is returned when extraction yields None or no entities.
    """
    extracted_entities = await extract_entities(paragraph, target_entity)

    # A single truthiness check covers both None and an empty list.
    if not extracted_entities:
        return []

    return await normalize_entities(extracted_entities)
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: a text box and an entity-type selector feed extract_and_normalize();
# the normalized terms are shown as JSON.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Entity Extraction & Normalization")
    gr.Markdown(
        "Enter text and specify the entity type to extract and normalize entities."
    )

    # Inputs: wide paragraph box next to a narrow entity-type dropdown.
    with gr.Row():
        with gr.Column(scale=3):
            paragraph = gr.Textbox(
                label="Text Input",
                placeholder="Enter paragraph here...",
                lines=5,
                info="Enter biomedical text input for entity extraction.",
            )
        with gr.Column(scale=1):
            target_entity = gr.Dropdown(
                ["Disease", "Tissue", "Cell Type"],
                label="Entity Type",
                value="Disease",
                info="Select the type of entity you want to extract and normalize from the text.",
            )

    normalize_btn = gr.Button("Normalize", variant="primary")

    # Output: the list returned by extract_and_normalize().
    with gr.Row():
        with gr.Column():
            output = gr.JSON(label="Normalized Entities")

    # Status line used to show a transient "Processing..." message.
    with gr.Row():
        status = gr.Markdown("")

    with gr.Accordion("Example Inputs", open=False):
        gr.Examples(
            examples=[
                ["The patient was diagnosed with diabetes and hypertension.", "Disease"],
                [
                    "Samples of BA12 tissue, weighing approximately 50-100 mg each, were homogenized in nuclei extraction buffer.",
                    "Tissue",
                ],
                [
                    "Coupling scTCR-seq with scRNA-seq can reveal the relationship between clonotype and phenotype in T or B cell populations.",
                    "Cell Type",
                ],
            ],
            inputs=[paragraph, target_entity],
        )

    # Three-step event chain on click:
    #   1. set the status text (queue=False so it is not held behind queued jobs),
    #   2. run extraction + normalization and write the result to the JSON output,
    #   3. clear the status text again.
    # .then() is used for step 3 rather than .success(), so the status is
    # cleared even when step 2 raises.
    normalize_btn.click(
        lambda: "Processing...",
        None,
        status,
        queue=False,
    ).then(
        extract_and_normalize,
        [paragraph, target_entity],
        output,
    ).then(
        lambda: "",
        None,
        status,
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio app with its MCP server enabled; only runs when the
    # module is executed as a script, not when it is imported.
    demo.launch(mcp_server=True)
|
|
|