Spaces:
Paused
Paused
| import spaces | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModel, AutoTokenizer | |
| from PIL import Image | |
| import base64 | |
| import io | |
| import os | |
| import json | |
| from huggingface_hub import login | |
| from pdf2image import convert_from_bytes | |
| from datetime import datetime | |
# Log in to the Hugging Face Hub only when a token is present in the environment.
HF_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Process-wide cache filled by load_model() on its first successful call.
_model, _tokenizer = None, None
def _load_checkpoint(model_id):
    """Load the tokenizer, then the model, for ``model_id`` using the shared settings."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=True,
    )
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return model, tokenizer


def load_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    ``openbmb/MiniCPM-V-2_6`` is tried first. If loading it raises for any
    reason, the error is printed and ``openbmb/MiniCPM-V-2`` is loaded instead.
    An exception raised while loading the fallback propagates to the caller.

    Returns:
        tuple: ``(model, tokenizer)`` as produced by ``AutoModel`` and
        ``AutoTokenizer``.
    """
    global _model, _tokenizer

    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    try:
        model, tokenizer = _load_checkpoint("openbmb/MiniCPM-V-2_6")
    except Exception as e:
        print(f"Error loading gated model: {e}")
        # NOTE(review): confirm the fallback checkpoint's chat() accepts the
        # same arguments that extract_single_page passes.
        model, tokenizer = _load_checkpoint("openbmb/MiniCPM-V-2")

    # Publish both halves together so a failed load never leaves the cache
    # holding a tokenizer without its matching model.
    _model, _tokenizer = model, tokenizer
    return _model, _tokenizer
def pdf_to_images(pdf_file):
    """Render every page of a PDF as an image.

    Args:
        pdf_file: Either an object exposing ``read()`` or a path that
            ``open`` accepts.

    Returns:
        The list produced by ``convert_from_bytes`` at 300 DPI, or an empty
        list when reading or conversion raises (the error is printed).
    """
    try:
        if not hasattr(pdf_file, 'read'):
            # Treat anything without read() as a filesystem path.
            with open(pdf_file, 'rb') as handle:
                raw = handle.read()
        else:
            raw = pdf_file.read()
        return convert_from_bytes(raw, dpi=300)
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return []
def clean_empty_fields(data):
    """Return a copy of ``data`` with empty entries pruned, recursing into dicts.

    Non-dict input is returned unchanged. Within a dict:

    * nested dicts are cleaned recursively and kept only if something remains;
    * lists keep their truthy items (dict items are cleaned first) and are
      kept only if at least one item remains;
    * any other value is kept unless it equals ``None``, ``""``, ``[]`` or
      ``{}`` -- so ``0``, ``0.0`` and ``False`` survive.
    """
    if not isinstance(data, dict):
        return data

    result = {}
    for name, field in data.items():
        if isinstance(field, dict):
            kept = clean_empty_fields(field)
            keep = bool(kept)
        elif isinstance(field, list):
            pruned = (
                clean_empty_fields(entry) if isinstance(entry, dict) else entry
                for entry in field
            )
            kept = [entry for entry in pruned if entry]
            keep = bool(kept)
        else:
            kept = field
            keep = field not in [None, "", [], {}]

        if keep:
            result[name] = kept
    return result
def get_comprehensive_medical_extraction_prompt():
    """Return the default single-page extraction prompt.

    The prompt contains the JSON template the model is asked to fill in,
    followed by the formatting, referral-source, insurance, diagnosis and
    confidence-scoring rules. It is plain text with no placeholders.

    Returns:
        str: The complete prompt.
    """
    return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
Your response MUST follow this exact JSON format:
{
  "page_analysis": {
    "page_contains_text": true,
    "page_type": "cover_page|patient_demographics|insurance|medical_history|referral_info|other",
    "overall_page_confidence": 0.0,
    "all_visible_text": "Complete text transcription of everything visible on this page"
  },
  "extracted_data": {
    "date_of_receipt": "",
    "patient_first_name": "",
    "patient_last_name": "",
    "patient_dob": "",
    "patient_gender": "",
    "patient_primary_phone_number": "",
    "patient_secondary_phone_number": "",
    "patient_email": "",
    "patient_address": "",
    "patient_zip_code": "",
    "referral_source": "",
    "referral_source_phone_no": "",
    "referral_source_fax_no": "",
    "referral_source_email": "",
    "primary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "secondary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "tertiary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "priority": "",
    "reason_for_referral": "",
    "diagnosis_informations": [
      {
        "code": "",
        "description": ""
      }
    ],
    "refine_reason": "",
    "additional_medical_info": "",
    "provider_names": [],
    "appointment_dates": [],
    "medication_info": [],
    "other_important_details": ""
  },
  "confidence_scores": {
    "date_of_receipt": 0.0,
    "patient_first_name": 0.0,
    "patient_last_name": 0.0,
    "patient_dob": 0.0,
    "patient_gender": 0.0,
    "patient_primary_phone_number": 0.0,
    "patient_secondary_phone_number": 0.0,
    "patient_email": 0.0,
    "patient_address": 0.0,
    "patient_zip_code": 0.0,
    "referral_source": 0.0,
    "referral_source_phone_no": 0.0,
    "referral_source_fax_no": 0.0,
    "referral_source_email": 0.0,
    "primary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "secondary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "tertiary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "priority": 0.0,
    "reason_for_referral": 0.0,
    "diagnosis_informations": 0.0,
    "refine_reason": 0.0
  },
  "fields_found_on_this_page": [],
  "metadata": {
    "extraction_timestamp": "",
    "model_used": "MiniCPM-V-2_6-GPU",
    "page_processing_notes": ""
  }
}
--------------------------------
STRICT FIELD FORMATTING RULES:
--------------------------------
• Dates: Format as MM/DD/YYYY only
• Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses
• Gender: "Male", "Female", or "Other" only
• Email: Must contain @ and valid domain, otherwise leave empty
• Zip code: Only extract as last 5 digits of address
--------------------------------
REFERRAL SOURCE RULES:
--------------------------------
• Extract clinic/hospital/facility name ONLY — never the provider's name
• Use facility's phone/fax/email, not individual provider's contact
• Prefer header/fax banner for referral source over body text
• Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source
--------------------------------
INSURANCE EXTRACTION FORMAT:
--------------------------------
Each tier must follow this structure:
"primary_insurance": {
  "payer_name": "string",
  "member_id": "string",
  "group_id": "string"
},
"secondary_insurance": { ... },
"tertiary_insurance": { ... }
• Use "member_id" for any ID (Policy, Insurance ID, Subscriber ID, etc.)
• Use "group_id" ONLY if explicitly labeled as "Group ID", "Group Number", etc.
• Leave all fields empty if "Self Pay" is indicated
--------------------------------
DIAGNOSIS EXTRACTION RULES:
--------------------------------
• Extract diagnosis codes AND their descriptions
• If only code is present, set description to "" and confidence ≤ 0.6
• DO NOT infer description from ICD code
--------------------------------
CONFIDENCE SCORING:
--------------------------------
Assign realistic confidence (0.0–1.0) per field, e.g.:
• 0.95–1.0 → Clearly labeled, unambiguous data
• 0.7–0.94 → Some uncertainty (low quality, odd format)
• 0.0–0.6 → Missing, ambiguous, or noisy data
• Use float precision (e.g., 0.87, not just 1.0)
Always populate the `confidence_scores` dictionary with the same structure as `extracted_data`.
--------------------------------
CRITICAL INSTRUCTIONS:
--------------------------------
1. READ EVERYTHING: Transcribe all visible text in "all_visible_text"
2. EXTRACT PRECISELY: Only extract what's actually visible on THIS page
3. NO ASSUMPTIONS: Don't guess or infer information not present
4. FIELD CLASSIFICATION: List which fields were actually found in "fields_found_on_this_page"
5. CONFIDENCE: Be realistic - 0.0 if not found, up to 1.0 if completely certain
6. FORMAT EXACTLY: Follow date/phone/address formatting rules strictly
7. JSON ONLY: Return only valid JSON, no other text
This is ONE PAGE of a multi-page document. Extract only what's visible on this specific page."""
def _parse_json_response(response):
    """Decode a model reply as JSON, tolerating text wrapped around the object.

    Returns the decoded value, or ``None`` when no valid JSON can be recovered.
    """
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    # The reply may carry a Markdown fence or prose around the JSON object, so
    # retry on the outermost {...} span before giving up.
    start, end = response.find("{"), response.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(response[start:end + 1])
    except json.JSONDecodeError:
        return None


def extract_single_page(image, extraction_prompt, model, tokenizer):
    """Run the extraction prompt on one page image and return the cleaned JSON.

    Args:
        image: Page image; converted to RGB when it exposes ``convert``.
        extraction_prompt: Prompt text sent as the single user message.
        model: Object exposing ``chat(image=..., msgs=..., tokenizer=..., ...)``.
        tokenizer: Tokenizer forwarded to ``model.chat``.

    Returns:
        The parsed reply after ``clean_empty_fields``, or ``None`` when the
        reply holds no valid JSON, the cleaned result is empty, or any step
        raises (the error is printed).
    """
    try:
        if hasattr(image, 'convert'):
            image = image.convert('RGB')
        response = model.chat(
            image=image,
            msgs=[{
                "role": "user",
                "content": extraction_prompt
            }],
            tokenizer=tokenizer,
            sampling=False,
            temperature=0.1,
            max_new_tokens=4000
        )

        parsed_data = _parse_json_response(response)
        if parsed_data is None:
            # Report the dropped page instead of discarding it silently.
            print("Model response did not contain valid JSON; skipping page")
            return None

        cleaned_data = clean_empty_fields(parsed_data)
        return cleaned_data if cleaned_data else None
    except Exception as e:
        print(f"Error extracting from page: {e}")
        return None
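# NOTE(review): "10 minutes" has no visible referent here; it presumably annotated a
# GPU time-budget decorator (e.g. @spaces.GPU(duration=600)) that is missing — confirm.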
def extract_pages_clean_json(pdf_file, custom_prompt=None):
    """Extract every page of a PDF separately and keep only pages that yield data.

    Args:
        pdf_file: PDF input forwarded to ``pdf_to_images``; ``None`` is rejected.
        custom_prompt: Optional prompt; when falsy the default medical
            extraction prompt is used.

    Returns:
        dict: ``{"page_<n>": <page json>}`` for each page with a non-empty
        result (1-based page numbers), or ``{"error": <message>}`` when no PDF
        is given, conversion yields no images, or an exception is raised.
    """
    if pdf_file is None:
        return {"error": "No PDF provided"}

    try:
        print("Converting PDF to images...")
        pages = pdf_to_images(pdf_file)
        if not pages:
            return {"error": "Could not convert PDF"}

        total = len(pages)
        print(f"Processing {total} pages individually...")

        # The model is loaded once and reused for every page.
        model, tokenizer = load_model()
        if custom_prompt:
            extraction_prompt = custom_prompt
        else:
            extraction_prompt = get_comprehensive_medical_extraction_prompt()

        page_results = {}
        for number, page in enumerate(pages, start=1):
            print(f"Extracting page {number}/{total}...")
            page_json = extract_single_page(page, extraction_prompt, model, tokenizer)
            if not page_json:
                continue  # Pages without data are left out of the result.
            page_results[f"page_{number}"] = page_json
        return page_results
    except Exception as e:
        return {"error": str(e)}
# Feature summary shown under the extract button.
_FEATURES_MD = """
### ✅ Clean Output Features
- **No Empty Fields**: Only fields with actual data
- **No Empty Pages**: Only pages containing information
- **Easier Combination**: Clean structure for AI merging
- **Optimized Size**: Reduced JSON payload
"""

# Client examples shown on the API tab.
_API_USAGE_MD = """
## Updated API Instructions
### Method 1: Python Client (Recommended)
```
pip install gradio_client
```
```
from gradio_client import Client, handle_file
import json
# Connect to your deployed Space
client = Client("crimsons-uv/miniCPM")
# Extract medical data from eFax PDF
def extract_efax_clean(pdf_path, custom_prompt=""):
    result = client.predict(
        pdf_file=handle_file(pdf_path),
        custom_prompt=custom_prompt,
        api_name="/process_with_status"
    )
    # result is tuple: [status_message, clean_json_data]
    status, clean_data = result
    print(f"Status: {status}")
    # Process only pages with data
    for page_key, page_data in clean_data.items():
        if page_key.startswith('page_'):
            print(f"\\n{page_key.upper()}:")
            if 'extracted_data' in page_data:
                data = page_data['extracted_data']
                if 'patient_first_name' in data:
                    print(f" Patient: {data['patient_first_name']} {data.get('patient_last_name', '')}")
                if 'primary_insurance' in data:
                    print(f" Insurance: {data['primary_insurance'].get('payer_name', '')}")
                if 'reason_for_referral' in data:
                    print(f" Reason: {data['reason_for_referral']}")
    return clean_data
# Usage
results = extract_efax_clean("path/to/your/efax.pdf")
```
### Method 2: cURL Commands
```
# Step 1: Make POST request
curl -X POST https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status \\
  -H "Content-Type: application/json" \\
  -d '{
    "data": [
      {"path": "your_efax.pdf", "meta": {"_type": "gradio.FileData"}},
      ""
    ]
  }' \\
  | awk -F'"' '{ print $4}' \\
  | read EVENT_ID; curl -N https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status/$EVENT_ID
```
### Method 3: Direct HTTP API
```
import requests
import base64
import json
def call_clean_extraction_api(pdf_path, custom_prompt=""):
    # Read and encode PDF
    with open(pdf_path, 'rb') as f:
        pdf_b64 = base64.b64encode(f.read()).decode()
    # API payload
    payload = {
        "data": [
            {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
            custom_prompt
        ]
    }
    # Make request
    response = requests.post(
        "https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status",
        json=payload,
        headers={"Content-Type": "application/json"}
    )
    return response.json()
# Usage
clean_results = call_clean_extraction_api("your_efax.pdf")
```
"""

# Sample payload shown on the response-format tab.
_RESPONSE_FORMAT_MD = """
## Clean Response Structure
### Input: 5-page PDF with mixed content
### Output: Only pages with data
```
{
  "page_2": {
    "page_analysis": {
      "page_type": "patient_demographics",
      "overall_page_confidence": 0.95,
      "all_visible_text": "Patient: John Doe..."
    },
    "extracted_data": {
      "patient_first_name": "John",
      "patient_last_name": "Doe",
      "patient_dob": "01/15/1980",
      "patient_gender": "Male",
      "patient_primary_phone_number": "555-123-4567",
      "patient_address": "123 Main St, City, State 12345",
      "patient_zip_code": "12345"
    },
    "confidence_scores": {
      "patient_first_name": 1.0,
      "patient_last_name": 1.0,
      "patient_dob": 0.95,
      "patient_gender": 1.0
    },
    "fields_found_on_this_page": ["patient_first_name", "patient_last_name", "patient_dob"]
  },
  "page_3": {
    "extracted_data": {
      "primary_insurance": {
        "payer_name": "Blue Cross Blue Shield",
        "member_id": "ABC123456789",
        "group_id": "GRP001"
      },
      "reason_for_referral": "Cardiology consultation"
    },
    "confidence_scores": {
      "primary_insurance": {
        "payer_name": 1.0,
        "member_id": 0.98,
        "group_id": 0.95
      },
      "reason_for_referral": 1.0
    }
  }
}
```
### Benefits for AI Combination:
- ✅ **No empty pages**: Pages 1, 4, 5 had no data, so not included
- ✅ **No empty fields**: Only fields with actual values
- ✅ **Smaller payload**: Reduced data size for faster processing
- ✅ **Easy merging**: Clear structure for combining with ChatGPT/Claude
"""


def create_gradio_interface():
    """Build the Gradio Blocks app for page-by-page PDF extraction.

    The app has three tabs: the extraction form (PDF upload, optional custom
    prompt, status box and JSON output), API usage examples, and a sample of
    the response format. The extract button is wired to a generator handler
    that streams a status message together with the JSON payload.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    with gr.Blocks(title="Clean Medical eFax Extractor", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🏥 Clean Medical eFax Data Extractor")
        gr.Markdown("**Returns Only Non-Empty Data** - Clean page-by-page extraction without empty fields")

        with gr.Tab("Clean JSON Extraction"):
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(
                        file_types=[".pdf"],
                        label="Upload Medical eFax PDF",
                        file_count="single"
                    )
                    with gr.Accordion("🔧 Custom Prompt", open=False):
                        prompt_input = gr.Textbox(
                            value="",
                            label="Custom Extraction Prompt (optional)",
                            lines=4,
                            placeholder="Leave empty for comprehensive medical extraction..."
                        )
                    extract_btn = gr.Button("Extract Clean JSON", variant="primary", size="lg")
                    gr.Markdown(_FEATURES_MD)
                with gr.Column():
                    status_output = gr.Textbox(label="Processing Status", interactive=False)
                    output = gr.JSON(label="Clean JSON Results", show_label=True)

        with gr.Tab("API Usage Instructions"):
            gr.Markdown(_API_USAGE_MD)

        with gr.Tab("Response Format"):
            gr.Markdown(_RESPONSE_FORMAT_MD)

        def process_with_status(pdf_file, custom_prompt):
            """Yield ``(status message, JSON payload)`` updates for one request."""
            if pdf_file is None:
                # This function is a generator, so the message has to be
                # yielded; a value attached to `return` never reaches the UI.
                yield "❌ No PDF uploaded", {"error": "Upload a PDF file"}
                return

            yield "Converting PDF to images...", {}
            try:
                # custom_prompt may be None when the caller omits it.
                has_prompt = bool((custom_prompt or "").strip())
                result = extract_pages_clean_json(pdf_file, custom_prompt if has_prompt else None)
                if "error" not in result:
                    page_count = sum(1 for key in result if key.startswith("page_"))
                    yield f"✅ Extracted clean data from {page_count} pages with content", result
                else:
                    yield f"❌ Error: {result['error']}", result
            except Exception as e:
                yield f"❌ Failed: {str(e)}", {"error": str(e)}

        extract_btn.click(
            fn=process_with_status,
            inputs=[pdf_input, prompt_input],
            outputs=[status_output, output],
            # Matches the endpoint name used in the API usage examples.
            api_name="process_with_status",
            queue=True
        )

    return demo
if __name__ == "__main__":
    # Build the UI, enable the request queue (one job at a time, at most ten
    # waiting), then serve on all interfaces at port 7860.
    demo = create_gradio_interface()
    queued_app = demo.queue(default_concurrency_limit=1, max_size=10)
    queued_app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)