import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import base64
import io
import os
import json
from huggingface_hub import login
from pdf2image import convert_from_bytes
import tempfile
from datetime import datetime
# Set your HF token (add this to your Space secrets)
HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Global variables for model caching
_model = None
_tokenizer = None
def load_model():
    """Load MiniCPM model on GPU when needed"""
    global _model, _tokenizer
    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer
    try:
        _tokenizer = AutoTokenizer.from_pretrained(
            "openbmb/MiniCPM-V-2_6",
            trust_remote_code=True,
            use_fast=True
        )
        _model = AutoModel.from_pretrained(
            "openbmb/MiniCPM-V-2_6",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        return _model, _tokenizer
    except Exception as e:
        print(f"Error loading gated model: {e}")
        _tokenizer = AutoTokenizer.from_pretrained(
            "openbmb/MiniCPM-V-2",
            trust_remote_code=True,
            use_fast=True
        )
        _model = AutoModel.from_pretrained(
            "openbmb/MiniCPM-V-2",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        return _model, _tokenizer
def pdf_to_images(pdf_file):
    """Convert PDF file to list of PIL images"""
    try:
        if hasattr(pdf_file, 'read'):
            pdf_bytes = pdf_file.read()
        else:
            with open(pdf_file, 'rb') as f:
                pdf_bytes = f.read()
        images = convert_from_bytes(pdf_bytes, dpi=300)
        return images
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return []
def get_medical_extraction_prompt():
    """Get the medical data extraction prompt"""
    return """You are a medical document OCR and data extraction specialist. Analyze this medical document image and extract ALL visible information. Return the data in this exact JSON format:
{
  "data": {
    "date_of_receipt": "",
    "patient_first_name": "",
    "patient_last_name": "",
    "patient_dob": "",
    "patient_gender": "",
    "patient_primary_phone_number": "",
    "patient_secondary_phone_number": "",
    "patient_email": "",
    "patient_address": "",
    "patient_zip_code": "",
    "referral_source": "",
    "referral_source_phone_no": "",
    "referral_source_fax_no": "",
    "referral_source_email": "",
    "primary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "secondary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "tertiary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "priority": "",
    "reason_for_referral": "",
    "diagnosis_informations": [
      {
        "code": "",
        "description": ""
      }
    ],
    "refine_reason": ""
  },
  "confidence_scores": {
    "date_of_receipt": 0.0,
    "patient_first_name": 0.0,
    "patient_last_name": 0.0,
    "patient_dob": 0.0,
    "patient_gender": 0.0,
    "patient_primary_phone_number": 0.0,
    "patient_secondary_phone_number": 0.0,
    "patient_email": 0.0,
    "patient_address": 0.0,
    "patient_zip_code": 0.0,
    "referral_source": 0.0,
    "referral_source_phone_no": 0.0,
    "referral_source_fax_no": 0.0,
    "referral_source_email": 0.0,
    "primary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "priority": 0.0,
    "reason_for_referral": 0.0
  }
}

INSTRUCTIONS:
1. Read ALL text visible in the document
2. Extract exact values as they appear (no modifications)
3. For dates, use MM/DD/YYYY format
4. For phone numbers, use format like 850-463-0143
5. Assign confidence scores 0.0-1.0 (1.0 = completely certain, 0.0 = not found)
6. If information is not visible, leave field empty but still include it
7. Return ONLY the JSON, no other text"""
def extract_data_from_image(image, extraction_prompt):
    """Extract data from a single image using MiniCPM on GPU"""
    try:
        model, tokenizer = load_model()

        # Convert PIL image to proper format if needed
        if hasattr(image, 'convert'):
            image = image.convert('RGB')

        # Use the MiniCPM chat interface
        response = model.chat(
            image=image,
            msgs=[{
                "role": "user",
                "content": extraction_prompt
            }],
            tokenizer=tokenizer,
            sampling=False,  # Use deterministic output
            temperature=0.1,
            max_new_tokens=2048
        )

        # Try to parse JSON response
        try:
            parsed_data = json.loads(response)
            return {
                "status": "success",
                "extracted_data": parsed_data,
                "raw_response": response,
                "model_used": "MiniCPM-V-2_6-GPU"
            }
        except json.JSONDecodeError:
            return {
                "status": "partial_success",
                "extracted_data": response,
                "raw_response": response,
                "model_used": "MiniCPM-V-2_6-GPU",
                "note": "Response was not valid JSON"
            }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "extracted_data": None
        }
def combine_page_data(pages_data):
    """Combine extracted data from multiple pages into final medical record"""
    combined_data = {
        "date_of_receipt": "",
        "patient_first_name": "",
        "patient_last_name": "",
        "patient_dob": "",
        "patient_gender": "",
        "patient_primary_phone_number": "",
        "patient_secondary_phone_number": "",
        "patient_email": "",
        "patient_address": "",
        "patient_zip_code": "",
        "referral_source": "",
        "referral_source_phone_no": "",
        "referral_source_fax_no": "",
        "referral_source_email": "",
        "primary_insurance": {
            "payer_name": "",
            "member_id": "",
            "group_id": ""
        },
        "secondary_insurance": {
            "payer_name": None,
            "member_id": None,
            "group_id": None
        },
        "tertiary_insurance": {
            "payer_name": None,
            "member_id": None,
            "group_id": None
        },
        "priority": "",
        "reason_for_referral": "",
        "diagnosis_informations": [],
        "refine_reason": "",
        "extracted_page_numbers": []
    }
    combined_confidence = {}

    # Combine data from all pages
    for page_num, page_data in enumerate(pages_data, 1):
        if page_data["page_data"]["status"] == "success":
            extracted = page_data["page_data"]["extracted_data"]
            # If we got JSON data, merge it
            if isinstance(extracted, dict) and "data" in extracted:
                page_info = extracted["data"]
                # Merge non-empty fields (first non-empty value wins).
                # Nested dicts such as the insurance blocks are merged per sub-field,
                # and list fields (diagnosis_informations) are accumulated across pages.
                for field, value in page_info.items():
                    if field not in combined_data or not value:
                        continue
                    if isinstance(combined_data[field], dict) and isinstance(value, dict):
                        for sub_field, sub_value in value.items():
                            if sub_value and not combined_data[field].get(sub_field):
                                combined_data[field][sub_field] = sub_value
                    elif isinstance(combined_data[field], list) and isinstance(value, list):
                        combined_data[field].extend(v for v in value if v)
                    elif not combined_data[field]:
                        combined_data[field] = value
                combined_data["extracted_page_numbers"].append(page_num)

            # Merge confidence scores (skip nested score dicts so the numeric
            # comparison below never runs on a non-numeric value)
            if isinstance(extracted, dict) and "confidence_scores" in extracted:
                for field, score in extracted["confidence_scores"].items():
                    if isinstance(score, (int, float)) and field not in combined_confidence and score > 0:
                        combined_confidence[field] = score

    return {
        "data": combined_data,
        "confidence_scores": combined_confidence,
        "fields_needing_review": [],
        "metadata": {
            "extraction_timestamp": datetime.now().isoformat(),
            "model_used": "MiniCPM-V-2_6-GPU",
            "confidence_threshold": 0.9,
            "requires_human_review": False,
            "total_pages_processed": len(pages_data)
        }
    }
# 3 minutes for processing
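# Decorator assumed from the `import spaces` at the top and the duration note above:
# on ZeroGPU Spaces, spaces.GPU requests a GPU slot for the decorated call.
@spaces.GPU(duration=180)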
def extract_efax_from_pdf(pdf_file, custom_prompt=None):
    """Main function to process multi-page PDF eFax on GPU"""
    try:
        if pdf_file is None:
            return {
                "status": "error",
                "error": "No PDF file provided",
                "total_pages": 0,
                "pages_data": []
            }

        # Convert PDF to images
        images = pdf_to_images(pdf_file)
        if not images:
            return {
                "status": "error",
                "error": "Could not convert PDF to images",
                "total_pages": 0,
                "pages_data": []
            }

        # Use custom prompt or default medical extraction prompt
        extraction_prompt = custom_prompt if custom_prompt else get_medical_extraction_prompt()

        # Process each page
        pages_data = []
        for i, image in enumerate(images):
            print(f"Processing page {i+1}/{len(images)}")
            page_result = extract_data_from_image(image, extraction_prompt)
            pages_data.append({
                "page_number": i + 1,
                "page_data": page_result
            })

        # Combine data from all pages
        combined_result = combine_page_data(pages_data)

        # Final result structure
        result = {
            "status": "success",
            "total_pages": len(images),
            "pages_data": pages_data,
            "combined_extraction": combined_result,
            "model_used": "MiniCPM-V-2_6-ZeroGPU",
            "hardware": "ZeroGPU"
        }
        return result
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "total_pages": 0,
            "pages_data": []
        }
# Create Gradio Interface
def create_gradio_interface():
    with gr.Blocks(title="eFax PDF Data Extractor - ZeroGPU", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# eFax Medical Data Extraction API")
        gr.Markdown("**GPU-Accelerated** processing using MiniCPM-V-2_6 on ZeroGPU")

        with gr.Tab("PDF Upload & Extraction"):
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(
                        file_types=[".pdf"],
                        label="Upload eFax PDF",
                        file_count="single"
                    )
                    with gr.Accordion("Advanced Options", open=False):
                        prompt_input = gr.Textbox(
                            value="",
                            label="Custom Extraction Prompt (leave empty for default medical extraction)",
                            lines=5,
                            placeholder="Leave empty to use optimized medical data extraction prompt..."
                        )
                    extract_btn = gr.Button("Extract Medical Data (GPU)", variant="primary", size="lg")
                with gr.Column():
                    status_output = gr.Textbox(label="Processing Status", interactive=False)
                    output = gr.JSON(label="Extracted Medical Data", show_label=True)
        with gr.Tab("API Usage"):
            gr.Markdown("""
            ## API Endpoints (ZeroGPU Powered)

            Your Space runs on **ZeroGPU** for 10-50x faster processing!

            ### Python API Usage
            ```python
            import requests
            import base64

            # Convert PDF to base64
            with open("medical_fax.pdf", "rb") as f:
                pdf_b64 = base64.b64encode(f.read()).decode()

            response = requests.post(
                "https://your-username-extracting-efax.hf.space/api/predict",
                json={
                    "data": [
                        {"name": "medical_fax.pdf", "data": f"data:application/pdf;base64,{pdf_b64}"},
                        ""  # Leave empty for default prompt
                    ]
                }
            )
            result = response.json()

            # Access combined medical data
            medical_data = result["data"]["combined_extraction"]
            print("Patient:", medical_data["data"]["patient_first_name"], medical_data["data"]["patient_last_name"])
            print("Insurance:", medical_data["data"]["primary_insurance"]["payer_name"])
            ```
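            Alternatively, here is a minimal sketch using the `gradio_client` package (recent
            versions); the `api_name` below is an assumption based on the handler function,
            so check the Space's "Use via API" page for the exact route:
            ```python
            from gradio_client import Client, handle_file

            client = Client("your-username/extracting-efax")
            # api_name is assumed from the handler function name; verify before use
            status, data = client.predict(
                handle_file("medical_fax.pdf"),
                "",  # empty custom prompt -> default medical extraction prompt
                api_name="/process_with_status"
            )
            print(status)
            ```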
            ### Response Format
            ```json
            {
              "status": "success",
              "total_pages": 13,
              "combined_extraction": {
                "data": {
                  "patient_first_name": "John",
                  "patient_last_name": "Doe",
                  "primary_insurance": {
                    "payer_name": "UNITED HEALTHCARE",
                    "member_id": "123456789"
                  }
                },
                "confidence_scores": {...},
                "metadata": {...}
              }
            }
            ```
            """)
        with gr.Tab("Performance Info"):
            gr.Markdown("""
            ## ZeroGPU Performance
            - **Hardware**: ZeroGPU (70GB VRAM)
            - **Speed**: 10-50x faster than CPU processing
            - **Processing Time**: 2-5 minutes for a 6-13 page eFax
            - **Model**: MiniCPM-V-2_6 optimized for medical documents
            - **Dynamic Allocation**: GPU activates only during processing

            ## Medical Data Extracted
            - Patient Demographics (Name, DOB, Gender, Address)
            - Contact Information (Phone, Email)
            - Insurance Information (Primary, Secondary, Tertiary)
            - Medical Codes & Diagnoses
            - Referral Source & Priority
            - Confidence Scores for Quality Control

            ## HIPAA Compliance
            - All processing in-memory (no persistent storage)
            - Secure GPU processing environment
            - Audit trail with confidence scores
            """)
        def process_with_status(pdf_file, custom_prompt):
            if pdf_file is None:
                # In a generator, a bare `return value` never reaches Gradio, so yield first
                yield "No PDF file uploaded", {"error": "Please upload a PDF file"}
                return
            yield "Converting PDF to images...", {}
            try:
                result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt and custom_prompt.strip() else None)
                if result["status"] == "success":
                    yield f"Successfully processed {result['total_pages']} pages", result
                else:
                    yield f"Error: {result.get('error', 'Unknown error')}", result
            except Exception as e:
                yield f"Processing failed: {str(e)}", {"error": str(e)}
        # Connect the interface
        extract_btn.click(
            fn=process_with_status,
            inputs=[pdf_input, prompt_input],
            outputs=[status_output, output],
            queue=True
        )

    return demo
# Launch the app
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.queue(
        default_concurrency_limit=1,
        max_size=10
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )