# Spaces status: Runtime error
| from transformers import MllamaForConditionalGeneration, AutoProcessor | |
| from PIL import Image | |
| import torch | |
| import gradio as gr | |
| import spaces | |
# Checkpoint shared by the model weights and the matching processor.
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"

# Load the weights in bfloat16 first, then move the model onto the GPU.
model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16)
model = model.to("cuda")

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(ckpt)
# NOTE(review): `spaces` is imported at the top of the file but never used.
# If this app is meant for ZeroGPU hardware, this function probably needs the
# `@spaces.GPU` decorator -- confirm against the deployment target.
def extract_text(image):
    """Transcribe the text visible in an image using the vision-language model.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``. ``None``
            is accepted and treated as "nothing to transcribe".

    Returns:
        The model's transcription as a whitespace-stripped string, or ``""``
        when no image was supplied.
    """
    # Guard against a submission without an image instead of crashing in PIL.
    if image is None:
        return ""

    # convert() fully loads the pixel data, so the source file can be closed
    # as soon as the RGB copy exists.
    with Image.open(image) as source_image:
        rgb_image = source_image.convert("RGB")

    prompt = (
        "Output ONLY the raw text exactly as it appears in the image. Do not add anything.\n\n"
        "The image may contain both handwritten and printed text in French and/or English, including punctuation and underscores.\n\n"
        "Your task: Transcribe all visible text exactly, preserving:\n"
        "- All characters, accents, punctuation, spacing, and line breaks.\n"
        "- The original reading order and layout, including tables and forms if present.\n\n"
        "Rules:\n"
        "- Do NOT add any explanations, summaries, comments, or extra text.\n"
        "- Do NOT duplicate any content.\n"
        "- Do NOT indicate blank space.\n"
        "- Do NOT separate handwritten and printed text.\n"
        "- Do NOT confuse '.' (a period) with '|' (a border).\n\n"
        "Only extract the text that is actually visible in the image, and nothing else."
    )

    # Single user turn: the instruction text followed by an image placeholder.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"},
            ],
        }
    ]

    # Render the chat template and tokenize it together with the image.
    chat_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=chat_text, images=[rgb_image], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=250)

    # generate() returns the prompt tokens followed by the completion. Decode
    # only the newly generated tokens rather than string-stripping "user",
    # "assistant" and the prompt afterwards: that approach also deleted those
    # words from the transcription itself.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    result = processor.decode(generated_tokens, skip_special_tokens=True).strip()

    print(result)
    return result
# Build the input and output widgets first, then wire them into the interface.
image_input = gr.Image(type="filepath", label="Upload Image")
text_output = gr.Textbox(label="Extracted Text")

demo = gr.Interface(
    fn=extract_text,
    inputs=image_input,
    outputs=text_output,
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)

# Start the app with debug output enabled.
demo.launch(debug=True)