Commit c7a30f7 (parent: c80cc87): add olmo
app.py CHANGED:

@@ -3,7 +3,7 @@ from PIL import Image
 import xml.etree.ElementTree as ET
 import os
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
 import spaces
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # turn on HF_TRANSFER
@@ -14,7 +14,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s"]
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
 
 # Load RolmOCR
 try:
@@ -46,6 +46,19 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
 
+# Load olmOCR
+try:
+    PROCESSORS["olmOCR"] = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+    MODELS["olmOCR"] = Qwen2VLForConditionalGeneration.from_pretrained(
+        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
+    )
+    PIPELINES["olmOCR"] = pipeline(
+        "image-text-to-text", model=MODELS["olmOCR"], processor=PROCESSORS["olmOCR"]
+    )
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
+    print(f"Error loading olmOCR: {e}")
+
 
 # --- Helper Functions ---
 
@@ -193,7 +206,7 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-
+    elif model_name == "Nanonets-OCR-s":
         messages = [
             {
                 "role": "user",
@@ -206,6 +219,19 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
+    else:  # olmOCR
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": pil_image},
+                    {
+                        "type": "text",
+                        "text": "Extract all text from this document image, preserving the original reading order and layout structure. Return the plain text representation.",
+                    },
+                ],
+            }
+        ]
     max_tokens = 8096
     # Use the pipeline with the properly formatted messages
     return selected_pipe(messages, max_new_tokens=max_tokens)
@@ -354,7 +380,8 @@ with gr.Blocks() as demo:
         "Upload a historical document image and its XML file to compare these approaches side-by-side. "
         "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
         "**Available models:** [RolmOCR](https://huggingface.co/reducto/RolmOCR) | "
-        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
+        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) | "
+        "[olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview)"
     )
 
     gr.Markdown("---")
@@ -394,7 +421,7 @@ with gr.Blocks() as demo:
         choices=AVAILABLE_MODELS,
        value="RolmOCR",
         label="Choose Model",
-        info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support",
+        info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
     )
 
     submit_button = gr.Button(
@@ -461,6 +488,11 @@ with gr.Blocks() as demo:
             "examples/one/74442232.34.xml",
             "Nanonets-OCR-s",
         ],
+        [
+            "examples/one/74442232.3.jpg",
+            "examples/one/74442232.34.xml",
+            "olmOCR",
+        ],
     ],
     inputs=[image_input, xml_input, model_selector],
     outputs=[
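
Taken together, the commit wires olmOCR into the app in four places: the checkpoint is loaded once at startup into the shared PROCESSORS/MODELS/PIPELINES registries, predict() gains an else branch with an olmOCR-specific prompt, and the model list, description, selector hint, and examples are extended. Below is a minimal standalone sketch of the new inference path. The checkpoint names, the pipeline task, the prompt text, and max_new_tokens=8096 come straight from the diff; the image path is one of the repo's example files used for illustration, and the output indexing at the end is an assumption that may need adjusting to your transformers version.

    # Standalone sketch of the olmOCR path this commit adds.
    # Assumes a GPU with enough memory for a 7B model in bfloat16.
    import torch
    from PIL import Image
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, pipeline

    # olmOCR reuses the Qwen2-VL processor, exactly as in the diff.
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
    )
    ocr = pipeline("image-text-to-text", model=model, processor=processor)

    # Illustrative input: one of the example images shipped with the Space.
    image = Image.open("examples/one/74442232.3.jpg")
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {
                    "type": "text",
                    "text": "Extract all text from this document image, preserving "
                    "the original reading order and layout structure. "
                    "Return the plain text representation.",
                },
            ],
        }
    ]
    result = ocr(messages, max_new_tokens=8096)
    # The chat-style pipeline output typically appends the assistant turn to the
    # input messages; the exact shape can vary across transformers versions.
    print(result[0]["generated_text"][-1]["content"])

Note that the app keeps a single predict() entry point: each model only swaps in its own message list, so the final call, selected_pipe(messages, max_new_tokens=max_tokens), is identical for all three models.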