prithivMLmods committed on
Commit 85c9aea · verified · 1 Parent(s): 5666451
Files changed (3)
  1. app.py +317 -0
  2. pre-requirements.txt +1 -0
  3. requirements.txt +38 -0
app.py ADDED
@@ -0,0 +1,317 @@
+ import os
+ import hashlib
+ import spaces
+ import re
+ import time
+ import click
+ import gradio as gr
+ from io import BytesIO
+ from PIL import Image
+ from loguru import logger
+ from pathlib import Path
+ import torch
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+ from transformers.image_utils import load_image
+ import fitz
+ import html2text
+ import markdown
+ import tempfile
+ from typing import Optional, Tuple, Dict, Any, List
+
+ pdf_suffixes = [".pdf"]
+ image_suffixes = [".png", ".jpeg", ".jpg"]
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ logger.info(f"Using device: {device}")
+
+ # Model 1: Logics-Parsing
+ MODEL_ID_1 = "Logics-MLLM/Logics-Parsing"
+ logger.info(f"Loading model 1: {MODEL_ID_1}")
+ processor_1 = AutoProcessor.from_pretrained(MODEL_ID_1, trust_remote_code=True)
+ model_1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_1,
+     trust_remote_code=True,
+     torch_dtype=torch.float16 if device == "cuda" else torch.float32
+ ).to(device).eval()
+ logger.info(f"Model '{MODEL_ID_1}' loaded successfully.")
+
+ # Model 2: Gliese-OCR-7B-Post1.0
+ MODEL_ID_2 = "prithivMLmods/Gliese-OCR-7B-Post1.0"
+ logger.info(f"Loading model 2: {MODEL_ID_2}")
+ processor_2 = AutoProcessor.from_pretrained(MODEL_ID_2, trust_remote_code=True)
+ model_2 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_2,
+     trust_remote_code=True,
+     torch_dtype=torch.float16 if device == "cuda" else torch.float32
+ ).to(device).eval()
+ logger.info(f"Model '{MODEL_ID_2}' loaded successfully.")
+
+ @spaces.GPU
+ def parse_page(image: Image.Image, model_name: str) -> str:
+     """
+     Parses a single document page image using the selected model.
+     """
+     if model_name == "Logics-Parsing":
+         current_processor, current_model = processor_1, model_1
+     elif model_name == "Gliese-OCR-7B-Post1.0":
+         current_processor, current_model = processor_2, model_2
+     else:
+         raise ValueError(f"Unknown model choice: {model_name}")
+
+     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
+     prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = current_processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+
+     with torch.no_grad():
+         generated_ids = current_model.generate(**inputs, max_new_tokens=2048, temperature=0.1, top_p=0.9, do_sample=True, repetition_penalty=1.05)
+
+     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+     output_text = current_processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+     return output_text
+
+ def convert_file_to_images(file_path: str, dpi: int = 200) -> List[Image.Image]:
+     """
+     Converts a PDF or image file into a list of PIL Images.
+     """
+     images = []
+     file_ext = Path(file_path).suffix.lower()
+
+     if file_ext in image_suffixes:
+         images.append(Image.open(file_path).convert("RGB"))
+         return images
+
+     if file_ext not in pdf_suffixes:
+         raise ValueError(f"Unsupported file type: {file_ext}")
+
+     try:
+         pdf_document = fitz.open(file_path)
+         zoom = dpi / 72.0
+         mat = fitz.Matrix(zoom, zoom)
+         for page_num in range(len(pdf_document)):
+             page = pdf_document.load_page(page_num)
+             pix = page.get_pixmap(matrix=mat)
+             img_data = pix.tobytes("png")
+             images.append(Image.open(BytesIO(img_data)))
+         pdf_document.close()
+     except Exception as e:
+         logger.error(f"Failed to convert PDF using PyMuPDF: {e}")
+         raise
+     return images
+
+ def get_initial_state() -> Dict[str, Any]:
+     """Returns the default initial state for the application."""
+     return {"pages": [], "total_pages": 0, "current_page_index": 0, "page_results": []}
+
+ def load_and_preview_file(file_path: Optional[str]) -> Tuple[Optional[Image.Image], str, Dict[str, Any]]:
+     """
+     Loads a file, converts all pages to images, and stores them in the state.
+     """
+     state = get_initial_state()
+     if not file_path:
+         return None, '<div class="page-info">No file loaded</div>', state
+
+     try:
+         pages = convert_file_to_images(file_path)
+         if not pages:
+             return None, '<div class="page-info">Could not load file</div>', state
+
+         state["pages"] = pages
+         state["total_pages"] = len(pages)
+         page_info_html = f'<div class="page-info">Page 1 / {state["total_pages"]}</div>'
+         return pages[0], page_info_html, state
+     except Exception as e:
+         logger.error(f"Failed to load and preview file: {e}")
+         return None, '<div class="page-info">Failed to load preview</div>', state
+
+ async def process_all_pages(state: Dict[str, Any], model_choice: str):
+     """
+     Processes all pages stored in the state and updates the state with results.
+     """
+     if not state or not state["pages"]:
+         error_msg = "<h3>Please upload a file first.</h3>"
+         return error_msg, "", "", None, "Error: No file to process", state
+
+     logger.info(f'Processing {state["total_pages"]} pages with model: {model_choice}')
+     start_time = time.time()
+
+     try:
+         page_results = []
+         for i, page_img in enumerate(state["pages"]):
+             logger.info(f"Parsing page {i + 1}/{state['total_pages']}")
+             html_result = parse_page(page_img, model_choice)
+             page_results.append({'raw_html': html_result})
+
+         state["page_results"] = page_results
+
+         # Create a single markdown file for download with all content
+         full_html_content = "\n\n".join([f'<!-- Page {i+1} -->\n{res["raw_html"]}' for i, res in enumerate(page_results)])
+         full_markdown = html2text.html2text(full_html_content)
+         with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
+             f.write(full_markdown)
+             md_path = f.name
+
+         parsing_time = time.time() - start_time
+         cost_time_str = f'Total processing time: {parsing_time:.2f}s'
+
+         # Display the results for the current page
+         current_page_results = get_page_outputs(state)
+
+         return *current_page_results, md_path, cost_time_str, state
+
+     except Exception as e:
+         logger.error(f"Parsing failed: {e}", exc_info=True)
+         error_html = f"<h3>An error occurred during processing:</h3><p>{str(e)}</p>"
+         return error_html, "", "", None, f"Error: {str(e)}", state
+
+ def navigate_page(direction: str, state: Dict[str, Any]):
+     """
+     Navigates to the previous or next page and updates the UI accordingly.
+     """
+     if not state or not state["pages"]:
+         return None, '<div class="page-info">No file loaded</div>', *get_page_outputs(state), state
+
+     current_index = state["current_page_index"]
+     total_pages = state["total_pages"]
+
+     if direction == "prev":
+         new_index = max(0, current_index - 1)
+     elif direction == "next":
+         new_index = min(total_pages - 1, current_index + 1)
+     else:
+         new_index = current_index
+
+     state["current_page_index"] = new_index
+
+     image_preview = state["pages"][new_index]
+     page_info_html = f'<div class="page-info">Page {new_index + 1} / {total_pages}</div>'
+
+     page_outputs = get_page_outputs(state)
+
+     return image_preview, page_info_html, *page_outputs, state
+
+ def get_page_outputs(state: Dict[str, Any]) -> Tuple[str, str, str]:
+     """Helper to get displayable outputs for the current page."""
+     if not state or not state.get("page_results"):
+         return "<h3>Process the document to see results.</h3>", "", ""
+
+     index = state["current_page_index"]
+     result = state["page_results"][index]
+     raw_html = result['raw_html']
+
+     mmd_source = html2text.html2text(raw_html)
+     mmd_render = markdown.markdown(mmd_source, extensions=['fenced_code', 'tables'])
+
+     return mmd_render, mmd_source, raw_html
+
+ def clear_all():
+     """Clears all UI components and resets the state."""
+     return (
+         None,
+         None,
+         "<h3>Results will be displayed here after processing.</h3>",
+         "",
+         "",
+         None,
+         "",
+         '<div class="page-info">No file loaded</div>',
+         get_initial_state()
+     )
+
+ @click.command()
+ def main():
+     """
+     Sets up and launches the Gradio user interface for the Logics-Parsing app.
+     """
+     css = """
+     .main-container { max-width: 1400px; margin: 0 auto; }
+     .header-text { text-align: center; color: #2c3e50; margin-bottom: 20px; }
+     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
+     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
+     .page-info { text-align: center; padding: 8px 16px; border-radius: 20px; font-weight: bold; margin: 10px 0; }
+     """
+     with gr.Blocks(theme="bethecloud/storj_theme", css=css, title="Logics-Parsing Demo") as demo:
+         app_state = gr.State(value=get_initial_state())
+
+         gr.HTML("""
+         <div class="header-text">
+             <h1>📄 Logics-Parsing: Document Parsing VLM</h1>
+             <p style="font-size: 1.1em; color: #6b7280;">An advanced Vision Language Model to parse documents and images into clean HTML and Markdown.</p>
+             <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
+                 <a href="https://huggingface.co/Logics-MLLM/Logics-Parsing" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">🤗 Model Page</a>
+                 <a href="https://github.com/alibaba/Logics-Parsing" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">💻 GitHub</a>
+                 <a href="https://arxiv.org/abs/2509.19760" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">📝 Arxiv Paper</a>
+             </div>
+         </div>
+         """)
+
+         with gr.Row(elem_classes=["main-container"]):
+             with gr.Column(scale=1):
+                 model_choice = gr.Dropdown(choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0"], label="Select Model⚡️", value="Logics-Parsing")
+                 file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
+                 image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=280)
+
+                 with gr.Row():
+                     prev_page_btn = gr.Button("◀ Previous", size="md")
+                     page_info = gr.HTML('<div class="page-info">No file loaded</div>')
+                     next_page_btn = gr.Button("Next ▶", size="md")
+
+                 example_root = "examples"
+                 if os.path.exists(example_root) and os.path.isdir(example_root):
+                     example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
+                     if example_files:
+                         with gr.Accordion("Open Examples⚙️", open=False):
+                             gr.Examples(examples=example_files, inputs=file_input, examples_per_page=10)
+
+                 with gr.Accordion("Download Details🕧", open=False):
+                     output_file = gr.File(label='Download Markdown Result', interactive=False)
+                     cost_time = gr.Text(label='Time Cost', interactive=False)
+
+                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
+                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
+
+             with gr.Column(scale=2):
+                 with gr.Tabs():
+                     with gr.Tab("Markdown Rendering"):
+                         mmd_html = gr.TextArea(lines=27, label='Markdown Rendering', show_copy_button=True, interactive=True)
+                     with gr.Tab("Markdown Source"):
+                         mmd = gr.TextArea(lines=27, show_copy_button=True, label="Markdown Source", interactive=True)
+                     with gr.Tab("Generated HTML"):
+                         raw_html = gr.TextArea(lines=27, show_copy_button=True, label="Generated HTML")
+
+         # --- Event Handlers ---
+         file_input.change(
+             fn=load_and_preview_file,
+             inputs=file_input,
+             outputs=[image_preview, page_info, app_state],
+             show_progress="full")
+
+         process_btn.click(
+             fn=process_all_pages,
+             inputs=[app_state, model_choice],
+             outputs=[mmd_html, mmd, raw_html,
+                      output_file, cost_time, app_state],
+             concurrency_limit=15,
+             show_progress="full")
+
+         prev_page_btn.click(
+             fn=lambda s: navigate_page("prev", s),
+             inputs=app_state, outputs=[image_preview,
+                                        page_info, mmd_html, mmd, raw_html, app_state])
+
+         next_page_btn.click(
+             fn=lambda s: navigate_page("next", s),
+             inputs=app_state, outputs=[image_preview,
+                                        page_info, mmd_html, mmd, raw_html, app_state])
+
+         clear_btn.click(
+             fn=clear_all,
+             outputs=[file_input, image_preview, mmd_html, mmd, raw_html,
+                      output_file, cost_time, page_info, app_state])
+
+     demo.queue().launch(debug=True, show_error=True)
+
+ if __name__ == '__main__':
+     if not os.path.exists("examples"):
+         os.makedirs("examples")
+         logger.info("Created 'examples' directory. Please add some sample PDF/image files there.")
+     main()
pre-requirements.txt ADDED
@@ -0,0 +1 @@
+ pip>=23.0.0
requirements.txt ADDED
@@ -0,0 +1,38 @@
+ git+https://github.com/huggingface/accelerate.git
+ git+https://github.com/huggingface/peft.git
+ transformers-stream-generator
+ gradio_pdf==0.0.22
+ huggingface_hub
+ albumentations
+ beautifulsoup4
+ qwen-vl-utils
+ pyvips-binary
+ sentencepiece
+ opencv-python
+ transformers
+ docling-core
+ python-docx
+ torchvision
+ matplotlib
+ pdf2image
+ num2words
+ reportlab
+ html2text
+ xformers
+ markdown
+ requests
+ pymupdf
+ loguru
+ hf_xet
+ spaces
+ pyvips
+ pillow
+ gradio
+ einops
+ httpx
+ click
+ torch
+ oss2
+ fpdf
+ timm
+ av