Spaces:

Illia56
/

PDF_OCR_OPENAI

Running

App Files Files Community

Illia56 committed on May 2

Commit

da3fc7c

verified ·

1 Parent(s): 13c353c

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -46

app.py CHANGED Viewed

@@ -63,7 +63,7 @@ def extract_markdown_from_image(image, idx):
         print(e)
         return idx, f"Error processing page {idx}: {e}"
-def pdf_to_json_and_md_zip_with_progress(pdf_file, model_choice="openai", progress=gr.Progress(track_tqdm=True)):
     # Save uploaded file to a temp path if needed
     if hasattr(pdf_file, "name"):
         pdf_path = pdf_file.name
@@ -75,19 +75,24 @@ def pdf_to_json_and_md_zip_with_progress(pdf_file, model_choice="openai", progre
         # OpenAI processing
         images = convert_from_path(pdf_path)
         num_pages = len(images)
-        results = [None] * num_pages
         with concurrent.futures.ThreadPoolExecutor() as executor:
             futures = []
-            for i in range(num_pages):
-                futures.append(executor.submit(extract_markdown_from_image, images[i], i))
             for idx, future in enumerate(concurrent.futures.as_completed(futures)):
                 idx_result, content = future.result()
                 results[idx_result] = content.replace("```markdown", "").replace("```", "")
-                progress((idx + 1) / num_pages, desc=f"Processing page {idx_result + 1} of {num_pages}")
         output_json = [
-            {"page": idx + 1, "markdown": content}
             for idx, content in enumerate(results)
         ]
     else:
@@ -106,63 +111,126 @@ def pdf_to_json_and_md_zip_with_progress(pdf_file, model_choice="openai", progre
                 "document_url": f"data:application/pdf;base64,{base64_pdf}"
             }
         )
         progress(0.5, desc="Processing Mistral API response")
         results = []
-        for page in ocr_response.pages:
             results.append(page.markdown)
         output_json = [
-            {"page": idx + 1, "markdown": content}
             for idx, content in enumerate(results)
         ]
         progress(0.8, desc="Preparing output files")
-    # Create a temporary directory to store md files and json
     temp_dir = tempfile.mkdtemp()
-    md_folder = os.path.join(temp_dir, "pages")
-    os.makedirs(md_folder, exist_ok=True)
-    # Write each page as a separate .md file
-    for idx, content in enumerate(results):
-        md_path = os.path.join(md_folder, f"page_{idx+1}.md")
         with open(md_path, "w", encoding="utf-8") as f:
-            f.write(content.strip())
-    # Write the JSON file
-    output_json_path = os.path.join(temp_dir, "ocr_output.json")
-    with open(output_json_path, "w", encoding="utf-8") as f:
-        json.dump(output_json, f, ensure_ascii=False, indent=2)
-    # Create a zip file containing the folder with md files and the json
-    zip_path = os.path.join(temp_dir, "ocr_output.zip")
-    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
-        # Add the JSON file
-        zipf.write(output_json_path, arcname="ocr_output.json")
-        # Add the md files folder and its contents
-        for root, dirs, files in os.walk(md_folder):
-            for file in files:
-                file_path = os.path.join(root, file)
-                arcname = os.path.relpath(file_path, temp_dir)
-                zipf.write(file_path, arcname=arcname)
-    return zip_path
 with gr.Blocks() as demo:
-    gr.Markdown("# PDF to Markdown & JSON OCR (OpenAI Vision)\nUpload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     model_choice = gr.Radio(["openai", "mistral"], label="OCR Model", value="openai")
-    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False)
-    def process_and_return_zip(pdf_file, model_choice, progress=gr.Progress(track_tqdm=True)):
-        zip_path = pdf_to_json_and_md_zip_with_progress(pdf_file, model_choice, progress=progress)
-        return zip_path
-    process_btn = gr.Button("Convert PDF to ZIP")
-    process_btn.click(
-        process_and_return_zip,
-        inputs=[pdf_input, model_choice],
-        outputs=[zip_output]
     )
 demo.launch()

         print(e)
         return idx, f"Error processing page {idx}: {e}"
+def pdf_to_outputs_with_progress(pdf_file, model_choice="openai", output_type="zip", page_from=1, page_to=None, progress=gr.Progress(track_tqdm=True)):
     # Save uploaded file to a temp path if needed
     if hasattr(pdf_file, "name"):
         pdf_path = pdf_file.name
         # OpenAI processing
         images = convert_from_path(pdf_path)
         num_pages = len(images)
+        # Clamp page_from and page_to
+        page_from = max(1, min(page_from, num_pages))
+        page_to = num_pages if page_to is None else max(page_from, min(page_to, num_pages))
+        selected_images = images[page_from-1:page_to]
+        selected_indices = list(range(page_from-1, page_to))
+        results = [None] * (page_to - page_from + 1)
         with concurrent.futures.ThreadPoolExecutor() as executor:
             futures = []
+            for i, img in enumerate(selected_images):
+                futures.append(executor.submit(extract_markdown_from_image, img, i))
             for idx, future in enumerate(concurrent.futures.as_completed(futures)):
                 idx_result, content = future.result()
                 results[idx_result] = content.replace("```markdown", "").replace("```", "")
+                progress((idx + 1) / len(selected_images), desc=f"Processing page {selected_indices[idx_result] + 1} of {num_pages}")
         output_json = [
+            {"page": selected_indices[idx] + 1, "markdown": content}
             for idx, content in enumerate(results)
         ]
     else:
                 "document_url": f"data:application/pdf;base64,{base64_pdf}"
             }
         )
+        num_pages = len(ocr_response.pages)
+        page_from = max(1, min(page_from, num_pages))
+        page_to = num_pages if page_to is None else max(page_from, min(page_to, num_pages))
+        selected_pages = ocr_response.pages[page_from-1:page_to]
+        selected_indices = list(range(page_from-1, page_to))
         progress(0.5, desc="Processing Mistral API response")
         results = []
+        for page in selected_pages:
             results.append(page.markdown)
         output_json = [
+            {"page": selected_indices[idx] + 1, "markdown": content}
             for idx, content in enumerate(results)
         ]
         progress(0.8, desc="Preparing output files")
+    # Now handle output_type: "zip", "markdown", "json"
     temp_dir = tempfile.mkdtemp()
+    output_paths = {}
+    if output_type == "zip":
+        md_folder = os.path.join(temp_dir, "pages")
+        os.makedirs(md_folder, exist_ok=True)
+        # Write each page as a separate .md file
+        for idx, content in enumerate(results):
+            md_path = os.path.join(md_folder, f"page_{selected_indices[idx]+1}.md")
+            with open(md_path, "w", encoding="utf-8") as f:
+                f.write(content.strip())
+        # Write the JSON file
+        output_json_path = os.path.join(temp_dir, "ocr_output.json")
+        with open(output_json_path, "w", encoding="utf-8") as f:
+            json.dump(output_json, f, ensure_ascii=False, indent=2)
+        # Create a zip file containing the folder with md files and the json
+        zip_path = os.path.join(temp_dir, "ocr_output.zip")
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+            # Add the JSON file
+            zipf.write(output_json_path, arcname="ocr_output.json")
+            # Add the md files folder and its contents
+            for root, dirs, files in os.walk(md_folder):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    arcname = os.path.relpath(file_path, temp_dir)
+                    zipf.write(file_path, arcname=arcname)
+        output_paths["zip"] = zip_path
+    if output_type == "markdown":
+        # Write all markdown into one file
+        md_path = os.path.join(temp_dir, "ocr_output.md")
         with open(md_path, "w", encoding="utf-8") as f:
+            for idx, content in enumerate(results):
+                f.write(f"\n\n# Page {selected_indices[idx]+1}\n\n")
+                f.write(content.strip())
+                f.write("\n")
+        output_paths["markdown"] = md_path
+    if output_type == "json":
+        # Write the JSON file
+        output_json_path = os.path.join(temp_dir, "ocr_output.json")
+        with open(output_json_path, "w", encoding="utf-8") as f:
+            json.dump(output_json, f, ensure_ascii=False, indent=2)
+        output_paths["json"] = output_json_path
+    return output_paths, temp_dir
 with gr.Blocks() as demo:
+    gr.Markdown(
+        "# PDF to Markdown & JSON OCR (OpenAI Vision)\n"
+        "Upload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.\n\n"
+        "You can also choose to download all results as a single markdown file or a single JSON file.\n\n"
+        "**You can also select a range of pages to process.**"
+    )
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     model_choice = gr.Radio(["openai", "mistral"], label="OCR Model", value="openai")
+    # Add page range selectors
+    page_from = gr.Number(label="Page From", value=1, precision=0, minimum=1)
+    page_to = gr.Number(label="Page To (leave blank for last page)", value=None, precision=0, minimum=1)
+    output_type = gr.Radio(
+        ["zip", "markdown", "json"],
+        label="Output Format",
+        value="zip",
+        info="Choose output: ZIP (md files + JSON), single Markdown file, or single JSON file"
+    )
+    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False, visible=True)
+    md_output = gr.File(label="Download Markdown", interactive=False, visible=False)
+    json_output = gr.File(label="Download JSON", interactive=False, visible=False)
+    def process_and_return_outputs(pdf_file, model_choice, output_type, page_from, page_to, progress=gr.Progress(track_tqdm=True)):
+        # If page_to is None or blank, treat as last page
+        page_from = int(page_from) if page_from is not None else 1
+        page_to_val = int(page_to) if page_to not in (None, "") else None
+        output_paths, temp_dir = pdf_to_outputs_with_progress(pdf_file, model_choice, output_type, page_from, page_to_val, progress=progress)
+        # Return only the selected output, others as None
+        zip_path = output_paths.get("zip") if output_type == "zip" else None
+        md_path = output_paths.get("markdown") if output_type == "markdown" else None
+        json_path = output_paths.get("json") if output_type == "json" else None
+        return zip_path, md_path, json_path
+    process_btn = gr.Button("Convert PDF")
+    process_btn.click(
+        process_and_return_outputs,
+        inputs=[pdf_input, model_choice, output_type, page_from, page_to],
+        outputs=[zip_output, md_output, json_output]
+    )
+    # Dynamically show/hide outputs based on output_type
+    def update_output_visibility(output_type):
+        return (
+            gr.update(visible=(output_type == "zip")),
+            gr.update(visible=(output_type == "markdown")),
+            gr.update(visible=(output_type == "json")),
+        )
+    output_type.change(
+        update_output_visibility,
+        inputs=[output_type],
+        outputs=[zip_output, md_output, json_output]
     )
 demo.launch()