Spaces:
Running
Running
| import gradio as gr | |
| from pdf2image import convert_from_path | |
| import base64 | |
| from openai import OpenAI | |
| from io import BytesIO | |
| import concurrent.futures | |
| import json | |
| import os | |
| import zipfile | |
| import tempfile | |
| import shutil | |
| import dotenv | |
| import os | |
| from mistralai import Mistral | |
# Pull variables from a local .env file (when one exists) into the environment.
dotenv.load_dotenv()

# The OpenAI client below is built at import time, so stop immediately if
# its API key is absent.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set")

# Module-wide OpenAI client, used by extract_markdown_from_image.
client = OpenAI()
def encode_pil_image(pil_image):
    """Return *pil_image* serialized as JPEG and encoded as a base64 string.

    JPEG cannot store an alpha channel or a palette, so an image whose mode
    is not one Pillow can write as JPEG (e.g. ``RGBA``, ``LA``, ``P``) is
    converted to ``RGB`` first; saving it directly would raise ``OSError``.

    Args:
        pil_image: A PIL image (any object exposing ``mode``, ``convert``
            and ``save``).

    Returns:
        The JPEG bytes as a base64 ``str`` (no ``data:`` URL prefix).
    """
    # Modes Pillow's JPEG writer accepts as-is; everything else needs RGB.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_writable_modes:
        pil_image = pil_image.convert("RGB")

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def encode_pdf(pdf_path):
    """Read the file at *pdf_path* and return its bytes as a base64 string.

    Failures are reported on stdout rather than raised: a missing file and
    any other error both yield ``None``, so callers must check the result.
    """
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
            encoded = base64.b64encode(raw_bytes)
            return encoded.decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as e:
        print(f"Error: {e}")
    # Reached only when one of the handlers above ran.
    return None
def extract_markdown_from_image(image, idx):
    """Ask the OpenAI chat completions API to transcribe one page image.

    The image is sent as a base64 JPEG data URL together with a fixed
    extraction prompt.

    Args:
        image: The page image, passed to ``encode_pil_image``.
        idx: Caller-supplied index, returned unchanged so results that
            finish out of order can be put back in place.

    Returns:
        ``(idx, text)``. ``text`` is always a ``str``: the model's reply,
        or an ``"Error processing page ..."`` message when the request
        raised or the reply carried no text.
    """
    base64_image = encode_pil_image(image)
    try:
        completion = client.chat.completions.create(
            model="o4-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Extract the text from this page and return it as markdown, with the best possible quality and accuracy."},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # Best-effort per page: report the failure as the page text instead
        # of aborting the other pages.
        print(e)
        return idx, f"Error processing page {idx}: {e}"

    if content is None:
        # The API can return a message without text (e.g. a refusal); callers
        # run string methods on the result, so never hand back None.
        return idx, f"Error processing page {idx}: model returned no text"
    return idx, content
def _clamp_page_range(page_from, page_to, num_pages):
    """Clamp a 1-based inclusive page range to ``[1, num_pages]``."""
    if num_pages < 1:
        raise ValueError("The PDF contains no pages")
    page_from = 1 if page_from is None else page_from
    first = max(1, min(page_from, num_pages))
    last = num_pages if page_to is None else max(first, min(page_to, num_pages))
    return first, last


def _write_ocr_json(output_json, temp_dir):
    """Write the per-page records to ``ocr_output.json`` in *temp_dir*; return its path."""
    output_json_path = os.path.join(temp_dir, "ocr_output.json")
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(output_json, f, ensure_ascii=False, indent=2)
    return output_json_path


def pdf_to_outputs_with_progress(pdf_file, model_choice="openai", output_type="zip", page_from=1, page_to=None, progress=gr.Progress(track_tqdm=True)):
    """OCR a page range of a PDF and write the result in the requested format.

    Args:
        pdf_file: A path string, or an object whose ``name`` attribute is
            the path (as Gradio may pass).
        model_choice: ``"openai"`` renders each page to an image and sends
            the selected pages to ``extract_markdown_from_image`` in
            parallel; any other value uses the Mistral OCR API on the
            whole PDF.
        output_type: ``"zip"`` (one ``.md`` per page under ``pages/`` plus
            ``ocr_output.json``), ``"markdown"`` (single ``ocr_output.md``)
            or ``"json"`` (single ``ocr_output.json``).
        page_from: First page, 1-based; clamped to the document.
        page_to: Last page, 1-based inclusive; ``None`` means the last page.
        progress: Callable taking a fraction and ``desc``, used to report
            progress.

    Returns:
        ``(output_paths, temp_dir)``: ``output_paths`` maps the chosen
        ``output_type`` to the written file; ``temp_dir`` is the directory
        (from ``tempfile.mkdtemp``) holding it. Nothing here deletes that
        directory, so cleanup is up to the caller.

    Raises:
        ValueError: No file given, the document has no pages, the PDF
            cannot be read, or ``MISTRAL_API_KEY`` is missing for the
            Mistral path.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided")
    # Gradio may pass a str path or a file object exposing .name.
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

    if model_choice == "openai":
        # NOTE(review): every page is rendered even when only a sub-range is
        # requested; pdf2image can limit the range, but the total page count
        # used for clamping would then have to come from elsewhere.
        images = convert_from_path(pdf_path)
        num_pages = len(images)
        page_from, page_to = _clamp_page_range(page_from, page_to, num_pages)
        selected_images = images[page_from - 1:page_to]
        selected_indices = list(range(page_from - 1, page_to))
        results = [None] * len(selected_images)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(extract_markdown_from_image, img, i)
                for i, img in enumerate(selected_images)
            ]
            # Pages finish in arbitrary order; idx_result restores position.
            for done, future in enumerate(concurrent.futures.as_completed(futures), start=1):
                idx_result, content = future.result()
                # Guard against a None reply before stripping code fences.
                results[idx_result] = (content or "").replace("```markdown", "").replace("```", "")
                progress(done / len(selected_images), desc=f"Processing page {selected_indices[idx_result] + 1} of {num_pages}")
    else:
        # Mistral processing
        if not os.environ.get("MISTRAL_API_KEY"):
            raise ValueError("MISTRAL_API_KEY is not set")
        mistral_client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
        base64_pdf = encode_pdf(pdf_path)
        if base64_pdf is None:
            # encode_pdf signals failure with None; don't send "base64,None".
            raise ValueError(f"Could not read PDF file: {pdf_path}")
        progress(0.1, desc="Sending PDF to Mistral API")
        ocr_response = mistral_client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            },
        )
        num_pages = len(ocr_response.pages)
        page_from, page_to = _clamp_page_range(page_from, page_to, num_pages)
        selected_pages = ocr_response.pages[page_from - 1:page_to]
        selected_indices = list(range(page_from - 1, page_to))
        progress(0.5, desc="Processing Mistral API response")
        results = [page.markdown for page in selected_pages]

    # One record per selected page, numbered as in the source document.
    output_json = [
        {"page": page_index + 1, "markdown": content}
        for page_index, content in zip(selected_indices, results)
    ]

    progress(0.8, desc="Preparing output files")
    temp_dir = tempfile.mkdtemp()
    output_paths = {}

    if output_type == "zip":
        md_folder = os.path.join(temp_dir, "pages")
        os.makedirs(md_folder, exist_ok=True)
        # Write each page as a separate .md file
        for page_index, content in zip(selected_indices, results):
            md_path = os.path.join(md_folder, f"page_{page_index+1}.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(content.strip())
        output_json_path = _write_ocr_json(output_json, temp_dir)
        # Zip the JSON file together with the folder of .md files
        zip_path = os.path.join(temp_dir, "ocr_output.zip")
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(output_json_path, arcname="ocr_output.json")
            for root, _dirs, files in os.walk(md_folder):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Keep the "pages/" prefix inside the archive.
                    zipf.write(file_path, arcname=os.path.relpath(file_path, temp_dir))
        output_paths["zip"] = zip_path
    elif output_type == "markdown":
        # Write all markdown into one file, one "# Page N" section per page
        md_path = os.path.join(temp_dir, "ocr_output.md")
        with open(md_path, "w", encoding="utf-8") as f:
            for page_index, content in zip(selected_indices, results):
                f.write(f"\n\n# Page {page_index+1}\n\n")
                f.write(content.strip())
                f.write("\n")
        output_paths["markdown"] = md_path
    elif output_type == "json":
        output_paths["json"] = _write_ocr_json(output_json, temp_dir)

    return output_paths, temp_dir
with gr.Blocks() as demo:
    gr.Markdown(
        "# PDF to Markdown & JSON OCR (OpenAI Vision)\n"
        "Upload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.\n\n"
        "You can also choose to download all results as a single markdown file or a single JSON file.\n\n"
        "**You can also select a range of pages to process.**"
    )

    # --- Inputs ---
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    model_choice = gr.Radio(["openai", "mistral"], label="OCR Model", value="openai")
    # Page range selectors (1-based; an empty "Page To" means the last page)
    page_from = gr.Number(label="Page From", value=1, precision=0, minimum=1)
    page_to = gr.Number(label="Page To (leave blank for last page)", value=None, precision=0, minimum=1)
    output_type = gr.Radio(
        ["zip", "markdown", "json"],
        label="Output Format",
        value="zip",
        info="Choose output: ZIP (md files + JSON), single Markdown file, or single JSON file"
    )

    # --- Outputs: one download slot per format, only the active one visible ---
    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False, visible=True)
    md_output = gr.File(label="Download Markdown", interactive=False, visible=False)
    json_output = gr.File(label="Download JSON", interactive=False, visible=False)

    def process_and_return_outputs(pdf_file, model_choice, output_type, page_from, page_to, progress=gr.Progress(track_tqdm=True)):
        """Run the conversion and return (zip, markdown, json) paths.

        Only the slot matching ``output_type`` carries a path; the other
        two are ``None``.
        """
        first_page = 1 if page_from is None else int(page_from)
        # A blank "Page To" field means "through the last page".
        last_page = None if page_to in (None, "") else int(page_to)
        generated, _temp_dir = pdf_to_outputs_with_progress(
            pdf_file, model_choice, output_type, first_page, last_page, progress=progress
        )
        return tuple(
            generated.get(kind) if output_type == kind else None
            for kind in ("zip", "markdown", "json")
        )

    process_btn = gr.Button("Convert PDF")
    process_btn.click(
        process_and_return_outputs,
        inputs=[pdf_input, model_choice, output_type, page_from, page_to],
        outputs=[zip_output, md_output, json_output]
    )

    def update_output_visibility(output_type):
        """Show only the download slot that matches the chosen format."""
        return tuple(
            gr.update(visible=(output_type == kind))
            for kind in ("zip", "markdown", "json")
        )

    output_type.change(
        update_output_visibility,
        inputs=[output_type],
        outputs=[zip_output, md_output, json_output]
    )

demo.launch()