"""Gradio app that OCRs a PDF into Markdown and JSON.

Pages are transcribed either by an OpenAI vision model (one request per
rendered page image) or by the Mistral OCR endpoint (one request for the
whole document). Results can be downloaded as a ZIP (per-page ``.md`` files
plus a JSON file), a single Markdown file, or a single JSON file.
"""

import base64
import concurrent.futures
import json
import os
import re
import shutil
import tempfile
import zipfile
from io import BytesIO

import dotenv
import gradio as gr
from mistralai import Mistral
from openai import OpenAI
from pdf2image import convert_from_path, pdfinfo_from_path

dotenv.load_dotenv()

if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set")

client = OpenAI()

# Image modes Pillow's JPEG writer accepts; anything else is converted to RGB.
_JPEG_SAFE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

# Matches a reply that is wrapped as a whole in one ``` fence, optionally
# labelled "markdown"/"md". Group 1 is the label, group 2 the inner text.
_WRAPPER_FENCE = re.compile(
    r"\A```(markdown|md)?[ \t]*\r?\n(.*?)\r?\n?```\Z",
    re.DOTALL | re.IGNORECASE,
)


def encode_pil_image(pil_image):
    """Return *pil_image* as a base64-encoded JPEG string."""
    # JPEG cannot store alpha or palette data; normalise such images first
    # instead of letting ``save`` raise.
    if pil_image.mode not in _JPEG_SAFE_MODES:
        pil_image = pil_image.convert("RGB")
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def encode_pdf(pdf_path):
    """Encode the pdf to base64.

    Returns the base64 text, or ``None`` (after printing the error) when the
    file cannot be read. Callers must check for ``None``.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            return base64.b64encode(pdf_file.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None


def extract_markdown_from_image(image, idx, page_number=None):
    """Ask the OpenAI model to transcribe one page image to markdown.

    Args:
        image: PIL image of the page.
        idx: Caller-defined index, echoed back so results produced out of
            order can be put back in place.
        page_number: Optional 1-based page number used in the error text.
            When omitted, ``idx`` is used, as before.

    Returns:
        ``(idx, text)``. On any failure ``text`` is an error message rather
        than an exception, so one bad page does not abort the whole run.
    """
    base64_image = encode_pil_image(image)
    try:
        completion = client.chat.completions.create(
            model="o4-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract the text from this page and return it as markdown, with the best possible quality and accuracy.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return idx, completion.choices[0].message.content
    except Exception as e:
        print(e)
        label = idx if page_number is None else page_number
        return idx, f"Error processing page {label}: {e}"


def _strip_wrapping_fence(text):
    """Remove a ``` fence that wraps the whole reply; keep inner fences."""
    match = _WRAPPER_FENCE.match(text.strip())
    if match is None:
        return text
    label, inner = match.group(1), match.group(2)
    # An unlabelled outer fence with more fences inside is ambiguous (it may
    # be two real code blocks), so leave the text alone in that case.
    if not label and "```" in inner:
        return text
    return inner


def _clamp_page_range(page_from, page_to, num_pages):
    """Clamp a 1-based inclusive page range to ``1..num_pages``."""
    if num_pages < 1:
        raise ValueError("The PDF has no pages to process")
    page_from = max(1, min(page_from, num_pages))
    if page_to is None:
        page_to = num_pages
    else:
        page_to = max(page_from, min(page_to, num_pages))
    return page_from, page_to


def _write_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def pdf_to_outputs_with_progress(
    pdf_file,
    model_choice="openai",
    output_type="zip",
    page_from=1,
    page_to=None,
    progress=gr.Progress(track_tqdm=True),
):
    """OCR a page range of a PDF and write the requested output file(s).

    Args:
        pdf_file: Path string, or an object whose ``name`` attribute is the
            path (Gradio may pass either).
        model_choice: ``"openai"`` for per-page vision requests; any other
            value uses the Mistral OCR endpoint.
        output_type: ``"zip"``, ``"markdown"`` or ``"json"``. Any other value
            produces no output file.
        page_from: 1-based first page; clamped to the document.
        page_to: 1-based last page (inclusive), or ``None`` for the last page.
        progress: Gradio progress callback.

    Returns:
        ``(output_paths, temp_dir)`` where ``output_paths`` maps the output
        type to the written file and ``temp_dir`` is the directory holding it.
        The directory is not removed here; the caller owns it.

    Raises:
        ValueError: No file given, the PDF has no pages, the PDF cannot be
            read for Mistral, or ``MISTRAL_API_KEY`` is not set.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided")
    # Gradio may pass a str path or a file object
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

    if model_choice == "openai":
        # Read the page count first so only the requested pages are rendered.
        num_pages = pdfinfo_from_path(pdf_path)["Pages"]
        page_from, page_to = _clamp_page_range(page_from, page_to, num_pages)
        selected_images = convert_from_path(
            pdf_path, first_page=page_from, last_page=page_to
        )
        selected_indices = list(range(page_from - 1, page_to))
        total = len(selected_images)
        results = [""] * total

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    extract_markdown_from_image, img, i, selected_indices[i] + 1
                )
                for i, img in enumerate(selected_images)
            ]
            completed = concurrent.futures.as_completed(futures)
            for done, future in enumerate(completed, start=1):
                idx_result, content = future.result()
                # The API may return no text at all; treat that as empty.
                results[idx_result] = _strip_wrapping_fence(content or "")
                # Scale into 0..0.8 so the bar does not jump backwards when
                # the "Preparing output files" step reports 0.8.
                progress(
                    0.8 * done / total,
                    desc=f"Processing page {selected_indices[idx_result] + 1} of {num_pages}",
                )
    else:
        # Mistral processing
        if not os.environ.get("MISTRAL_API_KEY"):
            raise ValueError("MISTRAL_API_KEY is not set")
        mistral_client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
        base64_pdf = encode_pdf(pdf_path)
        if base64_pdf is None:
            # Without this check the literal text "None" would be sent.
            raise ValueError(f"Could not read PDF file: {pdf_path}")
        progress(0.1, desc="Sending PDF to Mistral API")
        ocr_response = mistral_client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            },
        )
        num_pages = len(ocr_response.pages)
        page_from, page_to = _clamp_page_range(page_from, page_to, num_pages)
        selected_pages = ocr_response.pages[page_from - 1:page_to]
        selected_indices = list(range(page_from - 1, page_to))
        progress(0.5, desc="Processing Mistral API response")
        results = [page.markdown or "" for page in selected_pages]

    output_json = [
        {"page": selected_indices[idx] + 1, "markdown": content}
        for idx, content in enumerate(results)
    ]

    progress(0.8, desc="Preparing output files")

    # Now handle output_type: "zip", "markdown", "json"
    temp_dir = tempfile.mkdtemp()
    output_paths = {}
    try:
        if output_type == "zip":
            md_folder = os.path.join(temp_dir, "pages")
            os.makedirs(md_folder, exist_ok=True)
            # Write each page as a separate .md file
            for idx, content in enumerate(results):
                md_path = os.path.join(
                    md_folder, f"page_{selected_indices[idx]+1}.md"
                )
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write(content.strip())
            output_json_path = os.path.join(temp_dir, "ocr_output.json")
            _write_json(output_json_path, output_json)
            # Zip the JSON file together with the folder of .md files
            zip_path = os.path.join(temp_dir, "ocr_output.zip")
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
                zipf.write(output_json_path, arcname="ocr_output.json")
                for root, _dirs, files in os.walk(md_folder):
                    for file_name in files:
                        file_path = os.path.join(root, file_name)
                        # Archive names are relative to temp_dir so the
                        # "pages/" folder is preserved inside the zip.
                        arcname = os.path.relpath(file_path, temp_dir)
                        zipf.write(file_path, arcname=arcname)
            output_paths["zip"] = zip_path
        elif output_type == "markdown":
            # Write all markdown into one file
            md_path = os.path.join(temp_dir, "ocr_output.md")
            with open(md_path, "w", encoding="utf-8") as f:
                for idx, content in enumerate(results):
                    f.write(f"\n\n# Page {selected_indices[idx]+1}\n\n")
                    f.write(content.strip())
                    f.write("\n")
            output_paths["markdown"] = md_path
        elif output_type == "json":
            output_json_path = os.path.join(temp_dir, "ocr_output.json")
            _write_json(output_json_path, output_json)
            output_paths["json"] = output_json_path
    except Exception:
        # Do not leave a half-written temp directory behind on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    return output_paths, temp_dir


with gr.Blocks() as demo:
    gr.Markdown(
        "# PDF to Markdown & JSON OCR (OpenAI Vision)\n"
        "Upload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.\n\n"
        "You can also choose to download all results as a single markdown file or a single JSON file.\n\n"
        "**You can also select a range of pages to process.**"
    )
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    model_choice = gr.Radio(["openai", "mistral"], label="OCR Model", value="openai")
    # Add page range selectors
    page_from = gr.Number(label="Page From", value=1, precision=0, minimum=1)
    page_to = gr.Number(label="Page To (leave blank for last page)", value=None, precision=0, minimum=1)
    output_type = gr.Radio(
        ["zip", "markdown", "json"],
        label="Output Format",
        value="zip",
        info="Choose output: ZIP (md files + JSON), single Markdown file, or single JSON file"
    )
    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False, visible=True)
    md_output = gr.File(label="Download Markdown", interactive=False, visible=False)
    json_output = gr.File(label="Download JSON", interactive=False, visible=False)

    def process_and_return_outputs(pdf_file, model_choice, output_type, page_from, page_to, progress=gr.Progress(track_tqdm=True)):
        """Click handler: run the OCR and route the file to its output slot.

        Returns ``(zip_path, md_path, json_path)``; only the entry for the
        chosen ``output_type`` is set, the other two are ``None``.
        """
        if pdf_file is None:
            raise gr.Error("Please upload a PDF file.")
        # If page_to is None or blank, treat as last page
        page_from = int(page_from) if page_from is not None else 1
        page_to_val = int(page_to) if page_to not in (None, "") else None
        output_paths, _temp_dir = pdf_to_outputs_with_progress(
            pdf_file, model_choice, output_type, page_from, page_to_val, progress=progress
        )
        # output_paths only holds the key for the selected output type, so
        # the other lookups come back as None.
        return (
            output_paths.get("zip"),
            output_paths.get("markdown"),
            output_paths.get("json"),
        )

    process_btn = gr.Button("Convert PDF")
    process_btn.click(
        process_and_return_outputs,
        inputs=[pdf_input, model_choice, output_type, page_from, page_to],
        outputs=[zip_output, md_output, json_output]
    )

    # Dynamically show/hide outputs based on output_type
    def update_output_visibility(output_type):
        """Show only the download widget that matches ``output_type``."""
        return (
            gr.update(visible=(output_type == "zip")),
            gr.update(visible=(output_type == "markdown")),
            gr.update(visible=(output_type == "json")),
        )

    output_type.change(
        update_output_visibility,
        inputs=[output_type],
        outputs=[zip_output, md_output, json_output]
    )

demo.launch()