Spaces:

Illia56
/

PDF_OCR_OPENAI

Running

App Files Files Community

PDF_OCR_OPENAI / app.py

Illia56

Update app.py

da3fc7c verified 6 months ago

raw

history blame

9.88 kB

	import gradio as gr
	from pdf2image import convert_from_path
	import base64
	from openai import OpenAI
	from io import BytesIO
	import concurrent.futures
	import json
	import os
	import zipfile
	import tempfile
	import shutil
	import dotenv
	import os
	from mistralai import Mistral

	# Load environment variables through python-dotenv before any key lookup.
	dotenv.load_dotenv()

	# Fail at import time when the OpenAI key is missing or empty, rather than
	# on the first OCR request.
	if os.environ.get("OPENAI_API_KEY", "") == "":
	    raise ValueError("OPENAI_API_KEY is not set")

	# Shared OpenAI client used by the page-level OCR helper below. No key is
	# passed explicitly; presumably the SDK reads it from the environment
	# variable checked above.
	client = OpenAI()

	def encode_pil_image(pil_image):
	    """Return *pil_image* serialized as JPEG and encoded as a base64 string.

	    Args:
	        pil_image: An image object exposing ``mode``, ``convert`` and
	            ``save`` (a ``PIL.Image.Image`` in this application).

	    Returns:
	        The JPEG bytes of the image, base64-encoded and decoded to ``str``.
	    """
	    # JPEG cannot store alpha or palette data; saving such an image raises
	    # OSError. Convert anything outside the modes the JPEG writer accepts
	    # to plain RGB first, and leave already-writable modes untouched.
	    if pil_image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
	        pil_image = pil_image.convert("RGB")

	    buffered = BytesIO()
	    pil_image.save(buffered, format="JPEG")
	    return base64.b64encode(buffered.getvalue()).decode("utf-8")

	def encode_pdf(pdf_path):
	    """Read the file at *pdf_path* and return its contents as base64 text.

	    Failures are reported on stdout instead of being raised: a missing file
	    or any other error while reading/encoding yields ``None``.
	    """
	    try:
	        with open(pdf_path, "rb") as handle:
	            raw_bytes = handle.read()
	        encoded = base64.b64encode(raw_bytes).decode('utf-8')
	    except FileNotFoundError:
	        print(f"Error: The file {pdf_path} was not found.")
	        return None
	    except Exception as e:
	        # Best-effort helper: every other failure is also mapped to None.
	        print(f"Error: {e}")
	        return None
	    return encoded

	def extract_markdown_from_image(image, idx):
	    """OCR a single page image into markdown via the OpenAI chat API.

	    Args:
	        image: Page image, handed to ``encode_pil_image`` for JPEG/base64
	            encoding.
	        idx: Caller-supplied position of the page. It is returned unchanged
	            so results gathered out of order can be put back in place.

	    Returns:
	        A ``(idx, text)`` tuple. ``text`` is the model's reply, or an empty
	        string when the reply carries no text content. If the API call
	        fails, ``text`` is an ``"Error processing page ..."`` message
	        instead; the exception is printed, not raised.
	    """
	    base64_image = encode_pil_image(image)
	    try:
	        completion = client.chat.completions.create(
	            model="o4-mini",
	            messages=[
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": "Extract the text from this page and return it as markdown, with the best possible quality and accuracy."},
	                        {
	                            "type": "image_url",
	                            "image_url": {
	                                "url": f"data:image/jpeg;base64,{base64_image}",
	                                "detail": "high",
	                            },
	                        },
	                    ],
	                }
	            ],
	        )
	        # The message content is optional in the API response (e.g. on a
	        # refusal); callers treat the result as a string, so never hand
	        # back None.
	        return idx, completion.choices[0].message.content or ""
	    except Exception as e:
	        # Best-effort per page: report the failure in-band so the remaining
	        # pages of the document are still processed.
	        print(e)
	        return idx, f"Error processing page {idx}: {e}"

	def _clamp_page_range(page_from, page_to, num_pages):
	    """Clamp a 1-based inclusive page range to a document of *num_pages* pages."""
	    page_from = max(1, min(page_from, num_pages))
	    if page_to is None:
	        page_to = num_pages
	    else:
	        page_to = max(page_from, min(page_to, num_pages))
	    # For an empty document page_from is still 1; cap page_to so the
	    # resulting selection is empty rather than pointing past the end.
	    return page_from, min(page_to, num_pages)


	def _write_ocr_json(output_json, temp_dir):
	    """Dump the per-page records to ``ocr_output.json`` in *temp_dir* and return its path."""
	    output_json_path = os.path.join(temp_dir, "ocr_output.json")
	    with open(output_json_path, "w", encoding="utf-8") as f:
	        json.dump(output_json, f, ensure_ascii=False, indent=2)
	    return output_json_path


	def pdf_to_outputs_with_progress(pdf_file, model_choice="openai", output_type="zip", page_from=1, page_to=None, progress=gr.Progress(track_tqdm=True)):
	    """OCR a page range of a PDF and write the result in the requested format.

	    Args:
	        pdf_file: Path to the PDF, or an object whose ``name`` attribute is
	            that path (Gradio may pass either).
	        model_choice: ``"openai"`` renders each page to an image and sends
	            the pages to ``extract_markdown_from_image`` in parallel; any
	            other value sends the whole PDF to the Mistral OCR endpoint.
	        output_type: ``"zip"`` (one ``.md`` per page plus a JSON file),
	            ``"markdown"`` (single file) or ``"json"`` (single file).
	        page_from: First page to include, 1-based; clamped to the document.
	        page_to: Last page to include, 1-based and inclusive; ``None`` means
	            the last page. Clamped so it is never before ``page_from``.
	        progress: Callable invoked as ``progress(fraction, desc=...)``.

	    Returns:
	        ``(output_paths, temp_dir)``. ``output_paths`` maps ``output_type``
	        to the path of the generated file and is empty when ``output_type``
	        is not one of the three known values. ``temp_dir`` is the temporary
	        directory holding the files; it is not removed here.

	    Raises:
	        ValueError: If no PDF was supplied, if the Mistral branch is used
	            without ``MISTRAL_API_KEY``, or if the PDF cannot be read for
	            the Mistral upload.
	    """
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided")
	    # Gradio may pass a str path or a file-like object exposing ``name``.
	    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

	    if model_choice == "openai":
	        # OpenAI processing: one vision request per rendered page.
	        images = convert_from_path(pdf_path)
	        num_pages = len(images)
	        page_from, page_to = _clamp_page_range(page_from, page_to, num_pages)
	        selected_images = images[page_from - 1:page_to]
	        selected_indices = list(range(page_from - 1, page_to))
	        results = [None] * len(selected_images)

	        with concurrent.futures.ThreadPoolExecutor() as executor:
	            futures = [
	                executor.submit(extract_markdown_from_image, img, i)
	                for i, img in enumerate(selected_images)
	            ]
	            # Futures finish in arbitrary order; the index returned by the
	            # worker says which slot the text belongs to.
	            for done, future in enumerate(concurrent.futures.as_completed(futures), start=1):
	                idx_result, content = future.result()
	                # Drop the code fences the model tends to wrap its answer in.
	                results[idx_result] = (content or "").replace("```markdown", "").replace("```", "")
	                progress(done / len(selected_images), desc=f"Processing page {selected_indices[idx_result] + 1} of {num_pages}")
	    else:
	        # Mistral processing: the whole document is uploaded in one request.
	        if not os.environ.get("MISTRAL_API_KEY"):
	            raise ValueError("MISTRAL_API_KEY is not set")

	        mistral_client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
	        base64_pdf = encode_pdf(pdf_path)
	        if base64_pdf is None:
	            # encode_pdf signals failure with None; without this check the
	            # literal text "None" would be sent as the document payload.
	            raise ValueError(f"Could not read PDF file: {pdf_path}")

	        progress(0.1, desc="Sending PDF to Mistral API")
	        ocr_response = mistral_client.ocr.process(
	            model="mistral-ocr-latest",
	            document={
	                "type": "document_url",
	                "document_url": f"data:application/pdf;base64,{base64_pdf}",
	            },
	        )
	        num_pages = len(ocr_response.pages)
	        page_from, page_to = _clamp_page_range(page_from, page_to, num_pages)
	        selected_pages = ocr_response.pages[page_from - 1:page_to]
	        selected_indices = list(range(page_from - 1, page_to))

	        progress(0.5, desc="Processing Mistral API response")
	        results = [page.markdown for page in selected_pages]
	        progress(0.8, desc="Preparing output files")

	    # 1-based page numbers, aligned with ``results``.
	    page_numbers = [index + 1 for index in selected_indices]
	    output_json = [
	        {"page": page_number, "markdown": content}
	        for page_number, content in zip(page_numbers, results)
	    ]

	    # Now handle output_type: "zip", "markdown", "json"
	    temp_dir = tempfile.mkdtemp()
	    output_paths = {}

	    if output_type == "zip":
	        md_folder = os.path.join(temp_dir, "pages")
	        os.makedirs(md_folder, exist_ok=True)

	        # Write each page as a separate .md file
	        md_paths = []
	        for page_number, content in zip(page_numbers, results):
	            md_path = os.path.join(md_folder, f"page_{page_number}.md")
	            with open(md_path, "w", encoding="utf-8") as f:
	                f.write(content.strip())
	            md_paths.append(md_path)

	        output_json_path = _write_ocr_json(output_json, temp_dir)

	        # Bundle the JSON file and the pages/ folder into one archive;
	        # archive names are relative to temp_dir ("pages/page_N.md").
	        zip_path = os.path.join(temp_dir, "ocr_output.zip")
	        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
	            zipf.write(output_json_path, arcname="ocr_output.json")
	            for md_path in md_paths:
	                zipf.write(md_path, arcname=os.path.relpath(md_path, temp_dir))
	        output_paths["zip"] = zip_path

	    if output_type == "markdown":
	        # Write all markdown into one file, one "# Page N" section per page.
	        md_path = os.path.join(temp_dir, "ocr_output.md")
	        with open(md_path, "w", encoding="utf-8") as f:
	            for page_number, content in zip(page_numbers, results):
	                f.write(f"\n\n# Page {page_number}\n\n")
	                f.write(content.strip())
	                f.write("\n")
	        output_paths["markdown"] = md_path

	    if output_type == "json":
	        output_paths["json"] = _write_ocr_json(output_json, temp_dir)

	    return output_paths, temp_dir

	# Gradio UI: upload + options on top, one download slot per output format.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        "# PDF to Markdown & JSON OCR (OpenAI Vision)\n"
	        "Upload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.\n\n"
	        "You can also choose to download all results as a single markdown file or a single JSON file.\n\n"
	        "You can also select a range of pages to process."
	    )
	    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
	    model_choice = gr.Radio(["openai", "mistral"], label="OCR Model", value="openai")

	    # Page range selectors (1-based, inclusive).
	    page_from = gr.Number(label="Page From", value=1, precision=0, minimum=1)
	    page_to = gr.Number(label="Page To (leave blank for last page)", value=None, precision=0, minimum=1)

	    output_type = gr.Radio(
	        ["zip", "markdown", "json"],
	        label="Output Format",
	        value="zip",
	        info="Choose output: ZIP (md files + JSON), single Markdown file, or single JSON file",
	    )

	    # Only the slot matching the selected format starts out visible.
	    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False, visible=True)
	    md_output = gr.File(label="Download Markdown", interactive=False, visible=False)
	    json_output = gr.File(label="Download JSON", interactive=False, visible=False)

	    def process_and_return_outputs(pdf_file, model_choice, output_type, page_from, page_to, progress=gr.Progress(track_tqdm=True)):
	        """Run the OCR pipeline and return one path per download slot.

	        Returns a ``(zip, markdown, json)`` tuple in which only the entry for
	        the chosen ``output_type`` can be a path; the others are ``None``.
	        """
	        # A missing "from" means page 1; a missing/blank "to" means last page.
	        first_page = 1 if page_from is None else int(page_from)
	        last_page = None if page_to in (None, "") else int(page_to)
	        produced, _workdir = pdf_to_outputs_with_progress(
	            pdf_file, model_choice, output_type, first_page, last_page, progress=progress
	        )
	        return tuple(
	            produced.get(kind) if output_type == kind else None
	            for kind in ("zip", "markdown", "json")
	        )

	    process_btn = gr.Button("Convert PDF")
	    process_btn.click(
	        process_and_return_outputs,
	        inputs=[pdf_input, model_choice, output_type, page_from, page_to],
	        outputs=[zip_output, md_output, json_output],
	    )

	    def update_output_visibility(output_type):
	        """Return visibility updates showing only the slot for *output_type*."""
	        return tuple(
	            gr.update(visible=(output_type == kind))
	            for kind in ("zip", "markdown", "json")
	        )

	    # Switch the visible download slot whenever the format radio changes.
	    output_type.change(
	        update_output_visibility,
	        inputs=[output_type],
	        outputs=[zip_output, md_output, json_output],
	    )

	demo.launch()