import gradio as gr
from pdf2image import convert_from_path
import base64
from openai import OpenAI
from io import BytesIO
import concurrent.futures
import json
import os
import zipfile
import tempfile
import shutil
import dotenv
import os
from mistralai import Mistral

# Populate the process environment through python-dotenv before any keys are read.
dotenv.load_dotenv()

# Fail fast at import time when the OpenAI key is missing or empty.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set")

# Module-wide client used by extract_markdown_from_image. It is constructed
# without explicit credentials, so it presumably picks the key up from the
# environment checked above.
client = OpenAI()

def encode_pil_image(pil_image):
    """Serialize an image to JPEG in memory and return it as base64 text.

    Args:
        pil_image: Object exposing a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The base64 encoding of the JPEG bytes as a ``str``.
    """
    with BytesIO() as jpeg_buffer:
        pil_image.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()
    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode("utf-8")

def encode_pdf(pdf_path):
    """Read a file from disk and return its bytes as base64 text.

    Failures are reported on stdout rather than raised.

    Args:
        pdf_path: Path of the file to read in binary mode.

    Returns:
        The base64-encoded contents as a ``str``, or ``None`` when the file
        is missing or any other error occurs while reading/encoding it.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            raw_bytes = pdf_file.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as exc:
        print(f"Error: {exc}")
    # Reached only after one of the handlers above has reported a failure.
    return None

def extract_markdown_from_image(image, idx):
    """Ask the OpenAI chat model to transcribe one page image into markdown.

    Args:
        image: Page image; handed to ``encode_pil_image`` for JPEG/base64
            encoding before being embedded in the request as a data URL.
        idx: Caller-chosen identifier, returned unchanged so results from
            concurrent calls can be matched back to their page.

    Returns:
        A ``(idx, text)`` tuple. ``text`` is the content of the first choice's
        message on success; if the request (or reading the response) raises,
        ``text`` is an error description string instead.
    """
    # Encoding happens outside the try block, so encoding errors propagate.
    data_url = f"data:image/jpeg;base64,{encode_pil_image(image)}"

    instruction_part = {
        "type": "text",
        "text": "Extract the text from this page and return it as markdown, with the best possible quality and accuracy.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": data_url, "detail": "high"},
    }
    request_messages = [
        {"role": "user", "content": [instruction_part, image_part]},
    ]

    try:
        response = client.chat.completions.create(
            model="o4-mini",
            messages=request_messages,
        )
        first_choice = response.choices[0]
        return idx, first_choice.message.content
    except Exception as exc:
        # Best effort: report the failure as the page text instead of raising.
        print(exc)
        return idx, f"Error processing page {idx}: {exc}"

def _resolve_pdf_path(pdf_file):
    """Return the filesystem path behind an uploaded file object or a path string."""
    if pdf_file is None:
        raise ValueError("No PDF file was provided")
    # Gradio may pass either an object exposing ``.name`` or a plain str path.
    return getattr(pdf_file, "name", pdf_file)


def _clamp_page_range(page_from, page_to, num_pages):
    """Clamp a 1-based inclusive page range to ``[1, num_pages]``."""
    if num_pages < 1:
        raise ValueError("The PDF contains no pages")
    first = max(1, min(page_from, num_pages))
    # ``page_to=None`` means "through the last page"; never end before ``first``.
    last = num_pages if page_to is None else max(first, min(page_to, num_pages))
    return first, last


def _ocr_pages_with_openai(pdf_path, page_from, page_to, progress):
    """Render the requested pages to images and OCR them concurrently via OpenAI."""
    # Local import: the module only imports ``convert_from_path`` from pdf2image.
    from pdf2image import pdfinfo_from_path

    num_pages = int(pdfinfo_from_path(pdf_path)["Pages"])
    first, last = _clamp_page_range(page_from, page_to, num_pages)

    # Rasterise only the requested range instead of the whole document.
    images = convert_from_path(pdf_path, first_page=first, last_page=last)
    page_numbers = list(range(first, first + len(images)))
    results = [""] * len(images)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_markdown_from_image, image, slot)
            for slot, image in enumerate(images)
        ]
        completed = concurrent.futures.as_completed(futures)
        for done, future in enumerate(completed, start=1):
            slot, content = future.result()
            # The content may be None (no text returned); treat that as empty.
            text = content or ""
            results[slot] = text.replace("```markdown", "").replace("```", "")
            progress(
                done / len(images),
                desc=f"Processing page {page_numbers[slot]} of {num_pages}",
            )
    return page_numbers, results


def _ocr_pages_with_mistral(pdf_path, page_from, page_to, progress):
    """Send the whole PDF to the Mistral OCR endpoint and keep the requested pages."""
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise ValueError("MISTRAL_API_KEY is not set")

    base64_pdf = encode_pdf(pdf_path)
    if base64_pdf is None:
        # encode_pdf reports failures by returning None; without this check the
        # literal text "None" would be embedded in the data URL below.
        raise ValueError(f"Could not read PDF file: {pdf_path}")

    mistral_client = Mistral(api_key=api_key)
    progress(0.1, desc="Sending PDF to Mistral API")
    ocr_response = mistral_client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}",
        },
    )
    first, last = _clamp_page_range(page_from, page_to, len(ocr_response.pages))

    progress(0.5, desc="Processing Mistral API response")
    # Guard against a page without markdown so later ``.strip()`` calls are safe.
    results = [page.markdown or "" for page in ocr_response.pages[first - 1:last]]
    page_numbers = list(range(first, first + len(results)))
    progress(0.8, desc="Preparing output files")
    return page_numbers, results


def _write_outputs(page_numbers, results, output_type):
    """Write the OCR results in the requested format inside a fresh temp directory."""
    output_json = [
        {"page": page_number, "markdown": content}
        for page_number, content in zip(page_numbers, results)
    ]
    temp_dir = tempfile.mkdtemp()
    output_paths = {}

    def write_json_file():
        json_path = os.path.join(temp_dir, "ocr_output.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(output_json, f, ensure_ascii=False, indent=2)
        return json_path

    try:
        if output_type == "zip":
            md_folder = os.path.join(temp_dir, "pages")
            os.makedirs(md_folder, exist_ok=True)

            # One .md file per page, named after its 1-based page number.
            md_paths = []
            for page_number, content in zip(page_numbers, results):
                md_path = os.path.join(md_folder, f"page_{page_number}.md")
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write(content.strip())
                md_paths.append(md_path)

            output_json_path = write_json_file()

            # Archive layout: ocr_output.json at the root, pages/page_N.md below it.
            zip_path = os.path.join(temp_dir, "ocr_output.zip")
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
                zipf.write(output_json_path, arcname="ocr_output.json")
                for md_path in md_paths:
                    zipf.write(md_path, arcname=os.path.relpath(md_path, temp_dir))
            output_paths["zip"] = zip_path
        elif output_type == "markdown":
            # All pages in a single file, each under its own "# Page N" heading.
            md_path = os.path.join(temp_dir, "ocr_output.md")
            with open(md_path, "w", encoding="utf-8") as f:
                for page_number, content in zip(page_numbers, results):
                    f.write(f"\n\n# Page {page_number}\n\n")
                    f.write(content.strip())
                    f.write("\n")
            output_paths["markdown"] = md_path
        elif output_type == "json":
            output_paths["json"] = write_json_file()
    except Exception:
        # Do not leave a half-written temp directory behind on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    return output_paths, temp_dir


def pdf_to_outputs_with_progress(pdf_file, model_choice="openai", output_type="zip", page_from=1, page_to=None, progress=gr.Progress(track_tqdm=True)):
    """OCR a page range of a PDF and write the result as zip, markdown or JSON.

    Args:
        pdf_file: Uploaded file object exposing ``.name``, or a path string.
        model_choice: ``"openai"`` renders pages to images and sends each one
            to ``extract_markdown_from_image``; any other value uses the
            Mistral OCR endpoint on the whole PDF.
        output_type: ``"zip"`` (per-page .md files plus a JSON file),
            ``"markdown"`` (one combined .md file) or ``"json"``. Any other
            value produces no files.
        page_from: 1-based first page; clamped into the document's page range.
        page_to: 1-based last page (inclusive); ``None`` means the last page.
            Clamped so it is never before ``page_from``.
        progress: Callable invoked as ``progress(fraction, desc=...)`` to
            report progress.

    Returns:
        ``(output_paths, temp_dir)``: ``output_paths`` maps the chosen
        ``output_type`` to the path of the generated file (empty for an
        unknown ``output_type``); ``temp_dir`` is the directory holding it.
        The directory is not deleted here, so the caller owns its cleanup.

    Raises:
        ValueError: If no file was provided, the document has no pages, the
            PDF could not be read for the Mistral branch, or
            ``MISTRAL_API_KEY`` is not set when Mistral is selected.
    """
    pdf_path = _resolve_pdf_path(pdf_file)

    if model_choice == "openai":
        page_numbers, results = _ocr_pages_with_openai(pdf_path, page_from, page_to, progress)
    else:
        page_numbers, results = _ocr_pages_with_mistral(pdf_path, page_from, page_to, progress)

    return _write_outputs(page_numbers, results, output_type)

# Gradio UI: upload a PDF, pick the OCR backend, page range and output format,
# then download the generated file from the matching output component.
with gr.Blocks() as demo:
    gr.Markdown(
        "# PDF to Markdown & JSON OCR (OpenAI Vision)\n"
        "Upload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.\n\n"
        "You can also choose to download all results as a single markdown file or a single JSON file.\n\n"
        "**You can also select a range of pages to process.**"
    )
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    model_choice = gr.Radio(["openai", "mistral"], label="OCR Model", value="openai")
    # Page range selectors (1-based, inclusive)
    page_from = gr.Number(label="Page From", value=1, precision=0, minimum=1)
    page_to = gr.Number(label="Page To (leave blank for last page)", value=None, precision=0, minimum=1)
    output_type = gr.Radio(
        ["zip", "markdown", "json"],
        label="Output Format",
        value="zip",
        info="Choose output: ZIP (md files + JSON), single Markdown file, or single JSON file"
    )
    # One download slot per format; only the one matching output_type is visible.
    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False, visible=True)
    md_output = gr.File(label="Download Markdown", interactive=False, visible=False)
    json_output = gr.File(label="Download JSON", interactive=False, visible=False)

    def process_and_return_outputs(pdf_file, model_choice, output_type, page_from, page_to, progress=gr.Progress(track_tqdm=True)):
        """Run the OCR pipeline for the current inputs.

        Returns a ``(zip_path, md_path, json_path)`` tuple in which only the
        entry matching ``output_type`` is set; the others are ``None``.

        Raises:
            gr.Error: If no PDF was uploaded, or the pipeline rejects the
                request with a ``ValueError`` (e.g. a missing API key).
        """
        if pdf_file is None:
            # Nothing uploaded yet: show a clear message instead of a traceback.
            raise gr.Error("Please upload a PDF file first.")

        # A blank number field means "first page" / "last page" respectively.
        page_from = int(page_from) if page_from not in (None, "") else 1
        page_to_val = int(page_to) if page_to not in (None, "") else None

        try:
            # The returned temp directory is intentionally not removed here:
            # the generated file inside it is what gets handed back below.
            output_paths, _temp_dir = pdf_to_outputs_with_progress(
                pdf_file, model_choice, output_type, page_from, page_to_val, progress=progress
            )
        except ValueError as exc:
            # Surface configuration/input problems to the user in the UI.
            raise gr.Error(str(exc)) from exc

        # Return only the selected output, others as None
        zip_path = output_paths.get("zip") if output_type == "zip" else None
        md_path = output_paths.get("markdown") if output_type == "markdown" else None
        json_path = output_paths.get("json") if output_type == "json" else None
        return zip_path, md_path, json_path

    process_btn = gr.Button("Convert PDF")
    process_btn.click(
        process_and_return_outputs,
        inputs=[pdf_input, model_choice, output_type, page_from, page_to],
        outputs=[zip_output, md_output, json_output]
    )

    def update_output_visibility(output_type):
        """Return visibility updates so only the selected format's slot is shown."""
        return (
            gr.update(visible=(output_type == "zip")),
            gr.update(visible=(output_type == "markdown")),
            gr.update(visible=(output_type == "json")),
        )

    # Dynamically show/hide outputs based on output_type
    output_type.change(
        update_output_visibility,
        inputs=[output_type],
        outputs=[zip_output, md_output, json_output]
    )

demo.launch()