"""Gradio interface for DeepSeek-OCR on Hugging Face Spaces. This application loads the `deepseek-ai/DeepSeek-OCR` vision-language model and exposes a simple interface capable of processing both image and PDF documents. The implementation targets the Hugging Face free T4 GPU runtime and optimizes throughput with bfloat16 precision, flash-attention, and optional vLLM acceleration when available. """ from __future__ import annotations import contextlib import dataclasses import logging import os import shutil import subprocess import sys import tempfile from pathlib import Path from typing import List, Optional import gradio as gr import torch from PIL import Image from transformers import AutoModel, AutoTokenizer try: # Optional dependency for faster batching from vllm import LLM, SamplingParams # type: ignore _HAS_VLLM = True except Exception: # pragma: no cover - optional path LLM = None # type: ignore SamplingParams = None # type: ignore _HAS_VLLM = False try: import fitz # type: ignore[attr-defined] except Exception as exc: # pragma: no cover - ensures import error is visible raise RuntimeError( "PyMuPDF (fitz) is required for PDF processing. Install pymupdf." ) from exc logging.basicConfig(level=logging.INFO) LOGGER = logging.getLogger("deepseek_ocr_app") MODEL_NAME = "deepseek-ai/DeepSeek-OCR" DEFAULT_PROMPT = "\n<|grounding|>Convert the document to markdown." GUNDAM_CONFIG = { "base_size": 1024, "image_size": 640, "crop_mode": True, "test_compress": True, } @dataclasses.dataclass class PageResult: """Result for a single page processed by DeepSeek-OCR.""" index: int text: str @dataclasses.dataclass class DocumentResult: """Aggregate OCR result for an input document.""" filename: str page_results: List[PageResult] def to_markdown(self) -> str: sections = [] for page in self.page_results: heading = f"### Page {page.index}" sections.append(f"{heading}\n\n{page.text.strip()}".strip()) return "\n\n".join(sections).strip() def has_cuda() -> bool: return torch.cuda.is_available() class DeepSeekOCREngine: """Wrapper around the DeepSeek-OCR model for document processing.""" def __init__( self, model_name: str = MODEL_NAME, prompt: str = DEFAULT_PROMPT, config: Optional[dict] = None, enable_vllm: bool = True, ) -> None: self.model_name = model_name self.prompt_template = prompt self.config = {**GUNDAM_CONFIG, **(config or {})} self.enable_vllm = enable_vllm and _HAS_VLLM self.device = torch.device("cuda" if has_cuda() else "cpu") self._model = None self._tokenizer = None self._vllm_engine = None self._vllm_sampling_params = None self._output_root = Path(tempfile.mkdtemp(prefix="deepseek_ocr_out_")) self._load_model() @property def tokenizer(self): if self._tokenizer is None: raise RuntimeError("Tokenizer not initialized") return self._tokenizer @property def model(self): if self._model is None: raise RuntimeError("Model not initialized") return self._model def _load_model(self) -> None: torch.backends.cudnn.allow_tf32 = True torch.backends.cuda.matmul.allow_tf32 = True # Try to install flash-attn if not available and we're on CUDA if self.device.type == "cuda": self._ensure_flash_attention() if self.enable_vllm: try: LOGGER.info("Initializing DeepSeek-OCR with vLLM backend") self._vllm_engine = LLM( model=self.model_name, dtype="bfloat16" if has_cuda() else "float32", tokenizer=self.model_name, trust_remote_code=True, ) self._vllm_sampling_params = SamplingParams( temperature=0.0, top_p=0.9, max_tokens=4096, ) except Exception as vllm_error: LOGGER.warning( "vLLM initialization failed (%s). 
                    "Falling back to HF AutoModel.",
                    vllm_error,
                )
                self.enable_vllm = False

        if not self.enable_vllm:
            LOGGER.info("Loading DeepSeek-OCR with transformers backend")
            self._tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, trust_remote_code=True
            )
            torch_dtype = torch.bfloat16 if self.device.type == "cuda" else torch.float32

            # Try with flash attention first, fall back to standard attention
            attn_implementation = (
                "flash_attention_2" if self._has_flash_attention() else "eager"
            )
            if attn_implementation == "flash_attention_2":
                LOGGER.info("Using flash attention for faster inference")
            else:
                LOGGER.info("Using standard attention (flash attention not available)")

            self._model = AutoModel.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                use_safetensors=True,
                _attn_implementation=attn_implementation,
                torch_dtype=torch_dtype,
            )
            self._model = self._model.eval().to(self.device)

    def _ensure_flash_attention(self) -> None:
        """Ensure flash-attn is installed for CUDA devices."""
        if not self._has_flash_attention():
            LOGGER.info("Installing flash-attn for optimized attention")
            try:
                # Try installing flash-attn with --no-build-isolation to avoid
                # torch import issues during the build.
                result = subprocess.run(
                    [
                        sys.executable,
                        "-m",
                        "pip",
                        "install",
                        "flash-attn==2.7.3",
                        "--no-build-isolation",
                    ],
                    capture_output=True,
                    text=True,
                    timeout=300,
                )
                if result.returncode == 0:
                    LOGGER.info("Successfully installed flash-attn")
                else:
                    LOGGER.warning("Failed to install flash-attn: %s", result.stderr)
            except (subprocess.TimeoutExpired, subprocess.SubprocessError) as e:
                LOGGER.warning("Failed to install flash-attn: %s", e)

    def _has_flash_attention(self) -> bool:
        """Check if flash attention is available."""
        try:
            import flash_attn  # noqa: F401

            return True
        except ImportError:
            return False

    def cleanup(self) -> None:
        if self._output_root.exists():
            shutil.rmtree(self._output_root, ignore_errors=True)

    def _infer_transformers(self, image_path: Path, prompt: str) -> str:
        result = self.model.infer(
            self.tokenizer,
            prompt=prompt,
            image_file=str(image_path),
            output_path=str(self._output_root),
            base_size=self.config["base_size"],
            image_size=self.config["image_size"],
            crop_mode=self.config["crop_mode"],
            save_results=False,
            test_compress=self.config.get("test_compress", True),
        )
        if isinstance(result, dict):
            for key in ("text", "markdown", "raw_text", "result"):
                if key in result and isinstance(result[key], str):
                    return result[key]
            return "\n".join(str(v) for v in result.values())
        if isinstance(result, (list, tuple)):
            return "\n".join(str(item) for item in result)
        return str(result)

    def _infer_vllm(self, image_path: Path, prompt: str) -> str:
        if not self.enable_vllm or self._vllm_engine is None:
            raise RuntimeError("vLLM backend is not initialized")
        formatted_prompt = f"{prompt.replace('<image>', '').strip()}"
        outputs = self._vllm_engine.generate(
            prompts=[formatted_prompt],
            image_data=[[Image.open(image_path)]],
            sampling_params=self._vllm_sampling_params,
        )
        return outputs[0].outputs[0].text if outputs else ""

    def _infer(self, image_path: Path, prompt: str) -> str:
        if self.enable_vllm:
            try:
                return self._infer_vllm(image_path, prompt)
            except Exception as error:
                LOGGER.warning(
                    "Falling back to transformers backend after vLLM error: %s",
                    error,
                )
                self.enable_vllm = False
                if self._model is None:
                    # The transformers weights were never loaded when vLLM
                    # initialized successfully, so load them before retrying.
                    self._load_model()
        return self._infer_transformers(image_path, prompt)

    def _convert_pdf_to_images(
        self, pdf_path: Path, output_dir: Path, dpi: int = 192
    ) -> List[Path]:
        document = fitz.open(pdf_path)
        image_paths: List[Path] = []
        zoom = dpi / 72  # PDF user space is 72 points per inch
        matrix = fitz.Matrix(zoom, zoom)
        for page_index in range(len(document)):
            page = document.load_page(page_index)
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            page_path = output_dir / f"page-{page_index + 1:04d}.png"
            pixmap.save(page_path)
            image_paths.append(page_path)
        document.close()
        return image_paths

    def process_document(
        self,
        file_path: Path,
        prompt: Optional[str] = None,
        progress: Optional[gr.Progress] = None,
    ) -> DocumentResult:
        prompt_to_use = (
            prompt.strip() if prompt and prompt.strip() else self.prompt_template
        )
        suffix = file_path.suffix.lower()

        with tempfile.TemporaryDirectory(prefix="deepseek_ocr_tmp_") as tmp_dir:
            tmp_dir_path = Path(tmp_dir)
            if suffix in {".png", ".jpg", ".jpeg", ".bmp", ".webp", ".tif", ".tiff"}:
                image_paths = [self._ensure_rgb_image(file_path, tmp_dir_path)]
            elif suffix == ".pdf":
                if progress:
                    progress(0.0, desc="Converting PDF pages")
                image_paths = self._convert_pdf_to_images(file_path, tmp_dir_path)
            else:
                raise ValueError("Unsupported file format. Please upload an image or PDF.")

            total_pages = len(image_paths)
            page_results: List[PageResult] = []
            for idx, image_path in enumerate(image_paths, start=1):
                if progress:
                    progress(
                        (idx - 1) / max(total_pages, 1),
                        desc=f"Processing page {idx}/{total_pages}",
                    )
                text = self._infer(image_path, prompt_to_use)
                page_results.append(PageResult(index=idx, text=text))

        if progress:
            progress(1.0, desc="Completed")
        return DocumentResult(filename=file_path.name, page_results=page_results)

    def _ensure_rgb_image(self, image_path: Path, output_dir: Path) -> Path:
        """Ensure the provided image is saved as an RGB PNG for the model."""
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        output_path = output_dir / f"image-{image_path.stem}.png"
        image.save(output_path, format="PNG", optimize=True)
        return output_path
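
# Illustrative programmatic use of the engine outside Gradio (a sketch only;
# "sample.pdf" is a placeholder filename, not a file shipped with this Space):
#
#     engine = DeepSeekOCREngine(enable_vllm=False)
#     result = engine.process_document(Path("sample.pdf"))
#     print(result.to_markdown())
#     engine.cleanup()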


@contextlib.contextmanager
def progress_tracker(progress: Optional[gr.Progress]):
    yield progress if progress else None


ENGINE: Optional[DeepSeekOCREngine] = None


def get_engine() -> DeepSeekOCREngine:
    global ENGINE
    if ENGINE is None:
        use_vllm_env = os.getenv("USE_VLLM", "1").strip().lower()
        enable_vllm = use_vllm_env not in {"0", "false", "no"}
        LOGGER.info("Instantiating DeepSeek-OCR engine (vLLM=%s)", enable_vllm)
        ENGINE = DeepSeekOCREngine(enable_vllm=enable_vllm)
    return ENGINE


def handle_upload(
    file: gr.File | None,
    prompt: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> str:
    if file is None:
        raise gr.Error("Please upload an image or PDF file to start OCR.")

    uploaded_path = Path(file.name)
    fd, tmp_path_str = tempfile.mkstemp(
        prefix="deepseek_upload_",
        suffix=uploaded_path.suffix,
    )
    os.close(fd)
    tmp_copy = Path(tmp_path_str)
    shutil.copy(uploaded_path, tmp_copy)

    engine = get_engine()
    try:
        with progress_tracker(progress) as tracker:
            result = engine.process_document(tmp_copy, prompt=prompt, progress=tracker)
    finally:
        tmp_copy.unlink(missing_ok=True)
    return result.to_markdown()


def build_interface() -> gr.Blocks:
    description = (
        "Upload an image or PDF and DeepSeek-OCR will transcribe it into Markdown. "
        "Optimized for Hugging Face free T4 GPU Spaces with flash-attention and "
        "optional vLLM acceleration."
    )

    with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# DeepSeek-OCR PDF & Image Reader")
        gr.Markdown(description)
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload document",
                    file_count="single",
                    type="file",
                    file_types=[
                        ".png",
                        ".jpg",
                        ".jpeg",
                        ".pdf",
                        ".bmp",
                        ".webp",
                        ".tiff",
                        ".tif",
                    ],
                )
                prompt_box = gr.Textbox(
                    label="Prompt",
                    value=DEFAULT_PROMPT,
                    lines=3,
                    show_label=True,
                    placeholder="Enter the grounding instruction for OCR",
                )
                submit_btn = gr.Button("Run OCR", variant="primary")
            with gr.Column(scale=1):
                result_output = gr.Markdown(label="OCR Markdown Output")

        submit_btn.click(
            fn=handle_upload,
            inputs=[file_input, prompt_box],
            outputs=[result_output],
        )

    return demo


demo = build_interface()

if __name__ == "__main__":
    demo.queue(concurrency_count=2).launch()
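
# Runtime dependencies implied by the imports above (a suggested, unpinned list
# for the Space's requirements.txt; exact versions are not specified here):
# gradio, torch, transformers, pillow, pymupdf, and optionally vllm and
# flash-attn.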