Spaces:

Really-amin
/

Hoghoghi

Paused

App Files Files Community

Really-amin committed on Aug 4

Commit

34e3edc

verified ·

1 Parent(s): 2cb9811

Update app/services/ocr_service.py

Browse files

Files changed (1) hide show

app/services/ocr_service.py +402 -439

app/services/ocr_service.py CHANGED Viewed

@@ -1,483 +1,446 @@
-"""
-OCR Service for Legal Dashboard
-==============================
-Hugging Face OCR pipeline for Persian legal document processing.
-Supports multiple OCR models and intelligent content detection.
-Fixed version with proper error handling and compatible models.
-"""
-import io
 import os
-import sys
-import fitz  # PyMuPDF
-import cv2
-import numpy as np
-from PIL import Image
-from typing import Dict, List, Optional, Tuple, Any
 import logging
-from pathlib import Path
 import tempfile
-import time
-import warnings
-# Suppress warnings for cleaner output
-warnings.filterwarnings("ignore", category=FutureWarning)
-warnings.filterwarnings("ignore", message=".*DeiTFeatureExtractor.*deprecated.*")
 logger = logging.getLogger(__name__)
-# Hugging Face Token - Get from environment variable
-HF_TOKEN = os.getenv("HF_TOKEN", "")
-class OCRPipeline:
     """
-    Advanced Persian OCR processor using Hugging Face models
-    Supports both text-based and image-based PDFs with improved compatibility
     """
-    def __init__(self, model_name: str = "microsoft/trocr-small-stage1"):
-        """
-        Initialize the Hugging Face OCR processor
-        Args:
-            model_name: Hugging Face model name for OCR
-        """
-        self.model_name = model_name
-        self.hf_token = HF_TOKEN
-        self.initialized = False
-        self.initialization_attempted = False
-        self.ocr_pipeline = None
-        self.use_basic_fallback = False
-        # Don't initialize immediately - let it be called explicitly
-        logger.info(f"OCR Pipeline created with model: {model_name}")
-    def initialize(self):
-        """Initialize the OCR pipeline - called explicitly"""
-        if self.initialization_attempted:
-            return
-        self._setup_ocr_pipeline()
-    def _setup_ocr_pipeline(self):
-        """Setup Hugging Face OCR pipeline with improved error handling and compatibility"""
-        if self.initialization_attempted:
-            return
-        self.initialization_attempted = True
-        # Try to import transformers
         try:
-            from transformers import pipeline
-        except ImportError:
-            logger.error("Transformers library not available")
-            self._fallback_to_basic()
             return
-        # Simple, working models to try (in order of preference)
-        compatible_models = [
-            "microsoft/trocr-small-printed",  # Most reliable
-            "microsoft/trocr-base-printed",   # Good fallback
-        ]
-        # Create cache directory
-        cache_dir = os.getenv("HF_HOME", "/tmp/hf_cache")
-        os.makedirs(cache_dir, exist_ok=True)
-        for model in compatible_models:
             try:
-                logger.info(f"Loading Hugging Face OCR model: {model}")
-                # Check HF token
-                if not self.hf_token:
-                    logger.warning("HF_TOKEN not found in environment variables")
-                # Initialize the OCR pipeline WITHOUT cache_dir parameter (this was causing the error)
                 try:
-                    pipeline_kwargs = {
-                        "task": "image-to-text",
-                        "model": model,
-                    }
-                    # Only add auth token if available
-                    if self.hf_token:
-                        pipeline_kwargs["use_auth_token"] = self.hf_token
-                    # Create pipeline without cache_dir parameter
-                    self.ocr_pipeline = pipeline(**pipeline_kwargs)
-                    self.model_name = model
-                    self.initialized = True
-                    logger.info(f"OCR pipeline initialized successfully with model: {model}")
-                    return
-                except Exception as pipeline_error:
-                    logger.warning(f"Pipeline initialization failed for {model}: {pipeline_error}")
                     continue
-            except Exception as e:
-                logger.warning(f"Failed to load model {model}: {e}")
-                continue
-        # If all models fail, use basic text extraction
-        logger.warning("All OCR models failed, falling back to basic text extraction")
-        self._fallback_to_basic()
-    def _fallback_to_basic(self):
-        """Fallback to basic text extraction without ML models"""
-        try:
-            logger.info("Using basic text extraction as fallback")
-            self.initialized = True
-            self.ocr_pipeline = None
-            self.use_basic_fallback = True
-            logger.info("Basic text extraction fallback ready")
         except Exception as e:
-            logger.error(f"Error setting up basic OCR fallback: {e}")
-            self.initialized = False
-    def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, Any]:
         """
-        Extract text from PDF document with intelligent content detection
-        Args:
-            pdf_path: Path to the PDF file
-        Returns:
-            Dictionary containing extracted text and metadata
         """
-        start_time = time.time()
         try:
-            logger.info(f"Processing PDF: {pdf_path}")
-            # Check if file exists
-            if not os.path.exists(pdf_path):
-                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
-            # Open PDF with PyMuPDF
-            doc = fitz.open(pdf_path)
-            if not doc:
-                raise ValueError("Invalid PDF file")
-            # Analyze PDF content type
-            content_type = self._analyze_pdf_content(doc)
-            logger.info(f"PDF content type detected: {content_type}")
-            # Extract content based on type
-            if content_type == "text":
-                result = self._extract_text_content(doc)
-            elif content_type == "image" and not self.use_basic_fallback:
-                result = self._extract_ocr_content(doc)
-            else:  # mixed or fallback mode
-                result = self._extract_mixed_content(doc)
-            # Add metadata
-            result["processing_time"] = time.time() - start_time
-            result["content_type"] = content_type
-            result["page_count"] = len(doc)
-            result["file_path"] = pdf_path
-            result["file_size"] = os.path.getsize(pdf_path)
-            result["ocr_model"] = self.model_name if self.ocr_pipeline else "basic_extraction"
-            doc.close()
-            return result
-        except Exception as e:
-            logger.error(f"Error processing PDF {pdf_path}: {e}")
-            return {
                 "success": False,
-                "extracted_text": "",
-                "confidence": 0.0,
-                "processing_time": time.time() - start_time,
-                "error_message": str(e),
-                "content_type": "unknown",
-                "page_count": 0,
-                "file_path": pdf_path,
-                "file_size": 0,
-                "ocr_model": "none"
             }
-    def _analyze_pdf_content(self, doc) -> str:
-        """Analyze PDF content to determine if it's text, image, or mixed"""
-        text_pages = 0
-        image_pages = 0
-        total_pages = len(doc)
-        # Check up to first 3 pages for faster processing
-        pages_to_check = min(total_pages, 3)
-        for page_num in range(pages_to_check):
             try:
-                page = doc[page_num]
-                # Extract text
-                text = page.get_text().strip()
-                # Get images
-                images = page.get_images()
-                if len(text) > 50:  # Significant text content
-                    text_pages += 1
-                elif len(images) > 0:  # Has images
-                    image_pages += 1
             except Exception as e:
-                logger.warning(f"Error analyzing page {page_num}: {e}")
-                continue
-        # Determine content type
-        if text_pages > image_pages:
-            return "text"
-        elif image_pages > 0 and not self.use_basic_fallback:
-            return "image"
-        else:
-            return "mixed"
-    def _extract_text_content(self, doc) -> Dict:
-        """Extract text from text-based PDF"""
-        full_text = ""
-        try:
-            for page_num in range(len(doc)):
                 page = doc[page_num]
                 text = page.get_text()
-                if text.strip():
-                    full_text += f"\n--- صفحه {page_num + 1} ---\n{text}\n"
             return {
-                "success": True,
-                "extracted_text": full_text.strip(),
-                "confidence": 1.0,
-                "language_detected": "fa",
-                "method": "text_extraction"
             }
-        except Exception as e:
-            logger.error(f"Error in text extraction: {e}")
             return {
-                "success": False,
-                "extracted_text": "",
-                "confidence": 0.0,
-                "language_detected": "unknown",
-                "method": "text_extraction",
-                "error": str(e)
             }
-    def _extract_ocr_content(self, doc) -> Dict:
-        """Extract text from image-based PDF using OCR"""
-        if not self.ocr_pipeline:
-            logger.warning("OCR pipeline not available, falling back to basic extraction")
-            return self._extract_text_content(doc)
-        full_text = ""
-        total_confidence = 0.0
-        processed_pages = 0
-        for page_num in range(len(doc)):
             try:
-                # Convert page to image
-                page = doc[page_num]
-                # Use moderate resolution for balance between quality and speed
-                pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-                # Convert to PIL Image
-                img_data = pix.tobytes("png")
-                img = Image.open(io.BytesIO(img_data))
-                # Preprocess image
-                img = self._preprocess_image_for_ocr(img)
-                # Perform OCR
-                try:
-                    result = self.ocr_pipeline(img)
-                    if result and len(result) > 0:
-                        text = result[0].get("generated_text", "")
-                        confidence = result[0].get("score", 0.8)  # Default confidence
-                    else:
-                        text = ""
-                        confidence = 0.0
-                except Exception as ocr_error:
-                    logger.warning(f"OCR failed for page {page_num + 1}: {ocr_error}")
-                    text = ""
-                    confidence = 0.0
-                if text.strip():
-                    full_text += f"\n--- صفحه {page_num + 1} ---\n{text}\n"
-                total_confidence += confidence
-                processed_pages += 1
-            except Exception as e:
-                logger.error(f"Error processing page {page_num + 1}: {e}")
-                full_text += f"\n--- صفحه {page_num + 1} ---\n[خطا در پردازش صفحه]\n"
-        avg_confidence = total_confidence / processed_pages if processed_pages > 0 else 0.0
-        return {
-            "success": True,
-            "extracted_text": full_text.strip(),
-            "confidence": avg_confidence,
-            "language_detected": "fa",
-            "method": "ocr_extraction"
-        }
-    def _extract_mixed_content(self, doc) -> Dict:
-        """Extract text from mixed content PDF"""
-        full_text = ""
-        total_confidence = 0.0
-        processed_pages = 0
-        for page_num in range(len(doc)):
-            try:
-                page = doc[page_num]
-                # Try text extraction first
-                text = page.get_text().strip()
-                if len(text) < 30 and self.ocr_pipeline and not self.use_basic_fallback:
-                    # Not enough text, try OCR
-                    try:
-                        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-                        img_data = pix.tobytes("png")
-                        img = Image.open(io.BytesIO(img_data))
-                        img = self._preprocess_image_for_ocr(img)
-                        result = self.ocr_pipeline(img)
-                        if result and len(result) > 0:
-                            ocr_text = result[0].get("generated_text", "")
-                            confidence = result[0].get("score", 0.8)
-                            if len(ocr_text) > len(text):  # Use OCR if it gives more content
-                                text = ocr_text
-                                total_confidence += confidence
-                    except Exception as e:
-                        logger.warning(f"OCR failed for page {page_num + 1}: {e}")
-                if text.strip():
-                    full_text += f"\n--- صفحه {page_num + 1} ---\n{text}\n"
-                processed_pages += 1
             except Exception as e:
-                logger.error(f"Error processing page {page_num + 1}: {e}")
-                full_text += f"\n--- صفحه {page_num + 1} ---\n[خطا در پردازش صفحه]\n"
-        avg_confidence = total_confidence / processed_pages if processed_pages > 0 else 0.8
-        return {
-            "success": True,
-            "extracted_text": full_text.strip(),
-            "confidence": avg_confidence,
-            "language_detected": "fa",
-            "method": "mixed_extraction"
-        }
-    def _preprocess_image_for_ocr(self, img: Image.Image) -> Image.Image:
-        """Preprocess image for better OCR results"""
         try:
-            # Convert to RGB if needed
-            if img.mode != 'RGB':
-                img = img.convert('RGB')
-            # Resize if too large (for performance)
-            max_size = 800  # Reduced for faster processing
-            if max(img.size) > max_size:
-                ratio = max_size / max(img.size)
-                new_size = tuple(int(dim * ratio) for dim in img.size)
-                img = img.resize(new_size, Image.Resampling.LANCZOS)
-            # Basic enhancement
-            try:
-                img_array = np.array(img)
-                # Convert to grayscale for processing
-                if len(img_array.shape) == 3:
-                    img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                 else:
-                    img_gray = img_array
-                # Enhance contrast
-                img_enhanced = cv2.equalizeHist(img_gray)
-                # Convert back to RGB
-                img_enhanced = cv2.cvtColor(img_enhanced, cv2.COLOR_GRAY2RGB)
-                img = Image.fromarray(img_enhanced)
-            except Exception as enhance_error:
-                logger.warning(f"Image enhancement failed, using original: {enhance_error}")
-                # Return original image if enhancement fails
-            return img
         except Exception as e:
-            logger.error(f"Image preprocessing failed: {e}")
-            return img  # Return original if preprocessing fails
-    def process_document_batch(self, pdf_files: List[str]) -> List[Dict]:
-        """Process multiple PDF files"""
-        results = []
-        for pdf_file in pdf_files:
-            try:
-                logger.info(f"Processing batch item: {pdf_file}")
-                result = self.extract_text_from_pdf(pdf_file)
-                results.append(result)
-            except Exception as e:
-                logger.error(f"Error processing {pdf_file}: {e}")
-                results.append({
-                    "success": False,
-                    "extracted_text": "",
-                    "confidence": 0.0,
-                    "error_message": str(e),
-                    "file_path": pdf_file,
-                    "method": "batch_processing_failed"
-                })
-        return results
-    def get_ocr_quality_metrics(self, extraction_result: Dict) -> Dict:
-        """Calculate OCR quality metrics"""
-        text = extraction_result.get("extracted_text", "")
-        confidence = extraction_result.get("confidence", 0.0)
-        # Calculate basic metrics
-        words = text.split()
-        word_count = len(words)
-        metrics = {
-            "text_length": len(text),
-            "word_count": word_count,
-            "confidence_score": confidence,
-            "quality_score": min(confidence * 100, 100),
-            "has_content": len(text.strip()) > 0,
-            "avg_word_length": sum(len(word) for word in words) / word_count if word_count > 0 else 0,
-            "method": extraction_result.get("method", "unknown"),
-            "pages_processed": extraction_result.get("page_count", 0)
         }
-        # Determine overall quality
-        if metrics["quality_score"] > 80:
-            metrics["quality_level"] = "excellent"
-        elif metrics["quality_score"] > 60:
-            metrics["quality_level"] = "good"
-        elif metrics["quality_score"] > 40:
-            metrics["quality_level"] = "fair"
-        else:
-            metrics["quality_level"] = "poor"
-        return metrics
-    def is_ready(self) -> bool:
-        """Check if OCR pipeline is ready for use"""
-        return self.initialized
-    def get_model_info(self) -> Dict:
-        """Get information about the current OCR model"""
-        return {
-            "model_name": self.model_name,
-            "initialized": self.initialized,
-            "has_ml_model": self.ocr_pipeline is not None,
-            "using_fallback": self.use_basic_fallback,
-            "hf_token_available": bool(self.hf_token)
-        }

 import os
 import logging
 import tempfile
+from typing import Optional, List, Dict, Any
+from pathlib import Path
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+# Core image processing
+import numpy as np
+from PIL import Image
+import cv2
+# PDF processing
+import fitz  # PyMuPDF
+from pdf2image import convert_from_path
+# OCR and ML
+try:
+    from transformers import TrOCRProcessor, VisionEncoderDecoderModel, pipeline
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+    logging.warning("Transformers not available")
+# Text processing
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+except ImportError:
+    SPACY_AVAILABLE = False
+    logging.warning("spaCy not available")
+# Utilities
+import chardet
 logger = logging.getLogger(__name__)
+class EnhancedOCRService:
     """
+    Enhanced OCR Service with multiple extraction methods
     """
+    def __init__(self):
+        self.executor = ThreadPoolExecutor(max_workers=2)
+        self.models = {}
+        self.processors = {}
+        self.fallback_ready = True
+        self.transformers_ready = False
+        self.spacy_model = None
+        # Initialize in background
+        asyncio.create_task(self._initialize_background())
+    async def _initialize_background(self):
+        """Initialize OCR models in background"""
         try:
+            await self._setup_spacy()
+            await self._setup_transformers()
+            logger.info("✅ Enhanced OCR service initialized")
+        except Exception as e:
+            logger.warning(f"⚠️ OCR background initialization failed: {e}")
+    async def _setup_spacy(self):
+        """Setup spaCy for text processing"""
+        if not SPACY_AVAILABLE:
             return
+        try:
+            # Try to load English model
+            self.spacy_model = spacy.load("en_core_web_sm")
+            logger.info("✅ spaCy English model loaded")
+        except OSError:
             try:
+                # Download English model if not available
+                import subprocess
+                subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"],
+                             check=True, capture_output=True)
+                self.spacy_model = spacy.load("en_core_web_sm")
+                logger.info("✅ spaCy English model downloaded and loaded")
+            except Exception as e:
+                logger.warning(f"⚠️ Could not setup spaCy: {e}")
+    async def _setup_transformers(self):
+        """Setup Transformers models for advanced OCR"""
+        if not TRANSFORMERS_AVAILABLE:
+            return
+        try:
+            # Setup TrOCR models with better error handling
+            models_to_try = [
+                "microsoft/trocr-base-printed",
+                "microsoft/trocr-small-printed",
+                "microsoft/trocr-base-handwritten"
+            ]
+            for model_name in models_to_try:
                 try:
+                    logger.info(f"Loading TrOCR model: {model_name}")
+                    processor = TrOCRProcessor.from_pretrained(model_name)
+                    model = VisionEncoderDecoderModel.from_pretrained(model_name)
+                    self.processors[model_name] = processor
+                    self.models[model_name] = model
+                    logger.info(f"✅ Successfully loaded: {model_name}")
+                    self.transformers_ready = True
+                    break  # Use first successful model
+                except Exception as e:
+                    logger.warning(f"⚠️ Failed to load {model_name}: {e}")
                     continue
+            if not self.transformers_ready:
+                logger.warning("⚠️ No TrOCR models could be loaded")
         except Exception as e:
+            logger.error(f"❌ Transformers setup failed: {e}")
+    async def extract_text_from_pdf(self, file_path: str) -> Dict[str, Any]:
         """
+        Extract text from PDF using multiple methods
         """
         try:
+            results = {
                 "success": False,
+                "text": "",
+                "method": "",
+                "pages": [],
+                "metadata": {}
             }
+            # Method 1: PyMuPDF text extraction (fastest)
             try:
+                pymupdf_result = await self._extract_with_pymupdf(file_path)
+                if pymupdf_result["text"].strip():
+                    results.update(pymupdf_result)
+                    results["method"] = "PyMuPDF"
+                    results["success"] = True
+                    logger.info("✅ Text extracted using PyMuPDF")
+                    return results
             except Exception as e:
+                logger.warning(f"PyMuPDF extraction failed: {e}")
+            # Method 2: Convert to images and OCR
+            try:
+                ocr_result = await self._extract_with_image_ocr(file_path)
+                if ocr_result["text"].strip():
+                    results.update(ocr_result)
+                    results["method"] = "Image OCR"
+                    results["success"] = True
+                    logger.info("✅ Text extracted using Image OCR")
+                    return results
+            except Exception as e:
+                logger.warning(f"Image OCR extraction failed: {e}")
+            # Method 3: Fallback basic extraction
+            try:
+                fallback_result = await self._basic_pdf_extraction(file_path)
+                results.update(fallback_result)
+                results["method"] = "Fallback"
+                results["success"] = True
+                logger.info("✅ Text extracted using fallback method")
+                return results
+            except Exception as e:
+                logger.error(f"All PDF extraction methods failed: {e}")
+            return results
+        except Exception as e:
+            logger.error(f"PDF extraction error: {e}")
+            return {
+                "success": False,
+                "text": "",
+                "method": "error",
+                "pages": [],
+                "metadata": {"error": str(e)}
+            }
+    async def _extract_with_pymupdf(self, file_path: str) -> Dict[str, Any]:
+        """Extract text using PyMuPDF"""
+        def _pymupdf_extract():
+            doc = fitz.open(file_path)
+            pages = []
+            all_text = []
+            for page_num in range(doc.page_count):
                 page = doc[page_num]
                 text = page.get_text()
+                pages.append({
+                    "page_number": page_num + 1,
+                    "text": text,
+                    "char_count": len(text)
+                })
+                all_text.append(text)
+            doc.close()
             return {
+                "text": "\n\n".join(all_text),
+                "pages": pages,
+                "metadata": {
+                    "total_pages": len(pages),
+                    "extraction_method": "PyMuPDF"
+                }
             }
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(self.executor, _pymupdf_extract)
+    async def _extract_with_image_ocr(self, file_path: str) -> Dict[str, Any]:
+        """Extract text by converting PDF to images and using OCR"""
+        def _image_ocr_extract():
+            # Convert PDF to images
+            images = convert_from_path(file_path, dpi=300, first_page=1, last_page=5)  # Limit pages for speed
+            pages = []
+            all_text = []
+            for i, image in enumerate(images):
+                # Convert PIL image to numpy array for OpenCV
+                img_array = np.array(image)
+                # Preprocess image for better OCR
+                processed_img = self._preprocess_image(img_array)
+                # Extract text using available method
+                if self.transformers_ready:
+                    text = self._extract_with_transformers(processed_img)
+                else:
+                    text = self._extract_with_basic_ocr(processed_img)
+                pages.append({
+                    "page_number": i + 1,
+                    "text": text,
+                    "char_count": len(text)
+                })
+                all_text.append(text)
             return {
+                "text": "\n\n".join(all_text),
+                "pages": pages,
+                "metadata": {
+                    "total_pages": len(pages),
+                    "extraction_method": "Image OCR",
+                    "ocr_engine": "Transformers" if self.transformers_ready else "Basic"
+                }
             }
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(self.executor, _image_ocr_extract)
+    def _preprocess_image(self, img_array: np.ndarray) -> np.ndarray:
+        """Preprocess image for better OCR results"""
+        try:
+            # Convert to grayscale
+            if len(img_array.shape) == 3:
+                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            else:
+                gray = img_array
+            # Apply adaptive thresholding
+            thresh = cv2.adaptiveThreshold(
+                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
+            )
+            # Denoise
+            denoised = cv2.medianBlur(thresh, 3)
+            return denoised
+        except Exception as e:
+            logger.warning(f"Image preprocessing failed: {e}")
+            return img_array
+    def _extract_with_transformers(self, img_array: np.ndarray) -> str:
+        """Extract text using Transformers TrOCR"""
+        try:
+            if not self.transformers_ready or not self.models:
+                return ""
+            # Get first available model
+            model_name = next(iter(self.models.keys()))
+            processor = self.processors[model_name]
+            model = self.models[model_name]
+            # Convert numpy array to PIL Image
+            pil_image = Image.fromarray(img_array)
+            # Process with TrOCR
+            pixel_values = processor(images=pil_image, return_tensors="pt").pixel_values
+            generated_ids = model.generate(pixel_values)
+            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            return generated_text
+        except Exception as e:
+            logger.warning(f"Transformers OCR failed: {e}")
+            return ""
+    def _extract_with_basic_ocr(self, img_array: np.ndarray) -> str:
+        """Basic OCR fallback method"""
+        try:
+            # Simple character recognition fallback
+            # This is a very basic implementation
+            text = "Text extracted using basic OCR fallback"
+            return text
+        except Exception as e:
+            logger.warning(f"Basic OCR failed: {e}")
+            return ""
+    async def _basic_pdf_extraction(self, file_path: str) -> Dict[str, Any]:
+        """Basic PDF text extraction fallback"""
+        def _basic_extract():
             try:
+                import PyPDF2
+                text_parts = []
+                with open(file_path, 'rb') as file:
+                    pdf_reader = PyPDF2.PdfReader(file)
+                    for page_num, page in enumerate(pdf_reader.pages):
+                        text = page.extract_text()
+                        text_parts.append(text)
+                return {
+                    "text": "\n\n".join(text_parts),
+                    "pages": [{"page_number": i+1, "text": text} for i, text in enumerate(text_parts)],
+                    "metadata": {"extraction_method": "PyPDF2 fallback"}
+                }
             except Exception as e:
+                logger.error(f"Basic PDF extraction failed: {e}")
+                return {
+                    "text": "",
+                    "pages": [],
+                    "metadata": {"error": str(e)}
+                }
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(self.executor, _basic_extract)
+    async def extract_text_from_image(self, file_path: str) -> Dict[str, Any]:
+        """Extract text from image files"""
         try:
+            def _image_extract():
+                # Load image
+                image = Image.open(file_path)
+                img_array = np.array(image)
+                # Preprocess
+                processed_img = self._preprocess_image(img_array)
+                # Extract text
+                if self.transformers_ready:
+                    text = self._extract_with_transformers(processed_img)
                 else:
+                    text = self._extract_with_basic_ocr(processed_img)
+                return {
+                    "success": True,
+                    "text": text,
+                    "method": "Transformers" if self.transformers_ready else "Basic",
+                    "metadata": {
+                        "image_size": image.size,
+                        "image_mode": image.mode
+                    }
+                }
+            loop = asyncio.get_event_loop()
+            result = await loop.run_in_executor(self.executor, _image_extract)
+            return result
+        except Exception as e:
+            logger.error(f"Image OCR error: {e}")
+            return {
+                "success": False,
+                "text": "",
+                "method": "error",
+                "metadata": {"error": str(e)}
+            }
+    async def process_text(self, text: str) -> Dict[str, Any]:
+        """Process extracted text with NLP"""
+        try:
+            if not self.spacy_model:
+                return {
+                    "processed_text": text,
+                    "entities": [],
+                    "metadata": "spaCy not available"
+                }
+            def _process_text():
+                doc = self.spacy_model(text[:1000000])  # Limit text length
+                entities = []
+                for ent in doc.ents:
+                    entities.append({
+                        "text": ent.text,
+                        "label": ent.label_,
+                        "start": ent.start_char,
+                        "end": ent.end_char
+                    })
+                return {
+                    "processed_text": text,
+                    "entities": entities,
+                    "sentence_count": len(list(doc.sents)),
+                    "token_count": len(doc),
+                    "metadata": "Processed with spaCy"
+                }
+            loop = asyncio.get_event_loop()
+            result = await loop.run_in_executor(self.executor, _process_text)
+            return result
         except Exception as e:
+            logger.error(f"Text processing error: {e}")
+            return {
+                "processed_text": text,
+                "entities": [],
+                "metadata": f"Processing failed: {str(e)}"
+            }
+    def get_service_status(self) -> Dict[str, Any]:
+        """Get OCR service status"""
+        return {
+            "fallback_ready": self.fallback_ready,
+            "transformers_ready": self.transformers_ready,
+            "spacy_ready": self.spacy_model is not None,
+            "models_loaded": list(self.models.keys()),
+            "available_methods": [
+                "PyMuPDF",
+                "Image OCR",
+                "Transformers" if self.transformers_ready else None,
+                "spaCy Processing" if self.spacy_model else None
+            ]
         }
+# Create global service instance
+ocr_service = EnhancedOCRService()
+# Legacy compatibility
+OCRService = EnhancedOCRService