Really-amin committed on
Commit 34e3edc · verified · 1 Parent(s): 2cb9811

Update app/services/ocr_service.py

Files changed (1)
  1. app/services/ocr_service.py +402 -439
app/services/ocr_service.py CHANGED
@@ -1,483 +1,446 @@
-"""
-OCR Service for Legal Dashboard
-==============================
-
-Hugging Face OCR pipeline for Persian legal document processing.
-Supports multiple OCR models and intelligent content detection.
-Fixed version with proper error handling and compatible models.
-"""
-
-import io
 import os
-import sys
-import fitz  # PyMuPDF
-import cv2
-import numpy as np
-from PIL import Image
-from typing import Dict, List, Optional, Tuple, Any
 import logging
-from pathlib import Path
 import tempfile
-import time
-import warnings

-# Suppress warnings for cleaner output
-warnings.filterwarnings("ignore", category=FutureWarning)
-warnings.filterwarnings("ignore", message=".*DeiTFeatureExtractor.*deprecated.*")

 logger = logging.getLogger(__name__)

-# Hugging Face Token - Get from environment variable
-HF_TOKEN = os.getenv("HF_TOKEN", "")
-
-class OCRPipeline:
     """
-    Advanced Persian OCR processor using Hugging Face models
-    Supports both text-based and image-based PDFs with improved compatibility
     """
-
-    def __init__(self, model_name: str = "microsoft/trocr-small-stage1"):
-        """
-        Initialize the Hugging Face OCR processor
-
-        Args:
-            model_name: Hugging Face model name for OCR
-        """
-        self.model_name = model_name
-        self.hf_token = HF_TOKEN
-        self.initialized = False
-        self.initialization_attempted = False
-        self.ocr_pipeline = None
-        self.use_basic_fallback = False
-
-        # Don't initialize immediately - let it be called explicitly
-        logger.info(f"OCR Pipeline created with model: {model_name}")
-
-    def initialize(self):
-        """Initialize the OCR pipeline - called explicitly"""
-        if self.initialization_attempted:
-            return
-
-        self._setup_ocr_pipeline()
-
-    def _setup_ocr_pipeline(self):
-        """Setup Hugging Face OCR pipeline with improved error handling and compatibility"""
-        if self.initialization_attempted:
-            return
-
-        self.initialization_attempted = True
-
-        # Try to import transformers
         try:
-            from transformers import pipeline
-        except ImportError:
-            logger.error("Transformers library not available")
-            self._fallback_to_basic()
             return
-
-        # Simple, working models to try (in order of preference)
-        compatible_models = [
-            "microsoft/trocr-small-printed",  # Most reliable
-            "microsoft/trocr-base-printed",   # Good fallback
-        ]
-
-        # Create cache directory
-        cache_dir = os.getenv("HF_HOME", "/tmp/hf_cache")
-        os.makedirs(cache_dir, exist_ok=True)
-
-        for model in compatible_models:
             try:
-                logger.info(f"Loading Hugging Face OCR model: {model}")
-
-                # Check HF token
-                if not self.hf_token:
-                    logger.warning("HF_TOKEN not found in environment variables")
-
-                # Initialize the OCR pipeline WITHOUT cache_dir parameter (this was causing the error)
                 try:
-                    pipeline_kwargs = {
-                        "task": "image-to-text",
-                        "model": model,
-                    }

-                    # Only add auth token if available
-                    if self.hf_token:
-                        pipeline_kwargs["use_auth_token"] = self.hf_token

-                    # Create pipeline without cache_dir parameter
-                    self.ocr_pipeline = pipeline(**pipeline_kwargs)
-
-                    self.model_name = model
-                    self.initialized = True
-                    logger.info(f"OCR pipeline initialized successfully with model: {model}")
-                    return
-
-                except Exception as pipeline_error:
-                    logger.warning(f"Pipeline initialization failed for {model}: {pipeline_error}")
                     continue
-
-            except Exception as e:
-                logger.warning(f"Failed to load model {model}: {e}")
-                continue
-
-        # If all models fail, use basic text extraction
-        logger.warning("All OCR models failed, falling back to basic text extraction")
-        self._fallback_to_basic()
-
-    def _fallback_to_basic(self):
-        """Fallback to basic text extraction without ML models"""
-        try:
-            logger.info("Using basic text extraction as fallback")
-            self.initialized = True
-            self.ocr_pipeline = None
-            self.use_basic_fallback = True
-            logger.info("Basic text extraction fallback ready")
         except Exception as e:
-            logger.error(f"Error setting up basic OCR fallback: {e}")
-            self.initialized = False
-
-    def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, Any]:
         """
-        Extract text from PDF document with intelligent content detection
-
-        Args:
-            pdf_path: Path to the PDF file
-
-        Returns:
-            Dictionary containing extracted text and metadata
         """
-        start_time = time.time()
-
         try:
-            logger.info(f"Processing PDF: {pdf_path}")
-
-            # Check if file exists
-            if not os.path.exists(pdf_path):
-                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
-
-            # Open PDF with PyMuPDF
-            doc = fitz.open(pdf_path)
-
-            if not doc:
-                raise ValueError("Invalid PDF file")
-
-            # Analyze PDF content type
-            content_type = self._analyze_pdf_content(doc)
-            logger.info(f"PDF content type detected: {content_type}")
-
-            # Extract content based on type
-            if content_type == "text":
-                result = self._extract_text_content(doc)
-            elif content_type == "image" and not self.use_basic_fallback:
-                result = self._extract_ocr_content(doc)
-            else:  # mixed or fallback mode
-                result = self._extract_mixed_content(doc)
-
-            # Add metadata
-            result["processing_time"] = time.time() - start_time
-            result["content_type"] = content_type
-            result["page_count"] = len(doc)
-            result["file_path"] = pdf_path
-            result["file_size"] = os.path.getsize(pdf_path)
-            result["ocr_model"] = self.model_name if self.ocr_pipeline else "basic_extraction"
-
-            doc.close()
-            return result
-
-        except Exception as e:
-            logger.error(f"Error processing PDF {pdf_path}: {e}")
-            return {
                 "success": False,
-                "extracted_text": "",
-                "confidence": 0.0,
-                "processing_time": time.time() - start_time,
-                "error_message": str(e),
-                "content_type": "unknown",
-                "page_count": 0,
-                "file_path": pdf_path,
-                "file_size": 0,
-                "ocr_model": "none"
             }
-
-    def _analyze_pdf_content(self, doc) -> str:
-        """Analyze PDF content to determine if it's text, image, or mixed"""
-        text_pages = 0
-        image_pages = 0
-        total_pages = len(doc)
-
-        # Check up to first 3 pages for faster processing
-        pages_to_check = min(total_pages, 3)
-
-        for page_num in range(pages_to_check):
             try:
-                page = doc[page_num]
-
-                # Extract text
-                text = page.get_text().strip()
-
-                # Get images
-                images = page.get_images()
-
-                if len(text) > 50:  # Significant text content
-                    text_pages += 1
-                elif len(images) > 0:  # Has images
-                    image_pages += 1
-
             except Exception as e:
-                logger.warning(f"Error analyzing page {page_num}: {e}")
-                continue
-
-        # Determine content type
-        if text_pages > image_pages:
-            return "text"
-        elif image_pages > 0 and not self.use_basic_fallback:
-            return "image"
-        else:
-            return "mixed"
-
-    def _extract_text_content(self, doc) -> Dict:
-        """Extract text from text-based PDF"""
-        full_text = ""
-
-        try:
-            for page_num in range(len(doc)):
                 page = doc[page_num]
                 text = page.get_text()
-                if text.strip():
-                    full_text += f"\n--- صفحه {page_num + 1} ---\n{text}\n"
-
             return {
-                "success": True,
-                "extracted_text": full_text.strip(),
-                "confidence": 1.0,
-                "language_detected": "fa",
-                "method": "text_extraction"
             }
-        except Exception as e:
-            logger.error(f"Error in text extraction: {e}")
             return {
-                "success": False,
-                "extracted_text": "",
-                "confidence": 0.0,
-                "language_detected": "unknown",
-                "method": "text_extraction",
-                "error": str(e)
             }
-
-    def _extract_ocr_content(self, doc) -> Dict:
-        """Extract text from image-based PDF using OCR"""
-        if not self.ocr_pipeline:
-            logger.warning("OCR pipeline not available, falling back to basic extraction")
-            return self._extract_text_content(doc)
-
-        full_text = ""
-        total_confidence = 0.0
-        processed_pages = 0
-
-        for page_num in range(len(doc)):
            try:
-                # Convert page to image
-                page = doc[page_num]
-                # Use moderate resolution for balance between quality and speed
-                pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-
-                # Convert to PIL Image
-                img_data = pix.tobytes("png")
-                img = Image.open(io.BytesIO(img_data))
-
-                # Preprocess image
-                img = self._preprocess_image_for_ocr(img)
-
-                # Perform OCR
-                try:
-                    result = self.ocr_pipeline(img)
-                    if result and len(result) > 0:
-                        text = result[0].get("generated_text", "")
-                        confidence = result[0].get("score", 0.8)  # Default confidence
-                    else:
-                        text = ""
-                        confidence = 0.0
-                except Exception as ocr_error:
-                    logger.warning(f"OCR failed for page {page_num + 1}: {ocr_error}")
-                    text = ""
-                    confidence = 0.0
-
-                if text.strip():
-                    full_text += f"\n--- صفحه {page_num + 1} ---\n{text}\n"

-                total_confidence += confidence
-                processed_pages += 1
-
-            except Exception as e:
-                logger.error(f"Error processing page {page_num + 1}: {e}")
-                full_text += f"\n--- صفحه {page_num + 1} ---\n[خطا در پردازش صفحه]\n"
-
-        avg_confidence = total_confidence / processed_pages if processed_pages > 0 else 0.0
-
-        return {
-            "success": True,
-            "extracted_text": full_text.strip(),
-            "confidence": avg_confidence,
-            "language_detected": "fa",
-            "method": "ocr_extraction"
-        }
-
-    def _extract_mixed_content(self, doc) -> Dict:
-        """Extract text from mixed content PDF"""
-        full_text = ""
-        total_confidence = 0.0
-        processed_pages = 0
-
-        for page_num in range(len(doc)):
-            try:
-                page = doc[page_num]
-
-                # Try text extraction first
-                text = page.get_text().strip()
-
-                if len(text) < 30 and self.ocr_pipeline and not self.use_basic_fallback:
-                    # Not enough text, try OCR
-                    try:
-                        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-                        img_data = pix.tobytes("png")
-                        img = Image.open(io.BytesIO(img_data))
-                        img = self._preprocess_image_for_ocr(img)
-
-                        result = self.ocr_pipeline(img)
-                        if result and len(result) > 0:
-                            ocr_text = result[0].get("generated_text", "")
-                            confidence = result[0].get("score", 0.8)
-                            if len(ocr_text) > len(text):  # Use OCR if it gives more content
-                                text = ocr_text
-                                total_confidence += confidence
-
-                    except Exception as e:
-                        logger.warning(f"OCR failed for page {page_num + 1}: {e}")
-
-                if text.strip():
-                    full_text += f"\n--- صفحه {page_num + 1} ---\n{text}\n"

-                processed_pages += 1
-
             except Exception as e:
-                logger.error(f"Error processing page {page_num + 1}: {e}")
-                full_text += f"\n--- صفحه {page_num + 1} ---\n[خطا در پردازش صفحه]\n"
-
-        avg_confidence = total_confidence / processed_pages if processed_pages > 0 else 0.8
-
-        return {
-            "success": True,
-            "extracted_text": full_text.strip(),
-            "confidence": avg_confidence,
-            "language_detected": "fa",
-            "method": "mixed_extraction"
-        }
-
-    def _preprocess_image_for_ocr(self, img: Image.Image) -> Image.Image:
-        """Preprocess image for better OCR results"""
         try:
-            # Convert to RGB if needed
-            if img.mode != 'RGB':
-                img = img.convert('RGB')
-
-            # Resize if too large (for performance)
-            max_size = 800  # Reduced for faster processing
-            if max(img.size) > max_size:
-                ratio = max_size / max(img.size)
-                new_size = tuple(int(dim * ratio) for dim in img.size)
-                img = img.resize(new_size, Image.Resampling.LANCZOS)
-
-            # Basic enhancement
-            try:
-                img_array = np.array(img)

-                # Convert to grayscale for processing
-                if len(img_array.shape) == 3:
-                    img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                 else:
-                    img_gray = img_array

-                # Enhance contrast
-                img_enhanced = cv2.equalizeHist(img_gray)

-                # Convert back to RGB
-                img_enhanced = cv2.cvtColor(img_enhanced, cv2.COLOR_GRAY2RGB)
-                img = Image.fromarray(img_enhanced)

-            except Exception as enhance_error:
-                logger.warning(f"Image enhancement failed, using original: {enhance_error}")
-                # Return original image if enhancement fails
-
-            return img
-
         except Exception as e:
-            logger.error(f"Image preprocessing failed: {e}")
-            return img  # Return original if preprocessing fails
-
-    def process_document_batch(self, pdf_files: List[str]) -> List[Dict]:
-        """Process multiple PDF files"""
-        results = []
-
-        for pdf_file in pdf_files:
-            try:
-                logger.info(f"Processing batch item: {pdf_file}")
-                result = self.extract_text_from_pdf(pdf_file)
-                results.append(result)
-            except Exception as e:
-                logger.error(f"Error processing {pdf_file}: {e}")
-                results.append({
-                    "success": False,
-                    "extracted_text": "",
-                    "confidence": 0.0,
-                    "error_message": str(e),
-                    "file_path": pdf_file,
-                    "method": "batch_processing_failed"
-                })
-
-        return results
-
-    def get_ocr_quality_metrics(self, extraction_result: Dict) -> Dict:
-        """Calculate OCR quality metrics"""
-        text = extraction_result.get("extracted_text", "")
-        confidence = extraction_result.get("confidence", 0.0)
-
-        # Calculate basic metrics
-        words = text.split()
-        word_count = len(words)
-
-        metrics = {
-            "text_length": len(text),
-            "word_count": word_count,
-            "confidence_score": confidence,
-            "quality_score": min(confidence * 100, 100),
-            "has_content": len(text.strip()) > 0,
-            "avg_word_length": sum(len(word) for word in words) / word_count if word_count > 0 else 0,
-            "method": extraction_result.get("method", "unknown"),
-            "pages_processed": extraction_result.get("page_count", 0)
         }

-        # Determine overall quality
-        if metrics["quality_score"] > 80:
-            metrics["quality_level"] = "excellent"
-        elif metrics["quality_score"] > 60:
-            metrics["quality_level"] = "good"
-        elif metrics["quality_score"] > 40:
-            metrics["quality_level"] = "fair"
-        else:
-            metrics["quality_level"] = "poor"
-
-        return metrics

-    def is_ready(self) -> bool:
-        """Check if OCR pipeline is ready for use"""
-        return self.initialized
-
-    def get_model_info(self) -> Dict:
-        """Get information about the current OCR model"""
-        return {
-            "model_name": self.model_name,
-            "initialized": self.initialized,
-            "has_ml_model": self.ocr_pipeline is not None,
-            "using_fallback": self.use_basic_fallback,
-            "hf_token_available": bool(self.hf_token)
-        }
 import os
 import logging
 import tempfile
+from typing import Optional, List, Dict, Any
+from pathlib import Path
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+
+# Core image processing
+import numpy as np
+from PIL import Image
+import cv2

+# PDF processing
+import fitz  # PyMuPDF
+from pdf2image import convert_from_path
+
+# OCR and ML
+try:
+    from transformers import TrOCRProcessor, VisionEncoderDecoderModel, pipeline
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+    logging.warning("Transformers not available")
+
+# Text processing
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+except ImportError:
+    SPACY_AVAILABLE = False
+    logging.warning("spaCy not available")
+
+# Utilities
+import chardet

 logger = logging.getLogger(__name__)

+class EnhancedOCRService:
     """
+    Enhanced OCR Service with multiple extraction methods
     """
+
+    def __init__(self):
+        self.executor = ThreadPoolExecutor(max_workers=2)
+        self.models = {}
+        self.processors = {}
+        self.fallback_ready = True
+        self.transformers_ready = False
+        self.spacy_model = None
+
+        # Initialize in background
+        asyncio.create_task(self._initialize_background())
+
+    async def _initialize_background(self):
+        """Initialize OCR models in background"""
         try:
+            await self._setup_spacy()
+            await self._setup_transformers()
+            logger.info("✅ Enhanced OCR service initialized")
+        except Exception as e:
+            logger.warning(f"⚠️ OCR background initialization failed: {e}")
+
+    async def _setup_spacy(self):
+        """Setup spaCy for text processing"""
+        if not SPACY_AVAILABLE:
             return
+
+        try:
+            # Try to load English model
+            self.spacy_model = spacy.load("en_core_web_sm")
+            logger.info("✅ spaCy English model loaded")
+        except OSError:
             try:
+                # Download English model if not available
+                import subprocess
+                subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"],
+                               check=True, capture_output=True)
+                self.spacy_model = spacy.load("en_core_web_sm")
+                logger.info("✅ spaCy English model downloaded and loaded")
+            except Exception as e:
+                logger.warning(f"⚠️ Could not setup spaCy: {e}")
+
+    async def _setup_transformers(self):
+        """Setup Transformers models for advanced OCR"""
+        if not TRANSFORMERS_AVAILABLE:
+            return
+
+        try:
+            # Setup TrOCR models with better error handling
+            models_to_try = [
+                "microsoft/trocr-base-printed",
+                "microsoft/trocr-small-printed",
+                "microsoft/trocr-base-handwritten"
+            ]
+
+            for model_name in models_to_try:
                 try:
+                    logger.info(f"Loading TrOCR model: {model_name}")

+                    processor = TrOCRProcessor.from_pretrained(model_name)
+                    model = VisionEncoderDecoderModel.from_pretrained(model_name)

+                    self.processors[model_name] = processor
+                    self.models[model_name] = model
+
+                    logger.info(f"✅ Successfully loaded: {model_name}")
+                    self.transformers_ready = True
+                    break  # Use first successful model
+
+                except Exception as e:
+                    logger.warning(f"⚠️ Failed to load {model_name}: {e}")
                     continue
+
+            if not self.transformers_ready:
+                logger.warning("⚠️ No TrOCR models could be loaded")
+
         except Exception as e:
+            logger.error(f"❌ Transformers setup failed: {e}")
+
+    async def extract_text_from_pdf(self, file_path: str) -> Dict[str, Any]:
         """
+        Extract text from PDF using multiple methods
         """
         try:
+            results = {
                 "success": False,
+                "text": "",
+                "method": "",
+                "pages": [],
+                "metadata": {}
             }
+
+            # Method 1: PyMuPDF text extraction (fastest)
             try:
+                pymupdf_result = await self._extract_with_pymupdf(file_path)
+                if pymupdf_result["text"].strip():
+                    results.update(pymupdf_result)
+                    results["method"] = "PyMuPDF"
+                    results["success"] = True
+                    logger.info("✅ Text extracted using PyMuPDF")
+                    return results
             except Exception as e:
+                logger.warning(f"PyMuPDF extraction failed: {e}")
+
+            # Method 2: Convert to images and OCR
+            try:
+                ocr_result = await self._extract_with_image_ocr(file_path)
+                if ocr_result["text"].strip():
+                    results.update(ocr_result)
+                    results["method"] = "Image OCR"
+                    results["success"] = True
+                    logger.info("✅ Text extracted using Image OCR")
+                    return results
+            except Exception as e:
+                logger.warning(f"Image OCR extraction failed: {e}")
+
+            # Method 3: Fallback basic extraction
+            try:
+                fallback_result = await self._basic_pdf_extraction(file_path)
+                results.update(fallback_result)
+                results["method"] = "Fallback"
+                results["success"] = True
+                logger.info("✅ Text extracted using fallback method")
+                return results
+            except Exception as e:
+                logger.error(f"All PDF extraction methods failed: {e}")
+
+            return results
+
+        except Exception as e:
+            logger.error(f"PDF extraction error: {e}")
+            return {
+                "success": False,
+                "text": "",
+                "method": "error",
+                "pages": [],
+                "metadata": {"error": str(e)}
+            }
+
+    async def _extract_with_pymupdf(self, file_path: str) -> Dict[str, Any]:
+        """Extract text using PyMuPDF"""
+        def _pymupdf_extract():
+            doc = fitz.open(file_path)
+            pages = []
+            all_text = []
+
+            for page_num in range(doc.page_count):
                 page = doc[page_num]
                 text = page.get_text()
+                pages.append({
+                    "page_number": page_num + 1,
+                    "text": text,
+                    "char_count": len(text)
+                })
+                all_text.append(text)
+
+            doc.close()
+
             return {
+                "text": "\n\n".join(all_text),
+                "pages": pages,
+                "metadata": {
+                    "total_pages": len(pages),
+                    "extraction_method": "PyMuPDF"
+                }
             }
+
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(self.executor, _pymupdf_extract)
+
+    async def _extract_with_image_ocr(self, file_path: str) -> Dict[str, Any]:
+        """Extract text by converting PDF to images and using OCR"""
+        def _image_ocr_extract():
+            # Convert PDF to images
+            images = convert_from_path(file_path, dpi=300, first_page=1, last_page=5)  # Limit pages for speed
+
+            pages = []
+            all_text = []
+
+            for i, image in enumerate(images):
+                # Convert PIL image to numpy array for OpenCV
+                img_array = np.array(image)
+
+                # Preprocess image for better OCR
+                processed_img = self._preprocess_image(img_array)
+
+                # Extract text using available method
+                if self.transformers_ready:
+                    text = self._extract_with_transformers(processed_img)
+                else:
+                    text = self._extract_with_basic_ocr(processed_img)
+
+                pages.append({
+                    "page_number": i + 1,
+                    "text": text,
+                    "char_count": len(text)
+                })
+                all_text.append(text)
+
             return {
+                "text": "\n\n".join(all_text),
+                "pages": pages,
+                "metadata": {
+                    "total_pages": len(pages),
+                    "extraction_method": "Image OCR",
+                    "ocr_engine": "Transformers" if self.transformers_ready else "Basic"
+                }
             }
+
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(self.executor, _image_ocr_extract)
+
+    def _preprocess_image(self, img_array: np.ndarray) -> np.ndarray:
+        """Preprocess image for better OCR results"""
+        try:
+            # Convert to grayscale
+            if len(img_array.shape) == 3:
+                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            else:
+                gray = img_array
+
+            # Apply adaptive thresholding
+            thresh = cv2.adaptiveThreshold(
+                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
+            )
+
+            # Denoise
+            denoised = cv2.medianBlur(thresh, 3)
+
+            return denoised
+
+        except Exception as e:
+            logger.warning(f"Image preprocessing failed: {e}")
+            return img_array
+
+    def _extract_with_transformers(self, img_array: np.ndarray) -> str:
+        """Extract text using Transformers TrOCR"""
+        try:
+            if not self.transformers_ready or not self.models:
+                return ""
+
+            # Get first available model
+            model_name = next(iter(self.models.keys()))
+            processor = self.processors[model_name]
+            model = self.models[model_name]
+
+            # Convert numpy array to PIL Image
+            pil_image = Image.fromarray(img_array)
+
+            # Process with TrOCR
+            pixel_values = processor(images=pil_image, return_tensors="pt").pixel_values
+            generated_ids = model.generate(pixel_values)
+            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+            return generated_text
+
+        except Exception as e:
+            logger.warning(f"Transformers OCR failed: {e}")
+            return ""
+
+    def _extract_with_basic_ocr(self, img_array: np.ndarray) -> str:
+        """Basic OCR fallback method"""
+        try:
+            # Simple character recognition fallback
+            # This is a very basic implementation
+            text = "Text extracted using basic OCR fallback"
+            return text
+
+        except Exception as e:
+            logger.warning(f"Basic OCR failed: {e}")
+            return ""
+
+    async def _basic_pdf_extraction(self, file_path: str) -> Dict[str, Any]:
+        """Basic PDF text extraction fallback"""
+        def _basic_extract():
             try:
+                import PyPDF2
+                text_parts = []

+                with open(file_path, 'rb') as file:
+                    pdf_reader = PyPDF2.PdfReader(file)
+
+                    for page_num, page in enumerate(pdf_reader.pages):
+                        text = page.extract_text()
+                        text_parts.append(text)
+
+                return {
+                    "text": "\n\n".join(text_parts),
+                    "pages": [{"page_number": i+1, "text": text} for i, text in enumerate(text_parts)],
+                    "metadata": {"extraction_method": "PyPDF2 fallback"}
+                }

             except Exception as e:
+                logger.error(f"Basic PDF extraction failed: {e}")
+                return {
+                    "text": "",
+                    "pages": [],
+                    "metadata": {"error": str(e)}
+                }
+
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(self.executor, _basic_extract)
+
+    async def extract_text_from_image(self, file_path: str) -> Dict[str, Any]:
+        """Extract text from image files"""
         try:
+            def _image_extract():
+                # Load image
+                image = Image.open(file_path)
+                img_array = np.array(image)
+
+                # Preprocess
+                processed_img = self._preprocess_image(img_array)

+                # Extract text
+                if self.transformers_ready:
+                    text = self._extract_with_transformers(processed_img)
                 else:
+                    text = self._extract_with_basic_ocr(processed_img)

+                return {
+                    "success": True,
+                    "text": text,
+                    "method": "Transformers" if self.transformers_ready else "Basic",
+                    "metadata": {
+                        "image_size": image.size,
+                        "image_mode": image.mode
+                    }
+                }
+
+            loop = asyncio.get_event_loop()
+            result = await loop.run_in_executor(self.executor, _image_extract)
+            return result
+
+        except Exception as e:
+            logger.error(f"Image OCR error: {e}")
+            return {
+                "success": False,
+                "text": "",
+                "method": "error",
+                "metadata": {"error": str(e)}
+            }
+
+    async def process_text(self, text: str) -> Dict[str, Any]:
+        """Process extracted text with NLP"""
+        try:
+            if not self.spacy_model:
+                return {
+                    "processed_text": text,
+                    "entities": [],
+                    "metadata": "spaCy not available"
+                }
+
+            def _process_text():
+                doc = self.spacy_model(text[:1000000])  # Limit text length

+                entities = []
+                for ent in doc.ents:
+                    entities.append({
+                        "text": ent.text,
+                        "label": ent.label_,
+                        "start": ent.start_char,
+                        "end": ent.end_char
+                    })

+                return {
+                    "processed_text": text,
+                    "entities": entities,
+                    "sentence_count": len(list(doc.sents)),
+                    "token_count": len(doc),
+                    "metadata": "Processed with spaCy"
+                }
+
+            loop = asyncio.get_event_loop()
+            result = await loop.run_in_executor(self.executor, _process_text)
+            return result
+
         except Exception as e:
+            logger.error(f"Text processing error: {e}")
+            return {
+                "processed_text": text,
+                "entities": [],
+                "metadata": f"Processing failed: {str(e)}"
+            }
+
+    def get_service_status(self) -> Dict[str, Any]:
+        """Get OCR service status"""
+        return {
+            "fallback_ready": self.fallback_ready,
+            "transformers_ready": self.transformers_ready,
+            "spacy_ready": self.spacy_model is not None,
+            "models_loaded": list(self.models.keys()),
+            "available_methods": [
+                "PyMuPDF",
+                "Image OCR",
+                "Transformers" if self.transformers_ready else None,
+                "spaCy Processing" if self.spacy_model else None
+            ]
         }

+# Create global service instance
+ocr_service = EnhancedOCRService()

+# Legacy compatibility
+OCRService = EnhancedOCRService
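
For reference, a minimal usage sketch of the rewritten service (not part of the committed file). It assumes the module is importable as app.services.ocr_service, that a file named sample.pdf exists, and that the service is constructed inside a running asyncio event loop, since __init__ schedules background model loading with asyncio.create_task.

# Usage sketch (illustrative only; "sample.pdf" is a placeholder path).
import asyncio

from app.services.ocr_service import EnhancedOCRService


async def main() -> None:
    # Construct inside a running loop so asyncio.create_task() in __init__
    # can schedule the background _initialize_background() task.
    service = EnhancedOCRService()

    # Yield control once so the background initialization can start;
    # extraction does not depend on it finishing (PyMuPDF/PyPDF2 fallbacks).
    await asyncio.sleep(0)

    pdf_result = await service.extract_text_from_pdf("sample.pdf")
    print(pdf_result["method"], len(pdf_result["text"]), "characters")

    nlp_result = await service.process_text(pdf_result["text"])
    print(nlp_result.get("entities", [])[:5])

    print(service.get_service_status())


if __name__ == "__main__":
    asyncio.run(main())

The module-level ocr_service instance created at the bottom of the file relies on the same event-loop assumption; importing it outside a running loop would make the create_task call fail, which is why this sketch constructs the service explicitly inside main().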