Yaz Hobooti committed on
Commit
7c19dd8
·
1 Parent(s): 4e0d8d0

chore: push current local state (diff guards)

Files changed (1)
  1. pdf_comparator_backup.py +1375 -0
pdf_comparator_backup.py ADDED
@@ -0,0 +1,1375 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio PDF Comparison Tool
4
+ Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
5
+ """
6
+
7
+ import os, sys, re, csv, json, io
8
+ from dataclasses import dataclass
9
+ from typing import List, Tuple, Optional, Iterable
10
+ import tempfile
11
+ import unicodedata
12
+
13
+ import numpy as np
14
+ from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
15
+ from pdf2image import convert_from_path
16
+ from skimage.measure import label, regionprops
17
+ from skimage.morphology import dilation, rectangle
18
+ import gradio as gr
19
+
20
+ # Alternative PDF processing
21
+ try:
22
+ import fitz # PyMuPDF
23
+ HAS_PYMUPDF = True
24
+ except Exception:
25
+ fitz = None
26
+ HAS_PYMUPDF = False
27
+
28
+ # Optional features
29
+ try:
30
+ import pytesseract
31
+ HAS_OCR = True
32
+ except Exception:
33
+ pytesseract = None
34
+ HAS_OCR = False
35
+
36
+ try:
37
+ from spellchecker import SpellChecker
38
+ HAS_SPELLCHECK = True
39
+ except Exception:
40
+ SpellChecker = None
41
+ HAS_SPELLCHECK = False
42
+
43
+ try:
44
+ import regex as re
45
+ HAS_REGEX = True
46
+ except Exception:
47
+ import re
48
+ HAS_REGEX = False
49
+
50
+ try:
51
+ from pyzbar.pyzbar import decode as zbar_decode
52
+ HAS_BARCODE = True
53
+ except Exception:
54
+ zbar_decode = None
55
+ HAS_BARCODE = False
56
+
57
+ # -------------------- Core Data --------------------
58
+ @dataclass
59
+ class Box:
60
+ y1: int; x1: int; y2: int; x2: int; area: int
61
+
62
+ # ---- spell/tokenization helpers & caches ----
63
+ if HAS_REGEX:
64
+ # Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
65
+ _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
66
+ else:
67
+ # Fallback regex for basic ASCII
68
+ _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
69
+
70
+ if HAS_SPELLCHECK:
71
+ # Initialize English spell checker with comprehensive dictionary
72
+ _SPELL_EN = SpellChecker(language="en")
73
+
74
+ # Try to initialize French spell checker with fallback
75
+ _SPELL_FR = None
76
+ try:
77
+ _SPELL_FR = SpellChecker(language="fr")
78
+ except Exception:
79
+ # If French dictionary fails, try alternative approach
80
+ try:
81
+ _SPELL_FR = SpellChecker()
82
+ # Load some basic French words manually if needed
83
+ except Exception:
84
+ _SPELL_FR = None
85
+ print("Warning: French spell checker not available")
86
+ else:
87
+ _SPELL_EN = None
88
+ _SPELL_FR = None
89
+
90
+ _DOMAIN_ALLOWLIST = {
91
+ # Company/Brand names
92
+ "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
93
+ "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
94
+
95
+ # Technical terms
96
+ "CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
97
+ "Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
98
+ "Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",
99
+
100
+ # Common abbreviations
101
+ "Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
102
+ "USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",
103
+
104
+ # French words (common in Canadian context)
105
+ "Québec", "Montréal", "Toronto", "Vancouver", "Ottawa", "Calgary",
+ "français", "française", "anglais", "anglaise", "bilingue",
107
+
108
+ # Common business terms
109
+ "Marketing", "Sales", "Customer", "Service", "Quality", "Control",
110
+ "Management", "Administration", "Production", "Manufacturing",
111
+ "Distribution", "Logistics", "Supply", "Chain", "Inventory",
112
+
113
+ # Common words that might be flagged
114
+ "Email", "Website", "Online", "Internet", "Software", "Hardware",
115
+ "Database", "System", "Network", "Server", "Client", "User",
116
+ "Password", "Login", "Logout", "Account", "Profile", "Settings",
117
+ "Configuration", "Installation", "Maintenance", "Support",
118
+
119
+ # Numbers and measurements
120
+ "mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
121
+ "x", "by", "times", "multiply", "divide", "plus", "minus",
122
+
123
+ # British spellings that are correct in context
+ "colour", "favour", "honour", "behaviour", "neighbour",
+ "centre", "theatre", "metre", "litre",
127
+
128
+ # Pharmaceutical terms
129
+ "glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate",
130
+ "ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
131
+ "stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
132
+ "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
133
+ "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
134
+ "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
135
+ "sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
136
+ "copper", "manganese", "selenium", "chromium", "molybdenum",
137
+ "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
138
+ "biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
139
+ "phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
140
+ "creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
141
+ "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
142
+ "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
143
+ "asparagine", "glutamic", "aspartic", "alanine", "glycine",
144
+ "polysorbate", "monostearate", "distearate", "tristearate",
145
+ "polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
146
+ "cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
147
+ "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
148
+ "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
149
+ "stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
150
+ "eicosapentaenoic", "arachidonic", "linolenic", "gamma", "linolenic",
151
+ "conjugated", "linoleic", "acid", "ester", "amide", "anhydride",
152
+ "hydrochloride", "hydrobromide", "hydroiodide", "nitrate", "sulfate",
153
+ "phosphate", "acetate", "citrate", "tartrate", "succinate", "fumarate",
154
+ "malate", "lactate", "gluconate", "ascorbate", "tocopheryl", "acetate",
155
+ "palmitate", "stearate", "oleate", "linoleate", "arachidonate"
156
+ }
157
+ _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
158
+
159
+ if _SPELL_EN:
160
+ _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
161
+ if _SPELL_FR:
162
+ _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
163
+
164
+ def _normalize_text(s: str) -> str:
165
+ """Normalize text for better word extraction"""
166
+ if not s:
167
+ return ""
168
+
169
+ # Unicode normalization
170
+ s = unicodedata.normalize("NFC", s)
171
+
172
+ # Fix common apostrophe issues: map curly quotes (U+2019/U+2018) to a straight apostrophe
+ s = s.replace("\u2019", "'").replace("\u2018", "'")
174
+
175
+ # Normalize whitespace - replace multiple spaces with single space
176
+ s = re.sub(r'\s+', ' ', s)
177
+
178
+ # Remove leading/trailing whitespace
179
+ s = s.strip()
180
+
181
+ return s
182
+
183
+ def _extract_tokens(raw: str):
184
+ """Extract word tokens with improved filtering"""
185
+ s = _normalize_text(raw or "")
186
+ tokens = _WORD_RE.findall(s)
187
+
188
+ # Filter out tokens that are too short or don't look like words
189
+ filtered_tokens = []
190
+ for token in tokens:
191
+ if len(token) >= 2 and _is_likely_word(token):
192
+ filtered_tokens.append(token)
193
+
194
+ return filtered_tokens
195
+
196
+ def _looks_like_acronym(tok: str) -> bool:
197
+ """Check if token looks like a valid acronym"""
198
+ return tok.isupper() and 2 <= len(tok) <= 6
199
+
200
+ def _has_digits(tok: str) -> bool:
201
+ """Check if token contains digits"""
202
+ return any(ch.isdigit() for ch in tok)
203
+
204
+ def _is_mostly_numbers(tok: str) -> bool:
205
+ """Check if token is mostly numbers (should be ignored)"""
206
+ if not tok:
207
+ return False
208
+
209
+ # Count digits and letters
210
+ digit_count = sum(1 for ch in tok if ch.isdigit())
211
+ letter_count = sum(1 for ch in tok if ch.isalpha())
212
+ total_chars = len(tok)
213
+
214
+ # If more than 70% digits, consider it mostly numbers
215
+ if digit_count / total_chars > 0.7:
216
+ return True
217
+
218
+ # If it's a pure number (all digits), ignore it
219
+ if digit_count == total_chars:
220
+ return True
221
+
222
+ # If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
223
+ if total_chars >= 2 and digit_count >= 1:
224
+ suffix = tok[-2:].lower()
225
+ if suffix in ['st', 'nd', 'rd', 'th']:
226
+ return True
227
+
228
+ # If it's a decimal number (contains digits and decimal point)
229
+ if '.' in tok and digit_count > 0:
230
+ return True
231
+
232
+ # If it's a percentage (ends with %)
233
+ if tok.endswith('%') and digit_count > 0:
234
+ return True
235
+
236
+ return False
237
+
238
+ def _is_likely_word(tok: str) -> bool:
239
+ """Check if token looks like a real word (not random characters)"""
240
+ if len(tok) < 2:
241
+ return False
242
+
243
+ # Filter out tokens that are mostly non-letter characters
244
+ letter_count = sum(1 for c in tok if c.isalpha())
245
+ if letter_count < len(tok) * 0.6: # At least 60% letters
246
+ return False
247
+
248
+ # Filter out tokens with too many consecutive consonants/vowels
249
+ vowels = set('aeiouAEIOU')
250
+ consonants = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ')
251
+
252
+ # Check for excessive consonant clusters (like "qwerty" or "zxcvb")
+ if len(tok) >= 4:
+ consonant_clusters = 0
+ vowel_clusters = 0
+ for i in range(len(tok) - 2):
+ window = tok[i:i+3].lower()
+ # Count a 3-char window only when every character in it is a consonant (or vowel)
+ if all(c in consonants for c in window):
+ consonant_clusters += 1
+ if all(c in vowels for c in window):
+ vowel_clusters += 1
+
+ # If a large share of the 3-char windows are pure consonant runs, it is likely not a word
+ if consonant_clusters > len(tok) * 0.3:
+ return False
265
+
266
+ # Filter out tokens that look like random keyboard patterns
267
+ keyboard_patterns = [
268
+ 'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
269
+ 'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
270
+ '123456', '234567', '345678', '456789', '567890'
271
+ ]
272
+
273
+ tok_lower = tok.lower()
274
+ for pattern in keyboard_patterns:
275
+ if pattern in tok_lower or tok_lower in pattern:
276
+ return False
277
+
278
+ return True
279
+
280
+ def _is_known_word(tok: str) -> bool:
281
+ """Check if token is a known word with comprehensive filtering"""
282
+ t = tok.lower()
283
+
284
+ # First check if it looks like a real word
285
+ if not _is_likely_word(tok):
286
+ return True # Don't flag non-words as misspellings
287
+
288
+ # Ignore numbers and mostly numeric tokens
289
+ if _is_mostly_numbers(tok):
290
+ return True # Don't flag numbers as misspellings
291
+
292
+ # Check domain allowlist, acronyms, and words with digits
293
+ if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
294
+ return True
295
+
296
+ # Check hyphenated words - if any part is known, consider the whole word known
297
+ if '-' in tok:
298
+ parts = tok.split('-')
299
+ if all(_is_known_word(part) for part in parts):
300
+ return True
301
+
302
+ # Check against English spell checker
303
+ if _SPELL_EN:
304
+ try:
305
+ # Check if word is known in English dictionary
306
+ if not _SPELL_EN.unknown([t]):
307
+ return True
308
+ except Exception:
309
+ pass
310
+
311
+ # Check against French spell checker
312
+ if _SPELL_FR:
313
+ try:
314
+ # Check if word is known in French dictionary
315
+ if not _SPELL_FR.unknown([t]):
316
+ return True
317
+ except Exception:
318
+ pass
319
+
320
+ # Additional checks for common patterns
321
+ # Check for common suffixes/prefixes that might not be in dictionaries
322
+ common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
323
+ common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']
324
+
325
+ # Check if word with common suffix/prefix is known
326
+ for suffix in common_suffixes:
327
+ if t.endswith(suffix) and len(t) > len(suffix) + 2:
328
+ base_word = t[:-len(suffix)]
329
+ if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
330
+ return True
331
+
332
+ for prefix in common_prefixes:
333
+ if t.startswith(prefix) and len(t) > len(prefix) + 2:
334
+ base_word = t[len(prefix):]
335
+ if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
336
+ return True
337
+
338
+ # Check for plural forms (simple 's' ending)
339
+ if t.endswith('s') and len(t) > 3:
340
+ singular = t[:-1]
341
+ if _SPELL_EN and not _SPELL_EN.unknown([singular]):
342
+ return True
343
+
344
+ return False
345
+
346
+ # (optional) keep a compatibility shim so any other code calling normalize_token() won't break
347
+ def normalize_token(token: str) -> str:
348
+ toks = _extract_tokens(token)
349
+ return (toks[0].lower() if toks else "")
350
+
351
+ # -------------------- Helpers ----------------------
352
+ def _is_pdf(path: str) -> bool:
353
+ return os.path.splitext(path.lower())[1] == ".pdf"
354
+
355
+ def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
356
+ """
357
+ Check if a box is in the excluded bottom area (115mm from bottom).
358
+ Converts mm to pixels using DPI.
359
+ """
360
+ # Convert mm to pixels: 1 inch = 25.4mm, so 1mm = dpi/25.4 pixels
361
+ excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)
362
+
363
+ # Calculate the top boundary of the excluded area
364
+ excluded_top = image_height - excluded_height_pixels
365
+
366
+ # Check if the box intersects with the excluded area
367
+ return box.y1 >= excluded_top
368
+
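+ # Worked example (illustrative): at the default 400 DPI, 115 mm converts to
+ # int(115 * 400 / 25.4) = 1811 px, so on a 4000 px tall page any box whose top
+ # edge (y1) sits at or below y = 4000 - 1811 = 2189 falls in the excluded area.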
369
+ def _contains_validation_text(text: str) -> bool:
370
+ """Check if text contains the validation text '50 Carroll'"""
371
+ return "50 Carroll" in text
372
+
373
+ def load_pdf_pages(path: str, dpi: int = 600, max_pages: int = 15) -> List[Image.Image]:
374
+ """Load PDF pages as images with fallback options"""
375
+ if not _is_pdf(path):
376
+ return [Image.open(path).convert("RGB")]
377
+
378
+ # Try pdf2image first
379
+ poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
380
+
381
+ for poppler_path in poppler_paths:
382
+ try:
383
+ if poppler_path:
384
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
385
+ else:
386
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
387
+
388
+ if imgs:
389
+ return [img.convert("RGB") for img in imgs]
390
+ except Exception:
391
+ if poppler_path is None: # All pdf2image attempts failed
392
+ break
393
+ continue # Try next path
394
+
395
+ # Fallback to PyMuPDF
396
+ if HAS_PYMUPDF:
397
+ try:
398
+ doc = fitz.open(path)
399
+ pages = []
400
+ for page_num in range(min(len(doc), max_pages)):
401
+ page = doc[page_num]
402
+ mat = fitz.Matrix(dpi/72, dpi/72)
403
+ pix = page.get_pixmap(matrix=mat)
404
+ img_data = pix.tobytes("ppm")
405
+ img = Image.open(io.BytesIO(img_data))
406
+ pages.append(img.convert("RGB"))
407
+ doc.close()
408
+ return pages
409
+ except Exception as e:
410
+ raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")
411
+
412
+ raise ValueError("Failed to convert PDF to image. No working method available.")
413
+
414
+ def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
415
+ """Combine multiple pages into a single vertical image"""
416
+ if not pages:
417
+ raise ValueError("No pages to combine")
418
+ if len(pages) == 1:
419
+ return pages[0]
420
+
421
+ # Find the maximum width
422
+ max_width = max(page.width for page in pages)
423
+
424
+ # Calculate total height
425
+ total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)
426
+
427
+ # Create combined image
428
+ combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
429
+
430
+ y_offset = 0
431
+ for page in pages:
432
+ # Center the page horizontally if it's narrower than max_width
433
+ x_offset = (max_width - page.width) // 2
434
+ combined.paste(page, (x_offset, y_offset))
435
+ y_offset += page.height + spacing
436
+
437
+ return combined
438
+
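+ # Usage sketch (illustrative, assuming a local file named "a.pdf" exists):
+ #   pages = load_pdf_pages("a.pdf", dpi=600, max_pages=15)
+ #   sheet = combine_pages_vertically(pages)
+ # produces one tall RGB canvas with pages centered horizontally and a 20 px gap,
+ # which is the layout the comparison functions below operate on.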
439
+ def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
440
+ if a.size == b.size:
441
+ return a, b
442
+ w, h = min(a.width, b.width), min(a.height, b.height)
443
+ return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))
444
+
445
+ def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
446
+ return ImageChops.difference(a, b)
447
+
448
+ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
449
+ arr = np.asarray(diff_img).astype(np.uint16)
450
+ gray = arr.max(axis=2).astype(np.uint8)
451
+ mask = (gray >= threshold).astype(np.uint8)
452
+ mask = dilation(mask, rectangle(3, 3))
453
+ labeled = label(mask, connectivity=2)
454
+ out: List[Box] = []
455
+ img_height = diff_img.height
456
+
457
+ for p in regionprops(labeled):
458
+ if p.area < min_area:
459
+ continue
460
+ minr, minc, maxr, maxc = p.bbox
461
+ box = Box(minr, minc, maxr, maxc, int(p.area))
462
+
463
+ # Skip boxes in the excluded bottom area
464
+ if _is_in_excluded_bottom_area(box, img_height):
465
+ continue
466
+
467
+ out.append(box)
468
+ return out
469
+
470
+ def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
471
+ width: int = 3) -> Image.Image:
472
+ out = img.copy(); d = ImageDraw.Draw(out)
473
+ # red (diff)
474
+ for b in red_boxes:
475
+ for w in range(width):
476
+ d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(255,0,0))
477
+ # cyan (misspellings)
478
+ for b in cyan_boxes:
479
+ for w in range(width):
480
+ d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(0,255,255))
481
+ # green (barcodes)
482
+ if green_boxes:
483
+ for b in green_boxes:
484
+ for w in range(width):
485
+ d.rectangle([b.x1-w,b.y1-w,b.x2+w,b.y2+w], outline=(0,255,0))
486
+ return out
487
+
488
+ def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
489
+ A = np.asarray(a).copy(); B = np.asarray(b)
490
+ mask = np.any(A != B, axis=2)
491
+ A[mask] = [255, 0, 0]
492
+ return Image.fromarray(A)
493
+
494
+ # -------------------- OCR + Spellcheck -------------
495
+ # The guarded imports and the spell/tokenization helpers defined above
+ # (_WORD_RE, _SPELL_EN, _SPELL_FR, _normalize_text, _extract_tokens,
+ # _looks_like_acronym, _has_digits, normalize_token) are reused here;
+ # re-importing pytesseract, regex and SpellChecker unconditionally would
+ # crash when those optional dependencies are missing.
539
+
540
+ def _get_available_tesseract_langs():
541
+ """Get available Tesseract languages"""
542
+ try:
543
+ langs = pytesseract.get_languages()
544
+ if 'eng' in langs and 'fra' in langs:
545
+ return "eng+fra"
546
+ elif 'eng' in langs:
547
+ return "eng"
548
+ elif langs:
549
+ return langs[0]
550
+ else:
551
+ return "eng"
552
+ except Exception:
553
+ return "eng"
554
+
555
+ def prepare_for_ocr(img: Image.Image) -> Image.Image:
556
+ """Prepare image for better OCR results"""
557
+ from PIL import ImageOps, ImageFilter
558
+ g = img.convert("L")
559
+ g = ImageOps.autocontrast(g)
560
+ g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
561
+ return g
562
+
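+ # Note: the grayscale + autocontrast + unsharp-mask pass above is a light cleanup
+ # heuristic for Tesseract; it boosts edge contrast without resampling, which tends
+ # to help small label text, but it is not a substitute for rendering at higher DPI.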
563
+ def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
564
+ """Extract text directly from PDF using PyMuPDF"""
565
+ if not HAS_PYMUPDF:
566
+ return []
567
+
568
+ try:
569
+ doc = fitz.open(path)
570
+ texts = []
571
+ for page_num in range(min(len(doc), max_pages)):
572
+ page = doc[page_num]
573
+ text = page.get_text()
574
+ texts.append(text)
575
+ doc.close()
576
+ return texts
577
+ except Exception:
578
+ return []
579
+
580
+ def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
581
+ """Convert PDF coordinates to image coordinates"""
582
+ pdf_width, pdf_height = pdf_page_size
583
+ img_width, img_height = image_size
584
+
585
+ # Scale factors
586
+ scale_x = img_width / pdf_width
587
+ scale_y = img_height / pdf_height
588
+
589
+ # Convert PDF coordinates to image coordinates
590
+ x1 = int(pdf_bbox[0] * scale_x)
591
+ y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
592
+ x2 = int(pdf_bbox[2] * scale_x)
593
+ y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)
594
+
595
+ return x1, y1, x2, y2
596
+
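+ # Example (illustrative): a 612x792 pt page rendered to a 5100x6600 px image has
+ # scale_x = scale_y ~= 8.33, so the PDF bbox (72, 90, 200, 110) on page 0 maps to
+ # roughly (600, 750, 1666, 916) in image coordinates (page 0 has no vertical offset).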
597
+ def find_misspell_boxes_from_text(
598
+ pdf_path: str,
599
+ *,
600
+ extra_allow: Optional[Iterable[str]] = None,
601
+ max_pages: int = 5,
602
+ image_size: Optional[Tuple[int, int]] = None
603
+ ) -> List[Box]:
604
+ """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
605
+ if not (HAS_SPELLCHECK and HAS_PYMUPDF):
606
+ return []
607
+
608
+ # Load extra allowed words
609
+ if extra_allow and _SPELL_EN:
610
+ _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
611
+ if extra_allow and _SPELL_FR:
612
+ _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
613
+
614
+ boxes: List[Box] = []
615
+
616
+ try:
617
+ doc = fitz.open(pdf_path)
618
+
619
+ for page_num in range(min(len(doc), max_pages)):
620
+ page = doc[page_num]
621
+
622
+ # Get text with position information
623
+ text_dict = page.get_text("dict")
624
+
625
+ # Process each block of text
626
+ for block in text_dict.get("blocks", []):
627
+ if "lines" not in block:
628
+ continue
629
+
630
+ for line in block["lines"]:
631
+ for span in line["spans"]:
632
+ text = span.get("text", "").strip()
633
+ if not text:
634
+ continue
635
+
636
+ # Extract tokens and check for misspellings
637
+ tokens = _extract_tokens(text)
638
+ has_misspelling = False
639
+
640
+ for token in tokens:
641
+ if len(token) >= 2 and not _is_known_word(token):
642
+ has_misspelling = True
643
+ break
644
+
645
+ # If this span has misspellings, create a box for it
646
+ if has_misspelling:
647
+ bbox = span["bbox"] # [x0, y0, x1, y1]
648
+
649
+ # Get page dimensions for coordinate conversion
650
+ page_rect = page.rect
651
+ pdf_width = page_rect.width
652
+ pdf_height = page_rect.height
653
+
654
+ # Calculate coordinates
655
+ if image_size:
656
+ img_width, img_height = image_size
657
+ # Convert PDF coordinates to image coordinates
658
+ scale_x = img_width / pdf_width
659
+ scale_y = img_height / pdf_height
660
+ x1 = int(bbox[0] * scale_x)
661
+ y1 = int(bbox[1] * scale_y) + (page_num * img_height)
662
+ x2 = int(bbox[2] * scale_x)
663
+ y2 = int(bbox[3] * scale_y) + (page_num * img_height)
664
+ else:
665
+ x1 = int(bbox[0])
666
+ y1 = int(bbox[1]) + (page_num * 1000)
667
+ x2 = int(bbox[2])
668
+ y2 = int(bbox[3]) + (page_num * 1000)
669
+
670
+ # Create box
671
+ box = Box(y1=y1, x1=x1, y2=y2, x2=x2, area=(x2 - x1) * (y2 - y1))
672
+
673
+ # Skip boxes in excluded bottom area unless they contain validation text
674
+ if image_size:
675
+ img_height = image_size[1]
676
+ if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
677
+ continue
678
+
679
+ boxes.append(box)
680
+
681
+ doc.close()
682
+
683
+ except Exception:
684
+ # Fallback to simple text extraction if coordinate mapping fails
685
+ page_texts = extract_pdf_text(pdf_path, max_pages)
686
+ for page_num, text in enumerate(page_texts):
687
+ if not text.strip():
688
+ continue
689
+
690
+ tokens = _extract_tokens(text)
691
+ misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]
692
+
693
+ if misspelled_words:
694
+ # Create a placeholder box for the page
695
+ boxes.append(Box(
696
+ y1=page_num * 1000,
697
+ x1=0,
698
+ y2=(page_num + 1) * 1000,
699
+ x2=800,
700
+ area=800 * 1000
701
+ ))
702
+
703
+ return boxes
704
+
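+ # Usage sketch (illustrative, assuming "proof.pdf" exists and the combined render
+ # measures 5100x13200 px):
+ #   boxes = find_misspell_boxes_from_text("proof.pdf", image_size=(5100, 13200))
+ # returns one Box per text span containing a suspect token, mapped into the
+ # combined-image space so draw_boxes_multi() can outline it in cyan.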
705
+ def find_misspell_boxes(
706
+ img: Image.Image,
707
+ *,
708
+ min_conf: int = 60,
709
+ lang: Optional[str] = None,
710
+ extra_allow: Optional[Iterable[str]] = None,
711
+ dpi: int = 300,
712
+ psm: int = 6,
713
+ oem: int = 3
714
+ ) -> List[Box]:
715
+ """Legacy OCR-based spell checking (kept for fallback)"""
716
+ if not (HAS_OCR and HAS_SPELLCHECK):
717
+ return []
718
+
719
+ # Auto-detect language if not provided
720
+ if lang is None:
721
+ try:
722
+ avail = set(pytesseract.get_languages(config="") or [])
723
+ except Exception:
724
+ avail = {"eng"}
725
+ lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"
726
+
727
+ # OPTIONAL: light upscale if the image is small (heuristic)
728
+ # target width ~ 2500-3000 px for letter-sized pages
729
+ if img.width < 1600:
730
+ scale = 2
731
+ img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
732
+
733
+ # Prepare image for better OCR
734
+ img = prepare_for_ocr(img)
735
+
736
+ try:
737
+ if extra_allow and _SPELL_EN:
738
+ _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
739
+ if extra_allow and _SPELL_FR:
740
+ _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
741
+
742
+ # Build a config that sets an explicit DPI and keeps spaces
743
+ config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
744
+
745
+ data = pytesseract.image_to_data(
746
+ img,
747
+ lang=lang,
748
+ config=config,
749
+ output_type=pytesseract.Output.DICT,
750
+ )
751
+ except Exception:
752
+ return []
753
+
754
+ n = len(data.get("text", [])) or 0
755
+ boxes: List[Box] = []
756
+
757
+ for i in range(n):
758
+ raw = data["text"][i]
759
+ if not raw:
760
+ continue
761
+
762
+ # confidence filter
763
+ conf_str = data.get("conf", ["-1"])[i]
764
+ try:
765
+ conf = int(float(conf_str))
766
+ except Exception:
767
+ conf = -1
768
+ if conf < min_conf:
769
+ continue
770
+
771
+ tokens = _extract_tokens(raw)
772
+ if not tokens:
773
+ continue
774
+
775
+ # flag the box if ANY token in it looks misspelled
776
+ if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
777
+ continue
778
+
779
+ left = data.get("left", [0])[i]
780
+ top = data.get("top", [0])[i]
781
+ width = data.get("width", [0])[i]
782
+ height = data.get("height",[0])[i]
783
+ if width <= 0 or height <= 0:
784
+ continue
785
+
786
+ # NOTE: adjust to match your Box constructor if needed
787
+ b = Box(top, left, top + height, left + width, width * height)
788
+ # Exclude bottom 115mm unless the text contains the validation phrase
789
+ if _is_in_excluded_bottom_area(b, img.height) and not _contains_validation_text(raw):
790
+ continue
791
+ boxes.append(b)
792
+
793
+ return boxes
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+ # deps: pip install zxing-cpp pyzbar pylibdmtx PyMuPDF pillow opencv-python-headless regex
803
+ # system: macOS -> brew install zbar poppler ; Ubuntu -> sudo apt-get install libzbar0 poppler-utils
804
+
805
+ # io, re, numpy and the guarded PIL/fitz imports at the top of this file are
+ # reused here; only the names not already imported above are added.
+ from typing import Dict, Any
+ from PIL import ImageOps
811
+
812
+ # Optional backends
813
+ try:
814
+ import zxingcpp; HAS_ZXING=True
815
+ except Exception: HAS_ZXING=False
816
+ try:
817
+ from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol; HAS_ZBAR=True
818
+ except Exception: HAS_ZBAR=False; ZBarSymbol=None
819
+ try:
820
+ from pylibdmtx.pylibdmtx import decode as dmtx_decode; HAS_DMTX=True
821
+ except Exception: HAS_DMTX=False
822
+ try:
823
+ import cv2; HAS_CV2=True
824
+ except Exception: HAS_CV2=False
825
+
826
+ # your Box(y1,x1,y2,x2,area) assumed to exist
827
+
828
+ def _binarize(img: Image.Image) -> Image.Image:
829
+ g = ImageOps.grayscale(img)
830
+ g = ImageOps.autocontrast(g)
831
+ return g.point(lambda x: 255 if x > 140 else 0, mode="1").convert("L")
832
+
833
+ def _ean_checksum_ok(d: str) -> bool:
834
+ if not d.isdigit(): return False
835
+ n=len(d); nums=list(map(int,d))
836
+ if n==8:
837
+ return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(7))%10))%10==nums[7]
838
+ if n==12:
839
+ return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(11))%10))%10==nums[11]
840
+ if n==13:
841
+ return (10 - (sum(nums[i]*(1 if i%2==0 else 3) for i in range(12))%10))%10==nums[12]
842
+ return True
843
+
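+ # Worked example (illustrative): for EAN-13 "4006381333931" the weighted sum of the
+ # first 12 digits is 1*(4+0+3+1+3+9) + 3*(0+6+8+3+3+3) = 20 + 69 = 89, so the
+ # expected check digit is (10 - 89 % 10) % 10 = 1, which matches the final digit.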
844
+ def _normalize_upc_ean(sym: str, text: str):
845
+ digits = re.sub(r"\D","",text or "")
846
+ s = (sym or "").upper()
847
+ if s in ("EAN13","EAN-13") and len(digits)==13 and digits.startswith("0"):
848
+ return "UPCA", digits[1:]
849
+ return s, (digits if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else text or "")
850
+
851
+ def _validate(sym: str, payload: str) -> bool:
852
+ s, norm = _normalize_upc_ean(sym, payload)
853
+ return _ean_checksum_ok(norm) if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else bool(payload)
854
+
855
+ def _decode_zxing(pil: Image.Image) -> List[Dict[str,Any]]:
856
+ if not HAS_ZXING: return []
857
+ arr = np.asarray(pil.convert("L"))
858
+ out=[]
859
+ for r in zxingcpp.read_barcodes(arr): # try_harder is default True in recent builds; otherwise supply options
860
+ # zxingcpp.Position may be iterable (sequence of points) or an object with corner attributes
861
+ x1=y1=x2=y2=w=h=0
862
+ pos = getattr(r, "position", None)
863
+ pts: List[Any] = []
864
+ if pos is not None:
865
+ try:
866
+ pts = list(pos) # works if iterable
867
+ except TypeError:
868
+ # Fall back to known corner attribute names across versions
869
+ corner_names = (
870
+ "top_left", "topLeft",
871
+ "top_right", "topRight",
872
+ "bottom_left", "bottomLeft",
873
+ "bottom_right", "bottomRight",
874
+ "point1", "point2", "point3", "point4",
875
+ )
876
+ seen=set()
877
+ for name in corner_names:
878
+ if hasattr(pos, name):
879
+ p = getattr(pos, name)
880
+ # avoid duplicates
881
+ if id(p) not in seen and hasattr(p, "x") and hasattr(p, "y"):
882
+ pts.append(p)
883
+ seen.add(id(p))
884
+ if pts:
885
+ xs=[int(getattr(p, "x", 0)) for p in pts]
886
+ ys=[int(getattr(p, "y", 0)) for p in pts]
887
+ x1,x2=min(xs),max(xs); y1,y2=min(ys),max(ys)
888
+ w,h=x2-x1,y2-y1
889
+ out.append({
890
+ "type": str(r.format),
891
+ "data": r.text or "",
892
+ "left": x1,
893
+ "top": y1,
894
+ "width": w,
895
+ "height": h,
896
+ })
897
+ return out
898
+
899
+ def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
900
+ if not HAS_ZBAR: return []
901
+ syms=[ZBarSymbol.QRCODE,ZBarSymbol.EAN13,ZBarSymbol.EAN8,ZBarSymbol.UPCA,ZBarSymbol.CODE128] if ZBarSymbol else None
902
+ res=zbar_decode(pil, symbols=syms) if syms else zbar_decode(pil)
903
+ return [{"type": d.type, "data": (d.data.decode("utf-8","ignore") if isinstance(d.data,(bytes,bytearray)) else str(d.data)),
904
+ "left": d.rect.left, "top": d.rect.top, "width": d.rect.width, "height": d.rect.height} for d in res]
905
+
906
+ def _decode_dmtx(pil: Image.Image) -> List[Dict[str,Any]]:
907
+ if not HAS_DMTX: return []
908
+ try:
909
+ res=dmtx_decode(ImageOps.grayscale(pil))
910
+ return [{"type":"DATAMATRIX","data": r.data.decode("utf-8","ignore"),
911
+ "left": r.rect.left, "top": r.rect.top, "width": r.rect.width, "height": r.rect.height} for r in res]
912
+ except Exception:
913
+ return []
914
+
915
+ def _decode_cv2_qr(pil: Image.Image) -> List[Dict[str,Any]]:
916
+ if not HAS_CV2: return []
917
+ try:
918
+ det=cv2.QRCodeDetector()
919
+ g=np.asarray(pil.convert("L"))
920
+ val, pts, _ = det.detectAndDecode(g)
921
+ if val:
922
+ if pts is not None and len(pts)>=1:
923
+ pts=pts.reshape(-1,2); xs,ys=pts[:,0],pts[:,1]
924
+ x1,x2=int(xs.min()),int(xs.max()); y1,y2=int(ys.min()),int(ys.max())
925
+ w,h=x2-x1,y2-y1
926
+ else:
927
+ x1=y1=w=h=0
928
+ return [{"type":"QRCODE","data":val,"left":x1,"top":y1,"width":w,"height":h}]
929
+ except Exception:
930
+ pass
931
+ return []
932
+
933
+ def _decode_variants(pil: Image.Image) -> List[Dict[str,Any]]:
934
+ variants=[pil, ImageOps.grayscale(pil), _binarize(pil)]
935
+ # upsample small images with NEAREST to keep bars crisp
936
+ w,h=pil.size
937
+ if max(w,h)<1600:
938
+ up=pil.resize((w*2,h*2), resample=Image.NEAREST)
939
+ variants += [up, _binarize(up)]
940
+ for v in variants:
941
+ # ZXing first (broad coverage), then ZBar, then DMTX, then cv2 QR
942
+ res = _decode_zxing(v)
943
+ if res: return res
944
+ res = _decode_zbar(v)
945
+ if res: return res
946
+ res = _decode_dmtx(v)
947
+ if res: return res
948
+ res = _decode_cv2_qr(v)
949
+ if res: return res
950
+ # try rotations
951
+ for angle in (90,180,270):
952
+ r=v.rotate(angle, expand=True)
953
+ res = _decode_zxing(r) or _decode_zbar(r) or _decode_dmtx(r) or _decode_cv2_qr(r)
954
+ if res: return res
955
+ return []
956
+
957
+ def _pix_to_pil(pix) -> Image.Image:
958
+ # convert PyMuPDF Pixmap to grayscale PIL without alpha (avoids blur)
959
+ if pix.alpha: pix = fitz.Pixmap(pix, 0)
960
+ try:
961
+ pix = fitz.Pixmap(fitz.csGRAY, pix)
962
+ except Exception:
963
+ pass
964
+ return Image.open(io.BytesIO(pix.tobytes("png")))
965
+
966
+ def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
967
+ """Return (boxes, infos) from both rendered pages and embedded images."""
968
+ boxes=[]; infos=[]
969
+ doc=fitz.open(pdf_path)
970
+ n=min(len(doc), max_pages)
971
+ for page_idx in range(n):
972
+ page=doc[page_idx]
973
+
974
+ # A) Embedded images (often crisp)
975
+ for ix,(xref,*_) in enumerate(page.get_images(full=True)):
976
+ try:
977
+ pix=fitz.Pixmap(doc, xref)
978
+ pil=_pix_to_pil(pix)
979
+ hits=_decode_variants(pil)
980
+ for r in hits:
981
+ b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
982
+ # Exclude barcodes in the bottom 115mm of the page image
983
+ if _is_in_excluded_bottom_area(b, pil.height):
984
+ continue
985
+ boxes.append(b)
986
+ sym, payload = r["type"], r["data"]
987
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
988
+ except Exception:
989
+ pass
990
+
991
+ # B) Render page raster at high DPI (grayscale)
992
+ for dpi in dpi_list:
993
+ scale=dpi/72.0
994
+ try:
995
+ pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), colorspace=fitz.csGRAY, alpha=False)
996
+ except TypeError:
997
+ pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), alpha=False)
998
+ pil=_pix_to_pil(pix)
999
+ hits=_decode_variants(pil)
1000
+ for r in hits:
1001
+ b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
1002
+ if _is_in_excluded_bottom_area(b, pil.height):
1003
+ continue
1004
+ boxes.append(b)
1005
+ sym, payload = r["type"], r["data"]
1006
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
1007
+ if any(i["page"]==page_idx+1 for i in infos):
1008
+ break # found something for this page → next page
1009
+ doc.close()
1010
+ return boxes, infos
1011
+
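+ # Usage sketch (illustrative, assuming a local file named "label.pdf" exists):
+ #   boxes, infos = scan_pdf_barcodes("label.pdf", dpi_list=(900, 1200), max_pages=5)
+ #   for info in infos:
+ #       print(info["page"], info["type"], info["data"], info["valid"])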
1012
+
1013
+
1014
+
1015
+ # -------------------- CMYK Panel -------------------
1016
+ def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
1017
+ return np.asarray(img.convert('CMYK')).astype(np.float32) # 0..255
1018
+
1019
+ def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
1020
+ y1,y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
1021
+ x1,x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
1022
+ if y2<=y1 or x2<=x1:
1023
+ return (0.0,0.0,0.0,0.0)
1024
+ region = cmyk_arr[y1:y2, x1:x2, :]
1025
+ mean_vals = region.reshape(-1, 4).mean(axis=0)
1026
+ return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)
1027
+
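+ # Note: Pillow's CMYK mode stores each channel as 0-255, so a raw channel mean of
+ # 128 is reported above as round(128 * 100.0 / 255.0, 1) = 50.2% coverage.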
1028
+ def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
1029
+ a_cmyk = rgb_to_cmyk_array(a_img)
1030
+ b_cmyk = rgb_to_cmyk_array(b_img)
1031
+ entries = []
1032
+ for i, bx in enumerate(red_boxes):
1033
+ a_vals = avg_cmyk_in_box(a_cmyk, bx)
1034
+ b_vals = avg_cmyk_in_box(b_cmyk, bx)
1035
+ delta = tuple(round(b_vals[j] - a_vals[j], 1) for j in range(4))
1036
+ entries.append({'idx': i+1, 'A': a_vals, 'B': b_vals, 'Delta': delta})
1037
+ return entries
1038
+
1039
+ def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
1040
+ w,h = base.size
1041
+ panel = Image.new('RGB', (panel_width, h), (245,245,245))
1042
+ out = Image.new('RGB', (w+panel_width, h), (255,255,255))
1043
+ out.paste(base, (0,0)); out.paste(panel, (w,0))
1044
+ d = ImageDraw.Draw(out)
1045
+ x0 = w + 8; y = 8
1046
+ d.text((x0, y), title, fill=(0,0,0)); y += 18
1047
+ if not entries:
1048
+ d.text((x0, y), 'No differing regions', fill=(80,80,80))
1049
+ return out
1050
+ for e in entries:
1051
+ idx = e['idx']; aC,aM,aY,aK = e['A']; bC,bM,bY,bK = e['B']; dC,dM,dY,dK = e['Delta']
1052
+ d.text((x0, y), f"#{idx}", fill=(0,0,0)); y += 14
1053
+ d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0,0,0)); y += 14
1054
+ d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0,0,0)); y += 14
1055
+ d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120,0,0)); y += 18
1056
+ if y > h - 40: break
1057
+ return out
1058
+
1059
+ # -------------------- Gradio Interface -----------------
1060
+ def compare_pdfs(file_a, file_b):
1061
+ """Main comparison function for Gradio interface"""
1062
+ try:
1063
+ if file_a is None or file_b is None:
1064
+ return None, None, None, "❌ Please upload both PDF files to compare", [], []
1065
+
1066
+ # Load images with multiple pages support
1067
+ pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
1068
+ pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)
1069
+
1070
+ # Combine pages into single images for comparison
1071
+ a = combine_pages_vertically(pages_a)
1072
+ b = combine_pages_vertically(pages_b)
1073
+
1074
+ # Match sizes
1075
+ a, b = match_sizes(a, b)
1076
+
1077
+ # Find differences with default settings
1078
+ diff = difference_map(a, b)
1079
+ red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)
1080
+
1081
+ # Run all analysis features with defaults
1082
+ # Use text-based spell checking instead of OCR for better accuracy
1083
+ # Pass image dimensions for proper coordinate mapping
1084
+ image_size = (a.width, a.height)
1085
+ misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
1086
+ misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
1087
+
1088
+ # Debug: Print spell check results
1089
+ print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
1090
+
1091
+ if HAS_BARCODE:
1092
+ # Use PDF-based barcode detection instead of rasterized image
1093
+ bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(a)
1094
+ bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(b)
1095
+
1096
+ # Debug: Print barcode detection results
1097
+ print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
1098
+ else:
1099
+ bar_a, info_a = [], []
1100
+ bar_b, info_b = [], []
1101
+
1102
+ # Always enable CMYK analysis
1103
+ cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
1104
+
1105
+ # Create visualizations with default box width
1106
+ a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
1107
+ b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)
1108
+
1109
+ # Always show CMYK panel
1110
+ a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
1111
+ b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
1112
+
1113
+ # Create pixel difference overlay
1114
+ overlay = make_red_overlay(a, b)
1115
+
1116
+ # Create status message
1117
+ status = f"""
1118
+ 📊 **Analysis Complete!**
+ - **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
+ - **Difference regions found:** {len(red_boxes)}
+ - **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
+ - **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
+ - **Combined image dimensions:** {a.width} × {a.height} pixels
+
+ **Legend:**
+ - 🔴 Red boxes: Visual differences
+ - 🔵 Cyan boxes: Spelling errors
+ - 🟢 Green boxes: Barcodes/QR codes
1129
+ """
1130
+
1131
+ # Prepare barcode data for tables
1132
+ codes_a = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
1133
+ c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_a]
1134
+ codes_b = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
1135
+ c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_b]
1136
+
1137
+ return overlay, a_disp, b_disp, status, codes_a, codes_b
1138
+
1139
+ except Exception as e:
1140
+ error_msg = f"❌ **Error:** {str(e)}"
1141
+ return None, None, None, error_msg, [], []
1142
+
1143
+ # -------------------- Gradio App -------------------
1144
+ def create_demo():
1145
+ # Create custom theme with light blue background
1146
+ # Create a simple, working theme with supported parameters only
1147
+ custom_theme = gr.themes.Soft(
1148
+ primary_hue="blue",
1149
+ neutral_hue="blue",
1150
+ font=gr.themes.GoogleFont("Inter"),
1151
+ ).set(
1152
+ body_background_fill="#99cfe9", # Light blue background
1153
+ body_background_fill_dark="#99cfe9",
1154
+ block_background_fill="#000000", # Black blocks for contrast
1155
+ block_background_fill_dark="#000000",
1156
+ border_color_primary="#333333", # Dark borders
1157
+ border_color_primary_dark="#333333",
1158
+ )
1159
+
1160
+ with gr.Blocks(title="PDF Comparison Tool", theme=custom_theme) as demo:
1161
+ gr.Markdown("""
1162
+ # πŸ” Advanced PDF Comparison Tool
1163
+
1164
+ Upload two PDF files to get comprehensive analysis including:
1165
+ - **Multi-page PDF support** (up to 15 pages per document)
1166
+ - **Visual differences** with bounding boxes
1167
+ - **OCR and spell checking**
1168
+ - **Barcode/QR code detection**
1169
+ - **CMYK color analysis**
1170
+ """)
1171
+
1172
+ with gr.Row():
1173
+ with gr.Column():
1174
+ file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
+ file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])
+
+ compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")
1178
+
1179
+ status_md = gr.Markdown("")
1180
+
1181
+ with gr.Row():
1182
+ overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")
1183
+
1184
+ with gr.Row():
1185
+ img_a = gr.Image(label="📄 File A with Analysis", type="pil")
+ img_b = gr.Image(label="📄 File B with Analysis", type="pil")
1187
+
1188
+ gr.Markdown("### 📊 Barcode Detection Results")
1189
+ with gr.Row():
1190
+ codes_a_df = gr.Dataframe(
1191
+ headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
1192
+ label="Barcodes in File A",
1193
+ interactive=False
1194
+ )
1195
+ codes_b_df = gr.Dataframe(
1196
+ headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
1197
+ label="Barcodes in File B",
1198
+ interactive=False
1199
+ )
1200
+
1201
+ # Event handlers
1202
+ compare_btn.click(
1203
+ fn=compare_pdfs,
1204
+ inputs=[file_a, file_b],
1205
+ outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
1206
+ )
1207
+
1208
+ gr.Markdown("""
1209
+ ### πŸ“ Instructions:
1210
+ 1. Upload two PDF files
1211
+ 2. Click "Compare PDF Files"
1212
+ 3. View results with comprehensive analysis
1213
+
1214
+ ### 🎨 Color Legend:
+ - **🔴 Red boxes:** Visual differences between files
+ - **🔵 Cyan boxes:** Potential spelling errors (OCR)
+ - **🟢 Green boxes:** Detected barcodes/QR codes
+ - **📊 Side panel:** CMYK color analysis for print workflows
1219
+ """)
1220
+
1221
+ return demo
1222
+
1223
+ def _binarize(pil_img: Image.Image) -> Image.Image:
1224
+ """Create a binarized (black/white) version of the image for better barcode detection"""
1225
+ g = ImageOps.grayscale(pil_img)
1226
+ g = ImageOps.autocontrast(g)
1227
+ return g.point(lambda x: 255 if x > 140 else 0, mode='1').convert('L')
1228
+
1229
+ def _decode_once(img: Image.Image):
1230
+ """Single decode attempt with common barcode symbols"""
1231
+ if not HAS_BARCODE:
1232
+ return []
1233
+ syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128]
1234
+ return zbar_decode(img, symbols=syms)
1235
+
1236
+ def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
1237
+ """
1238
+ Debug function to scan PDF at multiple DPIs and variants to diagnose barcode detection issues.
1239
+
1240
+ This function:
1241
+ - Renders pages at 600/900/1200 DPI
1242
+ - Tries grayscale, binarized, and rotated versions
1243
+ - Scans embedded images (XObjects)
1244
+ - Prints what it finds and writes debug PNGs
1245
+ - Helps identify if barcodes are too thin/low resolution
1246
+
1247
+ Usage:
1248
+ debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
1249
+ """
1250
+ if not (HAS_BARCODE and HAS_PYMUPDF):
1251
+ print("ERROR: Missing dependencies (pyzbar or PyMuPDF)")
1252
+ return
1253
+
1254
+ os.makedirs(outdir, exist_ok=True)
1255
+ doc = fitz.open(pdf_path)
1256
+
1257
+ for dpi in (600, 900, 1200):
1258
+ scale = dpi / 72.0
1259
+ mat = fitz.Matrix(scale, scale)
1260
+ print(f"\n=== DPI {dpi} ===")
1261
+
1262
+ for p in range(min(len(doc), max_pages)):
1263
+ page = doc[p]
1264
+ pix = page.get_pixmap(matrix=mat, alpha=False)
1265
+ img = Image.open(io.BytesIO(pix.tobytes("ppm")))
1266
+ img.save(f"{outdir}/page{p+1}_{dpi}.png")
1267
+
1268
+ # Try different image variants
1269
+ variants = [
1270
+ ("orig", img),
1271
+ ("gray", ImageOps.grayscale(img)),
1272
+ ("bin", _binarize(img)),
1273
+ ]
1274
+ found = []
1275
+
1276
+ for tag, v in variants:
1277
+ r = _decode_once(v)
1278
+ if r:
1279
+ found.extend((tag, rr.type, rr.data) for rr in r)
1280
+ else:
1281
+ # Try rotations
1282
+ for angle in (90, 180, 270):
1283
+ rr = _decode_once(v.rotate(angle, expand=True))
1284
+ if rr:
1285
+ found.extend((f"{tag}_rot{angle}", rri.type, rri.data) for rri in rr)
1286
+ break
1287
+
1288
+ print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")
1289
+
1290
+ # Scan embedded images too
1291
+ imgs = page.get_images(full=True)
1292
+ for ix, (xref, *_) in enumerate(imgs):
1293
+ try:
1294
+ ipix = fitz.Pixmap(doc, xref)
1295
+ if ipix.alpha:
1296
+ ipix = fitz.Pixmap(ipix, 0)
1297
+ pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
1298
+ pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
1299
+ rr = _decode_once(pil) or _decode_once(_binarize(pil))
1300
+ if rr:
1301
+ print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
1302
+ except Exception as e:
1303
+ print(" Embedded image error:", e)
1304
+
1305
+ doc.close()
1306
+ print(f"\nDebug images saved to: {outdir}/")
1307
+ print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
1308
+
1309
+ def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
1310
+ """Detect barcodes from the original PDF and return boxes in the same
1311
+ coordinate space as the combined display image.
1312
+
1313
+ If image_size is provided (w,h of the vertically combined display image),
1314
+ each page is rendered so its width matches w, then decoded. Box y-coordinates
1315
+ are offset by the cumulative height of previous pages so that all boxes map
1316
+ into the combined image space correctly.
1317
+ """
1318
+ boxes: List[Box] = []
1319
+ infos: List[Dict[str, Any]] = []
1320
+ try:
1321
+ doc = fitz.open(pdf_path)
1322
+ num_pages = min(len(doc), max_pages)
1323
+ if num_pages == 0:
1324
+ return [], []
1325
+
1326
+ target_width = None
1327
+ if image_size:
1328
+ target_width = int(image_size[0])
1329
+
1330
+ y_offset = 0
1331
+ for page_idx in range(num_pages):
1332
+ page = doc[page_idx]
1333
+ # Compute scale so that rendered width matches target_width when provided
1334
+ if target_width:
1335
+ page_width_pts = float(page.rect.width) # points (72 dpi)
1336
+ scale = max(1.0, target_width / page_width_pts)
1337
+ else:
1338
+ # fallback dpi ~600
1339
+ scale = 600.0 / 72.0
1340
+ try:
1341
+ pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
1342
+ except TypeError:
1343
+ pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
1344
+ pil = _pix_to_pil(pix)
1345
+ pw, ph = pil.size
1346
+ hits = _decode_variants(pil)
1347
+ for r in hits:
1348
+ x1 = int(r.get("left", 0))
1349
+ y1 = int(r.get("top", 0)) + y_offset
1350
+ w = int(r.get("width", 0))
1351
+ h = int(r.get("height", 0))
1352
+ x2 = x1 + w
1353
+ y2 = y1 + h
1354
+ b = Box(y1, x1, y2, x2, w * h)
1355
+ # Exclude bottom 115mm for combined image if we know full height; else per-page
1356
+ if image_size and _is_in_excluded_bottom_area(b, image_size[1]):
1357
+ continue
1358
+ if not image_size and _is_in_excluded_bottom_area(b, ph):
1359
+ continue
1360
+ boxes.append(b)
1361
+ sym, payload = r.get("type", ""), r.get("data", "")
1362
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
1363
+ y_offset += ph
1364
+ doc.close()
1365
+ except Exception:
1366
+ return [], []
1367
+ return boxes, infos
1368
+
1369
+ if __name__ == "__main__":
1370
+ demo = create_demo()
1371
+ demo.launch(
1372
+ server_name="0.0.0.0", # Allow external access
1373
+ share=True, # Set to True to create a public link
1374
+ show_error=True
1375
+ )