Yaz Hobooti
committed on
Commit
Β·
7c19dd8
1
Parent(s):
4e0d8d0
chore: push current local state (diff guards)
Browse files- pdf_comparator_backup.py +1375 -0
pdf_comparator_backup.py
ADDED
|
@@ -0,0 +1,1375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Gradio PDF Comparison Tool
|
| 4 |
+
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os, sys, re, csv, json, io
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import List, Tuple, Optional, Iterable
|
| 10 |
+
import tempfile
|
| 11 |
+
import unicodedata
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
|
| 15 |
+
from pdf2image import convert_from_path
|
| 16 |
+
from skimage.measure import label, regionprops
|
| 17 |
+
from skimage.morphology import dilation, rectangle
|
| 18 |
+
import gradio as gr
|
| 19 |
+
|
| 20 |
+
# Alternative PDF processing
|
| 21 |
+
try:
|
| 22 |
+
import fitz # PyMuPDF
|
| 23 |
+
HAS_PYMUPDF = True
|
| 24 |
+
except Exception:
|
| 25 |
+
fitz = None
|
| 26 |
+
HAS_PYMUPDF = False
|
| 27 |
+
|
| 28 |
+
# Optional features
|
| 29 |
+
try:
|
| 30 |
+
import pytesseract
|
| 31 |
+
HAS_OCR = True
|
| 32 |
+
except Exception:
|
| 33 |
+
pytesseract = None
|
| 34 |
+
HAS_OCR = False
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
from spellchecker import SpellChecker
|
| 38 |
+
HAS_SPELLCHECK = True
|
| 39 |
+
except Exception:
|
| 40 |
+
SpellChecker = None
|
| 41 |
+
HAS_SPELLCHECK = False
|
| 42 |
+
|
| 43 |
+
try:
|
| 44 |
+
import regex as re
|
| 45 |
+
HAS_REGEX = True
|
| 46 |
+
except Exception:
|
| 47 |
+
import re
|
| 48 |
+
HAS_REGEX = False
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
from pyzbar.pyzbar import decode as zbar_decode
|
| 52 |
+
HAS_BARCODE = True
|
| 53 |
+
except Exception:
|
| 54 |
+
zbar_decode = None
|
| 55 |
+
HAS_BARCODE = False
|
| 56 |
+
|
| 57 |
+
# -------------------- Core Data --------------------
|
| 58 |
+
@dataclass
class Box:
    """Axis-aligned bounding box in image pixel coordinates (row/column order)."""
    y1: int; x1: int; y2: int; x2: int; area: int  # top row, left col, bottom row, right col, region pixel count
|
| 61 |
+
|
| 62 |
+
# ---- spell/tokenization helpers & caches ----
|
| 63 |
+
# Module-level tokenizer regex: the `regex` package supports Unicode property
# classes (\p{Letter}); the stdlib `re` fallback is ASCII-only.
if HAS_REGEX:
    # Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
    _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
else:
    # Fallback regex for basic ASCII
    _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")

# Spell checkers are optional; both stay None when pyspellchecker is absent.
if HAS_SPELLCHECK:
    # Initialize English spell checker with comprehensive dictionary
    _SPELL_EN = SpellChecker(language="en")

    # Try to initialize French spell checker with fallback
    _SPELL_FR = None
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        # If French dictionary fails, try alternative approach
        try:
            _SPELL_FR = SpellChecker()
            # Load some basic French words manually if needed
        except Exception:
            _SPELL_FR = None
            print("Warning: French spell checker not available")
else:
    _SPELL_EN = None
    _SPELL_FR = None

# Words never flagged as misspellings: brand names, print-industry jargon,
# abbreviations, British spellings, and pharmaceutical/chemical vocabulary.
# NOTE(review): several accented entries below ("QuΓ©bec", "franΓ§ais", ...)
# look mojibake-encoded (presumably "Québec", "français") — confirm the
# original file's encoding before relying on these entries matching OCR text.
_DOMAIN_ALLOWLIST = {
    # Company/Brand names
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",

    # Technical terms
    "CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
    "Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
    "Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",

    # Common abbreviations
    "Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
    "USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",

    # French words (common in Canadian context)
    "QuΓ©bec", "MontrΓ©al", "Toronto", "Vancouver", "Ottawa", "Calgary",
    "franΓ§ais", "franΓ§aise", "anglais", "anglaise", "bilingue",

    # Common business terms
    "Marketing", "Sales", "Customer", "Service", "Quality", "Control",
    "Management", "Administration", "Production", "Manufacturing",
    "Distribution", "Logistics", "Supply", "Chain", "Inventory",

    # Common words that might be flagged
    "Email", "Website", "Online", "Internet", "Software", "Hardware",
    "Database", "System", "Network", "Server", "Client", "User",
    "Password", "Login", "Logout", "Account", "Profile", "Settings",
    "Configuration", "Installation", "Maintenance", "Support",

    # Numbers and measurements
    "mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
    "x", "by", "times", "multiply", "divide", "plus", "minus",

    # Common misspellings that are actually correct in context
    # (set literal tolerates the duplicated entries below)
    "colour", "colour", "favour", "favour", "honour", "honour",
    "behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
    "theatre", "theatre", "metre", "metre", "litre", "litre",

    # Pharmaceutical terms
    "glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate",
    "ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
    "stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
    "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
    "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
    "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
    "sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
    "copper", "manganese", "selenium", "chromium", "molybdenum",
    "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
    "biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
    "phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
    "creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
    "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
    "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
    "asparagine", "glutamic", "aspartic", "alanine", "glycine",
    "polysorbate", "monostearate", "distearate", "tristearate",
    "polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
    "cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
    "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
    "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
    "stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
    "eicosapentaenoic", "arachidonic", "linolenic", "gamma", "linolenic",
    "conjugated", "linoleic", "acid", "ester", "amide", "anhydride",
    "hydrochloride", "hydrobromide", "hydroiodide", "nitrate", "sulfate",
    "phosphate", "acetate", "citrate", "tartrate", "succinate", "fumarate",
    "malate", "lactate", "gluconate", "ascorbate", "tocopheryl", "acetate",
    "palmitate", "stearate", "oleate", "linoleate", "arachidonate"
}
# Lowercased view used for case-insensitive membership tests.
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

# Teach the dictionaries the domain vocabulary so it is never "unknown".
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
| 163 |
+
|
| 164 |
+
def _normalize_text(s: str) -> str:
|
| 165 |
+
"""Normalize text for better word extraction"""
|
| 166 |
+
if not s:
|
| 167 |
+
return ""
|
| 168 |
+
|
| 169 |
+
# Unicode normalization
|
| 170 |
+
s = unicodedata.normalize("NFC", s)
|
| 171 |
+
|
| 172 |
+
# Fix common apostrophe issues
|
| 173 |
+
s = s.replace("'", "'").replace("'", "'")
|
| 174 |
+
|
| 175 |
+
# Normalize whitespace - replace multiple spaces with single space
|
| 176 |
+
s = re.sub(r'\s+', ' ', s)
|
| 177 |
+
|
| 178 |
+
# Remove leading/trailing whitespace
|
| 179 |
+
s = s.strip()
|
| 180 |
+
|
| 181 |
+
return s
|
| 182 |
+
|
| 183 |
+
def _extract_tokens(raw: str):
    """Tokenize *raw* into plausible word tokens.

    The text is normalized first, then matched against the module word
    regex; only tokens at least two characters long that pass the
    word-likeness heuristic survive.
    """
    normalized = _normalize_text(raw or "")
    return [
        candidate
        for candidate in _WORD_RE.findall(normalized)
        if len(candidate) >= 2 and _is_likely_word(candidate)
    ]
|
| 195 |
+
|
| 196 |
+
def _looks_like_acronym(tok: str) -> bool:
|
| 197 |
+
"""Check if token looks like a valid acronym"""
|
| 198 |
+
return tok.isupper() and 2 <= len(tok) <= 6
|
| 199 |
+
|
| 200 |
+
def _has_digits(tok: str) -> bool:
|
| 201 |
+
"""Check if token contains digits"""
|
| 202 |
+
return any(ch.isdigit() for ch in tok)
|
| 203 |
+
|
| 204 |
+
def _is_mostly_numbers(tok: str) -> bool:
|
| 205 |
+
"""Check if token is mostly numbers (should be ignored)"""
|
| 206 |
+
if not tok:
|
| 207 |
+
return False
|
| 208 |
+
|
| 209 |
+
# Count digits and letters
|
| 210 |
+
digit_count = sum(1 for ch in tok if ch.isdigit())
|
| 211 |
+
letter_count = sum(1 for ch in tok if ch.isalpha())
|
| 212 |
+
total_chars = len(tok)
|
| 213 |
+
|
| 214 |
+
# If more than 70% digits, consider it mostly numbers
|
| 215 |
+
if digit_count / total_chars > 0.7:
|
| 216 |
+
return True
|
| 217 |
+
|
| 218 |
+
# If it's a pure number (all digits), ignore it
|
| 219 |
+
if digit_count == total_chars:
|
| 220 |
+
return True
|
| 221 |
+
|
| 222 |
+
# If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
|
| 223 |
+
if total_chars >= 2 and digit_count >= 1:
|
| 224 |
+
suffix = tok[-2:].lower()
|
| 225 |
+
if suffix in ['st', 'nd', 'rd', 'th']:
|
| 226 |
+
return True
|
| 227 |
+
|
| 228 |
+
# If it's a decimal number (contains digits and decimal point)
|
| 229 |
+
if '.' in tok and digit_count > 0:
|
| 230 |
+
return True
|
| 231 |
+
|
| 232 |
+
# If it's a percentage (ends with %)
|
| 233 |
+
if tok.endswith('%') and digit_count > 0:
|
| 234 |
+
return True
|
| 235 |
+
|
| 236 |
+
return False
|
| 237 |
+
|
| 238 |
+
def _is_likely_word(tok: str) -> bool:
|
| 239 |
+
"""Check if token looks like a real word (not random characters)"""
|
| 240 |
+
if len(tok) < 2:
|
| 241 |
+
return False
|
| 242 |
+
|
| 243 |
+
# Filter out tokens that are mostly non-letter characters
|
| 244 |
+
letter_count = sum(1 for c in tok if c.isalpha())
|
| 245 |
+
if letter_count < len(tok) * 0.6: # At least 60% letters
|
| 246 |
+
return False
|
| 247 |
+
|
| 248 |
+
# Filter out tokens with too many consecutive consonants/vowels
|
| 249 |
+
vowels = set('aeiouAEIOU')
|
| 250 |
+
consonants = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ')
|
| 251 |
+
|
| 252 |
+
# Check for excessive consonant clusters (like "qwerty" or "zxcvb")
|
| 253 |
+
if len(tok) >= 4:
|
| 254 |
+
consonant_clusters = 0
|
| 255 |
+
vowel_clusters = 0
|
| 256 |
+
for i in range(len(tok) - 2):
|
| 257 |
+
if tok[i:i+3].lower() in consonants:
|
| 258 |
+
consonant_clusters += 1
|
| 259 |
+
if tok[i:i+3].lower() in vowels:
|
| 260 |
+
vowel_clusters += 1
|
| 261 |
+
|
| 262 |
+
# If more than half the possible clusters are consonant clusters, likely not a word
|
| 263 |
+
if consonant_clusters > len(tok) * 0.3:
|
| 264 |
+
return False
|
| 265 |
+
|
| 266 |
+
# Filter out tokens that look like random keyboard patterns
|
| 267 |
+
keyboard_patterns = [
|
| 268 |
+
'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
|
| 269 |
+
'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
|
| 270 |
+
'123456', '234567', '345678', '456789', '567890'
|
| 271 |
+
]
|
| 272 |
+
|
| 273 |
+
tok_lower = tok.lower()
|
| 274 |
+
for pattern in keyboard_patterns:
|
| 275 |
+
if pattern in tok_lower or tok_lower in pattern:
|
| 276 |
+
return False
|
| 277 |
+
|
| 278 |
+
return True
|
| 279 |
+
|
| 280 |
+
def _is_known_word(tok: str) -> bool:
    """Decide whether *tok* should be treated as correctly spelled.

    Order of checks (first hit wins):
      1. Non-word-looking and numeric-ish tokens are never flagged.
      2. Domain allowlist, acronyms, and digit-bearing tokens pass.
      3. Hyphenated tokens pass when EVERY part passes recursively.
      4. English then French dictionary lookups.
      5. Suffix/prefix stripping and simple plural stripping against the
         English dictionary.

    Returns:
        True when the token is considered known (do not flag),
        False when it looks like a genuine misspelling.
    """
    t = tok.lower()

    # Non-words (random character runs, etc.) are not worth flagging.
    if not _is_likely_word(tok):
        return True  # Don't flag non-words as misspellings

    # Ignore numbers and mostly numeric tokens
    if _is_mostly_numbers(tok):
        return True  # Don't flag numbers as misspellings

    # Check domain allowlist, acronyms, and words with digits
    if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
        return True

    # Hyphenated words: known only when ALL parts are known.
    # (The original comment said "any part", but the code requires all.)
    if '-' in tok:
        parts = tok.split('-')
        if all(_is_known_word(part) for part in parts):
            return True

    # Check against English spell checker (None when pyspellchecker absent).
    if _SPELL_EN:
        try:
            # unknown() returns the subset NOT in the dictionary; empty == known.
            if not _SPELL_EN.unknown([t]):
                return True
        except Exception:
            pass

    # Check against French spell checker
    if _SPELL_FR:
        try:
            # Check if word is known in French dictionary
            if not _SPELL_FR.unknown([t]):
                return True
        except Exception:
            pass

    # Additional checks for common patterns
    # Check for common suffixes/prefixes that might not be in dictionaries
    common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
    common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']

    # Strip a suffix and accept if the remaining stem is a known English word.
    for suffix in common_suffixes:
        if t.endswith(suffix) and len(t) > len(suffix) + 2:
            base_word = t[:-len(suffix)]
            if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
                return True

    # Same idea for prefixes.
    for prefix in common_prefixes:
        if t.startswith(prefix) and len(t) > len(prefix) + 2:
            base_word = t[len(prefix):]
            if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
                return True

    # Check for plural forms (simple 's' ending)
    if t.endswith('s') and len(t) > 3:
        singular = t[:-1]
        if _SPELL_EN and not _SPELL_EN.unknown([singular]):
            return True

    return False
|
| 345 |
+
|
| 346 |
+
# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
|
| 347 |
+
def normalize_token(token: str) -> str:
    """Compatibility shim: lowercase first extracted token, or "" when none."""
    extracted = _extract_tokens(token)
    if not extracted:
        return ""
    return extracted[0].lower()
|
| 350 |
+
|
| 351 |
+
# -------------------- Helpers ----------------------
|
| 352 |
+
def _is_pdf(path: str) -> bool:
|
| 353 |
+
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 354 |
+
|
| 355 |
+
def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
    """
    Report whether *box* falls inside the excluded strip at the page bottom.

    The strip height is given in millimetres and converted to pixels via
    the DPI (1 inch = 25.4 mm, so 1 mm = dpi / 25.4 px).  A box counts as
    excluded when its top edge sits at or below the strip's upper boundary.
    """
    excluded_px = int(excluded_height_mm * dpi / 25.4)
    strip_top = image_height - excluded_px
    return box.y1 >= strip_top
|
| 368 |
+
|
| 369 |
+
def _contains_validation_text(text: str) -> bool:
|
| 370 |
+
"""Check if text contains the validation text '50 Carroll'"""
|
| 371 |
+
return "50 Carroll" in text
|
| 372 |
+
|
| 373 |
+
def load_pdf_pages(path: str, dpi: int = 600, max_pages: int = 15) -> List[Image.Image]:
    """Load PDF pages as images with fallback options.

    Non-PDF paths are opened directly with PIL.  PDFs are rasterized with
    pdf2image, trying several candidate poppler locations, then falling
    back to PyMuPDF when available.

    Args:
        path: A PDF file or any PIL-readable image path.
        dpi: Rasterization resolution for PDF pages.
        max_pages: Upper bound on the number of pages converted.

    Returns:
        List of RGB PIL images, one per page.

    Raises:
        ValueError: when every conversion backend fails.
    """
    if not _is_pdf(path):
        return [Image.open(path).convert("RGB")]

    # Try pdf2image first; the trailing None means "let pdf2image locate
    # poppler on PATH by itself".
    poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]

    for poppler_path in poppler_paths:
        try:
            if poppler_path:
                imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
            else:
                imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)

            if imgs:
                return [img.convert("RGB") for img in imgs]
        except Exception:
            if poppler_path is None:  # All pdf2image attempts failed
                break
            continue  # Try next path

    # Fallback to PyMuPDF
    if HAS_PYMUPDF:
        try:
            doc = fitz.open(path)
            pages = []
            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]
                # PDF user space is 72 points/inch; scale up to requested dpi.
                mat = fitz.Matrix(dpi/72, dpi/72)
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("ppm")
                img = Image.open(io.BytesIO(img_data))
                pages.append(img.convert("RGB"))
            doc.close()
            return pages
        except Exception as e:
            raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")

    raise ValueError("Failed to convert PDF to image. No working method available.")
|
| 413 |
+
|
| 414 |
+
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Stack page images top-to-bottom on a single white canvas.

    Pages narrower than the widest one are centred horizontally, and
    *spacing* blank pixels separate consecutive pages.  A single page is
    returned as-is.

    Raises:
        ValueError: when *pages* is empty.
    """
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]

    canvas_width = max(p.width for p in pages)
    canvas_height = sum(p.height for p in pages) + spacing * (len(pages) - 1)
    canvas = Image.new('RGB', (canvas_width, canvas_height), (255, 255, 255))

    y = 0
    for page in pages:
        # Centre horizontally when narrower than the canvas.
        canvas.paste(page, ((canvas_width - page.width) // 2, y))
        y += page.height + spacing

    return canvas
|
| 438 |
+
|
| 439 |
+
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images to their shared top-left region so their sizes agree."""
    if a.size == b.size:
        return a, b
    common_w = min(a.width, b.width)
    common_h = min(a.height, b.height)
    region = (0, 0, common_w, common_h)
    return a.crop(region), b.crop(region)
|
| 444 |
+
|
| 445 |
+
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Per-pixel absolute difference of the two images."""
    diff = ImageChops.difference(a, b)
    return diff
|
| 447 |
+
|
| 448 |
+
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Locate connected regions of change in a difference image.

    Pipeline: collapse channels by per-pixel max, threshold, dilate with a
    3x3 window (merging near-adjacent changed pixels), then label connected
    components.  Components smaller than *min_area* pixels or lying in the
    excluded bottom strip (see _is_in_excluded_bottom_area) are dropped.

    Args:
        diff_img: Output of difference_map() (RGB difference image).
        threshold: Minimum channel delta for a pixel to count as changed.
        min_area: Minimum component size in pixels.

    Returns:
        List of Box records, one per surviving component.
    """
    arr = np.asarray(diff_img).astype(np.uint16)
    # Max across channels: a change in ANY channel marks the pixel.
    gray = arr.max(axis=2).astype(np.uint8)
    mask = (gray >= threshold).astype(np.uint8)
    # 3x3 dilation bridges small gaps between nearby changed pixels.
    mask = dilation(mask, rectangle(3, 3))
    labeled = label(mask, connectivity=2)
    out: List[Box] = []
    img_height = diff_img.height

    for p in regionprops(labeled):
        if p.area < min_area:
            continue
        minr, minc, maxr, maxc = p.bbox
        box = Box(minr, minc, maxr, maxc, int(p.area))

        # Skip boxes in the excluded bottom area
        if _is_in_excluded_bottom_area(box, img_height):
            continue

        out.append(box)
    return out
|
| 469 |
+
|
| 470 |
+
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None,
                     width: int = 3) -> Image.Image:
    """Return a copy of *img* with coloured outlines drawn around boxes.

    Colour legend:
        red   - pixel differences (red_boxes)
        cyan  - misspellings (cyan_boxes)
        green - barcodes (green_boxes, optional)

    Args:
        img: Source image; never modified.
        red_boxes: Boxes outlined in red.
        cyan_boxes: Boxes outlined in cyan.
        green_boxes: Boxes outlined in green, or None to skip.
            (Annotation fixed to Optional — the original declared
            List[Box] with a None default.)
        width: Outline thickness in pixels.

    Returns:
        A new image with the outlines drawn.
    """
    out = img.copy()
    draw = ImageDraw.Draw(out)

    def _outline(boxes: List[Box], color) -> None:
        # Draw `width` nested rectangles to simulate a thick outline,
        # matching the original triplicated loops exactly.
        for b in boxes:
            for w in range(width):
                draw.rectangle([b.x1 - w, b.y1 - w, b.x2 + w, b.y2 + w], outline=color)

    _outline(red_boxes, (255, 0, 0))
    _outline(cyan_boxes, (0, 255, 255))
    if green_boxes:
        _outline(green_boxes, (0, 255, 0))
    return out
|
| 487 |
+
|
| 488 |
+
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return image *a* with every pixel that differs from *b* painted pure red."""
    base = np.asarray(a).copy()
    other = np.asarray(b)
    changed = np.any(base != other, axis=2)
    base[changed] = [255, 0, 0]
    return Image.fromarray(base)
|
| 493 |
+
|
| 494 |
+
# -------------------- OCR + Spellcheck -------------
# NOTE(review): this section appears to be a duplicated paste of earlier code.
# The unconditional imports below shadow the guarded optional imports at the
# top of the file (defeating the HAS_OCR / HAS_SPELLCHECK fallbacks and
# crashing at import time when regex/pytesseract/pyspellchecker are absent),
# and the redefinitions of _WORD_RE, _SPELL_EN, _SPELL_FR, _normalize_text,
# _looks_like_acronym, _has_digits and normalize_token silently replace the
# richer versions defined earlier.  Candidate for removal — verify callers
# before deleting.
from typing import List, Iterable, Optional
from PIL import Image
import unicodedata
import regex as re
import pytesseract
from spellchecker import SpellChecker

# If these existed in your file, keep them; otherwise define defaults to avoid NameError
try:
    HAS_OCR
except NameError:
    HAS_OCR = True
try:
    HAS_SPELLCHECK
except NameError:
    HAS_SPELLCHECK = True

# ---- spell/tokenization helpers & caches ----
# NOTE(review): the character class below contains mojibake ("β...") where a
# typographic apostrophe was presumably intended — confirm against the
# original file's encoding.
_WORD_RE = re.compile(r"\p{Letter}+(?:[β'\-]\p{Letter}+)*", re.UNICODE)

_SPELL_EN = SpellChecker(language="en")
_SPELL_FR = SpellChecker(language="fr")

# Much smaller allowlist than the one defined earlier in the file.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}
_SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
_SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)

def _normalize_text(s: str) -> str:
    # NOTE(review): shadows the fuller _normalize_text defined earlier; the
    # replace() below is a self-replacement after an encoding mishap (no-op).
    s = unicodedata.normalize("NFC", s)
    return s.replace("'", "'").strip()

def _looks_like_acronym(tok: str) -> bool:
    # Duplicate of the earlier definition: all-caps token of 2-6 chars.
    return tok.isupper() and 2 <= len(tok) <= 6

def _has_digits(tok: str) -> bool:
    # Duplicate of the earlier definition: any decimal digit present.
    return any(ch.isdigit() for ch in tok)

# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
def normalize_token(token: str) -> str:
    toks = _extract_tokens(token)
    return (toks[0].lower() if toks else "")
|
| 539 |
+
|
| 540 |
+
def _get_available_tesseract_langs():
    """Pick the best OCR language string supported by the local Tesseract.

    Prefers English+French, then English alone, then whatever is installed;
    falls back to "eng" when the language list cannot be queried at all.
    """
    try:
        installed = pytesseract.get_languages()
        if 'eng' in installed and 'fra' in installed:
            return "eng+fra"
        if 'eng' in installed:
            return "eng"
        return installed[0] if installed else "eng"
    except Exception:
        # Tesseract missing or unqueryable: assume plain English.
        return "eng"
|
| 554 |
+
|
| 555 |
+
def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Grayscale + autocontrast + unsharp-mask a page image to help OCR."""
    from PIL import ImageOps, ImageFilter
    gray = ImageOps.autocontrast(img.convert("L"))
    sharpen = ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2)
    return gray.filter(sharpen)
|
| 562 |
+
|
| 563 |
+
def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
    """Extract text directly from PDF using PyMuPDF.

    Returns one string per page (up to ``max_pages``). Returns an empty list
    when PyMuPDF is unavailable or the file cannot be opened/parsed.
    """
    if not HAS_PYMUPDF:
        return []

    try:
        doc = fitz.open(path)
        texts = []
        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]
            text = page.get_text()
            texts.append(text)
        doc.close()
        return texts
    except Exception:
        # Best-effort: any open/parse failure is treated as "no text".
        return []
|
| 579 |
+
|
| 580 |
+
def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
    """Map a PDF-space bounding box into combined-image pixel space.

    The box is scaled by the image/page size ratio, then its y-coordinates
    are shifted down by ``page_num * page_height`` so vertically stacked
    pages do not overlap. Returns (x1, y1, x2, y2) as ints.
    """
    pdf_w, pdf_h = pdf_page_size
    img_w, img_h = image_size

    # Per-axis scale factors from PDF units to pixels.
    sx = img_w / pdf_w
    sy = img_h / pdf_h
    y_shift = page_num * page_height

    left = int(pdf_bbox[0] * sx)
    top = int(pdf_bbox[1] * sy) + y_shift
    right = int(pdf_bbox[2] * sx)
    bottom = int(pdf_bbox[3] * sy) + y_shift
    return left, top, right, bottom
|
| 596 |
+
|
| 597 |
+
def find_misspell_boxes_from_text(
    pdf_path: str,
    *,
    extra_allow: Optional[Iterable[str]] = None,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None
) -> List[Box]:
    """Find misspellings by analyzing extracted PDF text directly with coordinate mapping.

    Walks PyMuPDF's "dict" text layout (blocks -> lines -> spans) and emits one
    Box per span containing at least one unknown word. When ``image_size`` is
    given (width, height of the combined display image), span bboxes are scaled
    into that pixel space, with each page offset vertically by the image
    height; otherwise raw PDF coordinates are used with a 1000-unit page
    offset. Requires both the spellchecker and PyMuPDF; returns [] otherwise.
    """
    if not (HAS_SPELLCHECK and HAS_PYMUPDF):
        return []

    # Load extra allowed words into both dictionaries (lowercased, matching
    # how the module-level allowlist is stored).
    if extra_allow and _SPELL_EN:
        _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
    if extra_allow and _SPELL_FR:
        _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

    boxes: List[Box] = []

    try:
        doc = fitz.open(pdf_path)

        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]

            # Get text with position information (PyMuPDF layout dict).
            text_dict = page.get_text("dict")

            # Process each block of text; image blocks have no "lines" key.
            for block in text_dict.get("blocks", []):
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span.get("text", "").strip()
                        if not text:
                            continue

                        # Extract tokens and check for misspellings; one bad
                        # token is enough to flag the whole span.
                        tokens = _extract_tokens(text)
                        has_misspelling = False

                        for token in tokens:
                            # Single characters are ignored (too noisy).
                            if len(token) >= 2 and not _is_known_word(token):
                                has_misspelling = True
                                break

                        # If this span has misspellings, create a box for it
                        if has_misspelling:
                            bbox = span["bbox"]  # [x0, y0, x1, y1] in PDF points

                            # Get page dimensions for coordinate conversion
                            page_rect = page.rect
                            pdf_width = page_rect.width
                            pdf_height = page_rect.height

                            # Calculate coordinates
                            if image_size:
                                img_width, img_height = image_size
                                # Convert PDF coordinates to image coordinates;
                                # pages stack vertically, hence the y offset.
                                scale_x = img_width / pdf_width
                                scale_y = img_height / pdf_height
                                x1 = int(bbox[0] * scale_x)
                                y1 = int(bbox[1] * scale_y) + (page_num * img_height)
                                x2 = int(bbox[2] * scale_x)
                                y2 = int(bbox[3] * scale_y) + (page_num * img_height)
                            else:
                                # No image: keep PDF units, 1000-unit page stride.
                                x1 = int(bbox[0])
                                y1 = int(bbox[1]) + (page_num * 1000)
                                x2 = int(bbox[2])
                                y2 = int(bbox[3]) + (page_num * 1000)

                            # Create box (Box is defined elsewhere in this file).
                            box = Box(y1=y1, x1=x1, y2=y2, x2=x2, area=(x2 - x1) * (y2 - y1))

                            # Skip boxes in excluded bottom area unless they contain validation text
                            if image_size:
                                img_height = image_size[1]
                                if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
                                    continue

                            boxes.append(box)

        doc.close()

    except Exception:
        # Fallback to simple text extraction if coordinate mapping fails:
        # flag whole pages (placeholder 800x1000 boxes) that contain any
        # unknown word, since no per-span coordinates are available.
        page_texts = extract_pdf_text(pdf_path, max_pages)
        for page_num, text in enumerate(page_texts):
            if not text.strip():
                continue

            tokens = _extract_tokens(text)
            misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]

            if misspelled_words:
                # Create a placeholder box for the page
                boxes.append(Box(
                    y1=page_num * 1000,
                    x1=0,
                    y2=(page_num + 1) * 1000,
                    x2=800,
                    area=800 * 1000
                ))

    return boxes
|
| 704 |
+
|
| 705 |
+
def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """Legacy OCR-based spell checking (kept for fallback).

    Runs Tesseract word-level OCR over ``img`` and returns one Box per OCR
    word whose confidence is >= ``min_conf`` and which contains at least one
    unknown token. Requires both OCR and spellcheck support; returns []
    otherwise or on any OCR failure.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []

    # Auto-detect language if not provided: use eng+fra when both dictionaries
    # are installed, otherwise plain English.
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"

    # OPTIONAL: light upscale if the image is small (heuristic)
    # target width ~ 2500β3000 px for letter-sized pages
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)

    # Prepare image for better OCR (grayscale/contrast/sharpen).
    img = prepare_for_ocr(img)

    try:
        # Fold caller-supplied allowlist into both dictionaries.
        if extra_allow and _SPELL_EN:
            _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
        if extra_allow and _SPELL_FR:
            _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

        # Build a config that sets an explicit DPI and keeps spaces
        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"

        data = pytesseract.image_to_data(
            img,
            lang=lang,
            config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return []

    n = len(data.get("text", [])) or 0
    boxes: List[Box] = []

    for i in range(n):
        raw = data["text"][i]
        if not raw:
            continue

        # confidence filter (Tesseract reports conf as a string, -1 = none)
        conf_str = data.get("conf", ["-1"])[i]
        try:
            conf = int(float(conf_str))
        except Exception:
            conf = -1
        if conf < min_conf:
            continue

        tokens = _extract_tokens(raw)
        if not tokens:
            continue

        # flag the box if ANY token in it looks misspelled
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue

        left = data.get("left", [0])[i]
        top = data.get("top", [0])[i]
        width = data.get("width", [0])[i]
        height = data.get("height",[0])[i]
        if width <= 0 or height <= 0:
            continue

        # NOTE: adjust to match your Box constructor if needed
        b = Box(top, left, top + height, left + width, width * height)
        # Exclude bottom 115mm unless the text contains the validation phrase
        if _is_in_excluded_bottom_area(b, img.height) and not _contains_validation_text(raw):
            continue
        boxes.append(b)

    return boxes
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
|
| 799 |
+
|
| 800 |
+
|
| 801 |
+
|
| 802 |
+
# deps: pip install zxing-cpp pyzbar pylibdmtx PyMuPDF pillow opencv-python-headless regex
|
| 803 |
+
# system: macOS -> brew install zbar poppler ; Ubuntu -> sudo apt-get install libzbar0 poppler-utils
|
| 804 |
+
|
| 805 |
+
import io, regex as re
|
| 806 |
+
from typing import List, Tuple, Dict, Any
|
| 807 |
+
from PIL import Image, ImageOps
|
| 808 |
+
import numpy as np
|
| 809 |
+
|
| 810 |
+
import fitz # PyMuPDF
|
| 811 |
+
|
| 812 |
+
# Optional decoder backends: each failed import simply disables that backend
# via its HAS_* flag rather than crashing the whole module.
try:
    import zxingcpp
    HAS_ZXING = True
except Exception:
    HAS_ZXING = False

try:
    from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol
    HAS_ZBAR = True
except Exception:
    HAS_ZBAR = False
    ZBarSymbol = None

try:
    from pylibdmtx.pylibdmtx import decode as dmtx_decode
    HAS_DMTX = True
except Exception:
    HAS_DMTX = False

try:
    import cv2
    HAS_CV2 = True
except Exception:
    HAS_CV2 = False
|
| 825 |
+
|
| 826 |
+
# your Box(y1,x1,y2,x2,area) assumed to exist
|
| 827 |
+
|
| 828 |
+
def _binarize(img: Image.Image) -> Image.Image:
    """Return a hard black/white copy (threshold 140) of the image, mode 'L'."""
    gray = ImageOps.autocontrast(ImageOps.grayscale(img))
    bw = gray.point(lambda px: 255 if px > 140 else 0, mode="1")
    return bw.convert("L")
|
| 832 |
+
|
| 833 |
+
def _ean_checksum_ok(d: str) -> bool:
|
| 834 |
+
if not d.isdigit(): return False
|
| 835 |
+
n=len(d); nums=list(map(int,d))
|
| 836 |
+
if n==8:
|
| 837 |
+
return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(7))%10))%10==nums[7]
|
| 838 |
+
if n==12:
|
| 839 |
+
return (10 - (sum(nums[i]*(3 if i%2==0 else 1) for i in range(11))%10))%10==nums[11]
|
| 840 |
+
if n==13:
|
| 841 |
+
return (10 - (sum(nums[i]*(1 if i%2==0 else 3) for i in range(12))%10))%10==nums[12]
|
| 842 |
+
return True
|
| 843 |
+
|
| 844 |
+
def _normalize_upc_ean(sym: str, text: str):
|
| 845 |
+
digits = re.sub(r"\D","",text or "")
|
| 846 |
+
s = (sym or "").upper()
|
| 847 |
+
if s in ("EAN13","EAN-13") and len(digits)==13 and digits.startswith("0"):
|
| 848 |
+
return "UPCA", digits[1:]
|
| 849 |
+
return s, (digits if s in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A") else text or "")
|
| 850 |
+
|
| 851 |
+
def _validate(sym: str, payload: str) -> bool:
    """True when a decoded payload looks valid for its symbology.

    Numeric symbologies must pass the EAN/UPC checksum; anything else is
    considered valid as long as the payload is non-empty.
    """
    norm_sym, norm_payload = _normalize_upc_ean(sym, payload)
    if norm_sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return _ean_checksum_ok(norm_payload)
    return bool(payload)
|
| 854 |
+
|
| 855 |
+
def _decode_zxing(pil: Image.Image) -> List[Dict[str,Any]]:
    """Decode barcodes with zxing-cpp over a grayscale array.

    Returns a list of hit dicts with keys type/data/left/top/width/height
    (all-zero geometry when corner positions cannot be recovered), or []
    when the backend is unavailable.
    """
    if not HAS_ZXING: return []
    arr = np.asarray(pil.convert("L"))
    out=[]
    for r in zxingcpp.read_barcodes(arr):  # try_harder is default True in recent builds; otherwise supply options
        # zxingcpp.Position may be iterable (sequence of points) or an object with corner attributes
        x1=y1=x2=y2=w=h=0
        pos = getattr(r, "position", None)
        pts: List[Any] = []
        if pos is not None:
            try:
                pts = list(pos)  # works if iterable
            except TypeError:
                # Fall back to known corner attribute names across versions
                corner_names = (
                    "top_left", "topLeft",
                    "top_right", "topRight",
                    "bottom_left", "bottomLeft",
                    "bottom_right", "bottomRight",
                    "point1", "point2", "point3", "point4",
                )
                seen=set()
                for name in corner_names:
                    if hasattr(pos, name):
                        p = getattr(pos, name)
                        # avoid duplicates (snake_case/camelCase aliases may
                        # return the same point object)
                        if id(p) not in seen and hasattr(p, "x") and hasattr(p, "y"):
                            pts.append(p)
                            seen.add(id(p))
        if pts:
            # Axis-aligned bounding box over whatever corner points we found.
            xs=[int(getattr(p, "x", 0)) for p in pts]
            ys=[int(getattr(p, "y", 0)) for p in pts]
            x1,x2=min(xs),max(xs); y1,y2=min(ys),max(ys)
            w,h=x2-x1,y2-y1
        out.append({
            "type": str(r.format),
            "data": r.text or "",
            "left": x1,
            "top": y1,
            "width": w,
            "height": h,
        })
    return out
|
| 898 |
+
|
| 899 |
+
def _decode_zbar(pil: Image.Image) -> List[Dict[str,Any]]:
    """Decode barcodes with pyzbar; returns hit dicts, or [] when unavailable."""
    if not HAS_ZBAR:
        return []
    if ZBarSymbol:
        wanted = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8,
                  ZBarSymbol.UPCA, ZBarSymbol.CODE128]
        decoded = zbar_decode(pil, symbols=wanted)
    else:
        decoded = zbar_decode(pil)
    hits = []
    for d in decoded:
        payload = d.data
        if isinstance(payload, (bytes, bytearray)):
            payload = payload.decode("utf-8", "ignore")
        else:
            payload = str(payload)
        hits.append({
            "type": d.type,
            "data": payload,
            "left": d.rect.left,
            "top": d.rect.top,
            "width": d.rect.width,
            "height": d.rect.height,
        })
    return hits
|
| 905 |
+
|
| 906 |
+
def _decode_dmtx(pil: Image.Image) -> List[Dict[str,Any]]:
    """Decode Data Matrix codes via pylibdmtx; [] when unavailable or on error."""
    if not HAS_DMTX:
        return []
    try:
        decoded = dmtx_decode(ImageOps.grayscale(pil))
        return [
            {
                "type": "DATAMATRIX",
                "data": r.data.decode("utf-8", "ignore"),
                "left": r.rect.left,
                "top": r.rect.top,
                "width": r.rect.width,
                "height": r.rect.height,
            }
            for r in decoded
        ]
    except Exception:
        return []
|
| 914 |
+
|
| 915 |
+
def _decode_cv2_qr(pil: Image.Image) -> List[Dict[str,Any]]:
    """Last-resort QR decode via OpenCV's QRCodeDetector; [] when unavailable."""
    if not HAS_CV2:
        return []
    try:
        detector = cv2.QRCodeDetector()
        gray = np.asarray(pil.convert("L"))
        value, corners, _ = detector.detectAndDecode(gray)
        if value:
            x1 = y1 = w = h = 0
            if corners is not None and len(corners) >= 1:
                flat = corners.reshape(-1, 2)
                xs, ys = flat[:, 0], flat[:, 1]
                x1, y1 = int(xs.min()), int(ys.min())
                w = int(xs.max()) - x1
                h = int(ys.max()) - y1
            return [{"type": "QRCODE", "data": value, "left": x1, "top": y1,
                     "width": w, "height": h}]
    except Exception:
        pass
    return []
|
| 932 |
+
|
| 933 |
+
def _decode_variants(pil: Image.Image) -> List[Dict[str,Any]]:
    """Try every decoder backend over several image variants and rotations.

    Variants are the original, grayscale, and binarized images (plus a 2x
    NEAREST upscale pair for small images, so bar edges stay crisp). The
    first variant/rotation that yields any hits wins.
    """
    candidates = [pil, ImageOps.grayscale(pil), _binarize(pil)]
    w, h = pil.size
    if max(w, h) < 1600:
        upscaled = pil.resize((w * 2, h * 2), resample=Image.NEAREST)
        candidates.extend([upscaled, _binarize(upscaled)])

    # ZXing first (broadest coverage), then ZBar, DMTX, and cv2's QR reader.
    decoders = (_decode_zxing, _decode_zbar, _decode_dmtx, _decode_cv2_qr)
    for variant in candidates:
        for decode in decoders:
            hits = decode(variant)
            if hits:
                return hits
        # Then the three remaining right-angle rotations of this variant.
        for angle in (90, 180, 270):
            rotated = variant.rotate(angle, expand=True)
            for decode in decoders:
                hits = decode(rotated)
                if hits:
                    return hits
    return []
|
| 956 |
+
|
| 957 |
+
def _pix_to_pil(pix) -> Image.Image:
    """Convert a PyMuPDF Pixmap to a PIL image via PNG bytes.

    Drops the alpha channel and attempts a grayscale conversion first
    (avoids blur from alpha compositing); falls back to the original
    colorspace when grayscale conversion is unsupported.
    """
    if pix.alpha: pix = fitz.Pixmap(pix, 0)
    try:
        pix = fitz.Pixmap(fitz.csGRAY, pix)
    except Exception:
        # Keep the original colorspace if grayscale conversion fails.
        pass
    return Image.open(io.BytesIO(pix.tobytes("png")))
|
| 965 |
+
|
| 966 |
+
def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
    """Return (boxes, infos) from both rendered pages and embedded images.

    For each page, first decodes any embedded raster images (often crisper
    than a re-render), then renders the page at each DPI in ``dpi_list``
    until something decodes. ``infos`` entries carry the decoder hit plus
    'valid', 1-based 'page', and a 'source' tag ("embed:N" or "page@Ddpi").
    NOTE(review): box coordinates are in each decoded image's own pixel
    space, which varies with DPI/source — confirm callers expect that.
    """
    boxes=[]; infos=[]
    doc=fitz.open(pdf_path)
    n=min(len(doc), max_pages)
    for page_idx in range(n):
        page=doc[page_idx]

        # A) Embedded images (often crisp)
        for ix,(xref,*_) in enumerate(page.get_images(full=True)):
            try:
                pix=fitz.Pixmap(doc, xref)
                pil=_pix_to_pil(pix)
                hits=_decode_variants(pil)
                for r in hits:
                    b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
                    # Exclude barcodes in the bottom 115mm of the page image
                    if _is_in_excluded_bottom_area(b, pil.height):
                        continue
                    boxes.append(b)
                    sym, payload = r["type"], r["data"]
                    infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
            except Exception:
                # Best-effort per embedded image; a bad XObject is skipped.
                pass

        # B) Render page raster at high DPI (grayscale)
        for dpi in dpi_list:
            scale=dpi/72.0
            try:
                pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                # Older PyMuPDF without the colorspace keyword.
                pix=page.get_pixmap(matrix=fitz.Matrix(scale,scale), alpha=False)
            pil=_pix_to_pil(pix)
            hits=_decode_variants(pil)
            for r in hits:
                b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
                if _is_in_excluded_bottom_area(b, pil.height):
                    continue
                boxes.append(b)
                sym, payload = r["type"], r["data"]
                infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
            if any(i["page"]==page_idx+1 for i in infos):
                break  # found something for this page -> next page
    doc.close()
    return boxes, infos
|
| 1011 |
+
|
| 1012 |
+
|
| 1013 |
+
|
| 1014 |
+
|
| 1015 |
+
# -------------------- CMYK Panel -------------------
|
| 1016 |
+
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Return the image as a float32 HxWx4 CMYK array (channels in 0..255)."""
    cmyk = img.convert('CMYK')
    return np.asarray(cmyk).astype(np.float32)
|
| 1018 |
+
|
| 1019 |
+
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Average C/M/Y/K percentages (0-100, one decimal) inside ``box``.

    The box is clamped to the array bounds; an empty or inverted region
    yields all zeros.
    """
    top = max(0, box.y1)
    bottom = min(cmyk_arr.shape[0], box.y2)
    left = max(0, box.x1)
    right = min(cmyk_arr.shape[1], box.x2)
    if bottom <= top or right <= left:
        return (0.0, 0.0, 0.0, 0.0)
    region = cmyk_arr[top:bottom, left:right, :]
    means = region.reshape(-1, 4).mean(axis=0)
    # 0..255 channel values -> percentages rounded to one decimal place.
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in means)
|
| 1027 |
+
|
| 1028 |
+
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Per-box average CMYK for both images plus the B-minus-A delta.

    Returns a list of dicts {'idx', 'A', 'B', 'Delta'} with 1-based indices
    matching the order of ``red_boxes``.
    """
    cmyk_a = rgb_to_cmyk_array(a_img)
    cmyk_b = rgb_to_cmyk_array(b_img)
    report = []
    for position, box in enumerate(red_boxes, start=1):
        vals_a = avg_cmyk_in_box(cmyk_a, box)
        vals_b = avg_cmyk_in_box(cmyk_b, box)
        delta = tuple(round(vb - va, 1) for va, vb in zip(vals_a, vals_b))
        report.append({'idx': position, 'A': vals_a, 'B': vals_b, 'Delta': delta})
    return report
|
| 1038 |
+
|
| 1039 |
+
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Append a light-gray side panel listing per-box CMYK values.

    ``entries`` is the output of compute_cmyk_diffs(); each entry renders as
    four text lines (index, A values, B values, Delta). Rendering stops when
    the panel runs out of vertical space. Returns a new image of size
    (base.width + panel_width, base.height).
    """
    w,h = base.size
    panel = Image.new('RGB', (panel_width, h), (245,245,245))
    out = Image.new('RGB', (w+panel_width, h), (255,255,255))
    out.paste(base, (0,0)); out.paste(panel, (w,0))
    d = ImageDraw.Draw(out)
    # Text cursor: 8px inside the panel, stepping 14px per line, 18px between
    # entries/after the title.
    x0 = w + 8; y = 8
    d.text((x0, y), title, fill=(0,0,0)); y += 18
    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80,80,80))
        return out
    for e in entries:
        idx = e['idx']; aC,aM,aY,aK = e['A']; bC,bM,bY,bK = e['B']; dC,dM,dY,dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0,0,0)); y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0,0,0)); y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0,0,0)); y += 14
        d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120,0,0)); y += 18
        # Stop when there is no room left for another 4-line entry.
        if y > h - 40: break
    return out
|
| 1058 |
+
|
| 1059 |
+
# -------------------- Gradio Interface -----------------
|
| 1060 |
+
def compare_pdfs(file_a, file_b):
    """Main comparison function for Gradio interface.

    Takes two uploaded PDF files and returns a 6-tuple for the UI outputs:
    (pixel-diff overlay image, annotated image A, annotated image B,
    markdown status string, barcode rows for A, barcode rows for B).
    Any exception is converted into an error status with None images.
    NOTE(review): load_pdf_pages / combine_pages_vertically / match_sizes /
    difference_map / find_diff_boxes / draw_boxes_multi / make_red_overlay
    are defined elsewhere in this file.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "β Please upload both PDF files to compare", [], []

        # Load images with multiple pages support
        pages_a = load_pdf_pages(file_a.name, dpi=600, max_pages=15)
        pages_b = load_pdf_pages(file_b.name, dpi=600, max_pages=15)

        # Combine pages into single images for comparison
        a = combine_pages_vertically(pages_a)
        b = combine_pages_vertically(pages_b)

        # Match sizes
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Run all analysis features with defaults
        # Use text-based spell checking instead of OCR for better accuracy
        # Pass image dimensions for proper coordinate mapping
        image_size = (a.width, a.height)
        misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
        misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []

        # Debug: Print spell check results
        print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")

        if HAS_BARCODE:
            # Use PDF-based barcode detection instead of rasterized image
            bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(a)
            bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(b)

            # Debug: Print barcode detection results
            print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Create status message (markdown rendered by the Gradio UI)
        status = f"""
π **Analysis Complete!**
- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
- **Difference regions found:** {len(red_boxes)}
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
- **Combined image dimensions:** {a.width} Γ {a.height} pixels

**Legend:**
- π΄ Red boxes: Visual differences
- π΅ Cyan boxes: Spelling errors
- π’ Green boxes: Barcodes/QR codes
"""

        # Prepare barcode data for tables (one row per decoded code)
        codes_a = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
                    c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_a]
        codes_b = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
                    c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_b]

        return overlay, a_disp, b_disp, status, codes_a, codes_b

    except Exception as e:
        # Surface any failure to the UI rather than crashing the app.
        error_msg = f"β **Error:** {str(e)}"
        return None, None, None, error_msg, [], []
|
| 1142 |
+
|
| 1143 |
+
# -------------------- Gradio App -------------------
|
| 1144 |
+
def create_demo():
    """Build and return the Gradio Blocks UI for the PDF comparison tool.

    Wires two file inputs and a compare button to compare_pdfs(), with
    outputs for the pixel-diff overlay, both annotated images, a status
    markdown block, and two barcode dataframes.
    """
    # Create custom theme with light blue background
    # Create a simple, working theme with supported parameters only
    custom_theme = gr.themes.Soft(
        primary_hue="blue",
        neutral_hue="blue",
        font=gr.themes.GoogleFont("Inter"),
    ).set(
        body_background_fill="#99cfe9",  # Light blue background
        body_background_fill_dark="#99cfe9",
        block_background_fill="#000000",  # Black blocks for contrast
        block_background_fill_dark="#000000",
        border_color_primary="#333333",  # Dark borders
        border_color_primary_dark="#333333",
    )

    with gr.Blocks(title="PDF Comparison Tool", theme=custom_theme) as demo:
        gr.Markdown("""
# π Advanced PDF Comparison Tool

Upload two PDF files to get comprehensive analysis including:
- **Multi-page PDF support** (up to 15 pages per document)
- **Visual differences** with bounding boxes
- **OCR and spell checking**
- **Barcode/QR code detection**
- **CMYK color analysis**
""")

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="π PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="π PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("π Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="π΄ Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="π File A with Analysis", type="pil")
            img_b = gr.Image(label="π File B with Analysis", type="pil")

        gr.Markdown("### π Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File B",
                interactive=False
            )

        # Event handlers: the single compare button drives all six outputs.
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown("""
### π Instructions:
1. Upload two PDF files
2. Click "Compare PDF Files"
3. View results with comprehensive analysis

### π¨ Color Legend:
- **π΄ Red boxes:** Visual differences between files
- **π΅ Cyan boxes:** Potential spelling errors (OCR)
- **π’ Green boxes:** Detected barcodes/QR codes
- **π Side panel:** CMYK color analysis for print workflows
""")

    return demo
|
| 1222 |
+
|
| 1223 |
+
def _binarize(pil_img: Image.Image) -> Image.Image:
    """Create a binarized (black/white) version of the image for better barcode detection"""
    # NOTE(review): this redefines the earlier module-level _binarize with
    # identical logic (autocontrast + threshold 140); the later definition
    # wins at import time — consider removing one of the two.
    g = ImageOps.grayscale(pil_img)
    g = ImageOps.autocontrast(g)
    return g.point(lambda x: 255 if x > 140 else 0, mode='1').convert('L')
|
| 1228 |
+
|
| 1229 |
+
def _decode_once(img: Image.Image):
    """Single pyzbar decode attempt restricted to the common symbologies."""
    if not HAS_BARCODE:
        return []
    wanted = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8,
              ZBarSymbol.UPCA, ZBarSymbol.CODE128]
    return zbar_decode(img, symbols=wanted)
|
| 1235 |
+
|
| 1236 |
+
def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
    """
    Debug function to scan PDF at multiple DPIs and variants to diagnose barcode detection issues.

    This function:
    - Renders pages at 600/900/1200 DPI
    - Tries grayscale, binarized, and rotated versions
    - Scans embedded images (XObjects)
    - Prints what it finds and writes debug PNGs
    - Helps identify if barcodes are too thin/low resolution

    Usage:
        debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
    """
    if not (HAS_BARCODE and HAS_PYMUPDF):
        print("ERROR: Missing dependencies (pyzbar or PyMuPDF)")
        return

    os.makedirs(outdir, exist_ok=True)
    doc = fitz.open(pdf_path)

    for dpi in (600, 900, 1200):
        # PyMuPDF renders at 72 dpi by default; scale up accordingly.
        scale = dpi / 72.0
        mat = fitz.Matrix(scale, scale)
        print(f"\n=== DPI {dpi} ===")

        for p in range(min(len(doc), max_pages)):
            page = doc[p]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            img.save(f"{outdir}/page{p+1}_{dpi}.png")

            # Try different image variants
            variants = [
                ("orig", img),
                ("gray", ImageOps.grayscale(img)),
                ("bin", _binarize(img)),
            ]
            found = []

            for tag, v in variants:
                r = _decode_once(v)
                if r:
                    found.extend((tag, rr.type, rr.data) for rr in r)
                else:
                    # Try rotations (stop at the first angle that decodes)
                    for angle in (90, 180, 270):
                        rr = _decode_once(v.rotate(angle, expand=True))
                        if rr:
                            found.extend((f"{tag}_rot{angle}", rri.type, rri.data) for rri in rr)
                            break

            print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")

            # Scan embedded images too (XObjects are often crisper than renders)
            imgs = page.get_images(full=True)
            for ix, (xref, *_) in enumerate(imgs):
                try:
                    ipix = fitz.Pixmap(doc, xref)
                    if ipix.alpha:
                        ipix = fitz.Pixmap(ipix, 0)
                    pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
                    pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
                    rr = _decode_once(pil) or _decode_once(_binarize(pil))
                    if rr:
                        print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
                except Exception as e:
                    print(" Embedded image error:", e)

    doc.close()
    print(f"\nDebug images saved to: {outdir}/")
    print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
|
| 1308 |
+
|
| 1309 |
+
def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
    """Detect barcodes from the original PDF and return boxes in the same
    coordinate space as the combined display image.

    If image_size is provided (w, h of the vertically combined display image),
    each page is rendered so its width matches w, then decoded. Box y-coordinates
    are offset by the cumulative height of previous pages so that all boxes map
    into the combined image space correctly.

    Args:
        pdf_path: Path to the PDF file to scan.
        image_size: Optional (width, height) of the combined display image;
            when given, pages are rendered to match its width and boxes in the
            bottom exclusion zone of the combined image are dropped.
        max_pages: Maximum number of pages to scan (default 10).

    Returns:
        (boxes, infos) where boxes is a List[Box] in combined-image coordinates
        and infos is a parallel List[Dict] of decode results enriched with
        "valid", "page" and "source" keys. Returns ([], []) on any error
        (best-effort contract preserved for callers).
    """
    boxes: List[Box] = []
    infos: List[Dict[str, Any]] = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        num_pages = min(len(doc), max_pages)
        if num_pages == 0:
            return [], []

        target_width = None
        if image_size:
            target_width = int(image_size[0])

        y_offset = 0
        for page_idx in range(num_pages):
            page = doc[page_idx]
            # Compute scale so that rendered width matches target_width when provided.
            if target_width:
                page_width_pts = float(page.rect.width)  # points (72 dpi)
                # Never downscale below 1.0 or barcodes may become undecodable.
                scale = max(1.0, target_width / page_width_pts)
            else:
                # fallback dpi ~600
                scale = 600.0 / 72.0
            try:
                # Grayscale render is cheaper and sufficient for barcode decoding.
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
            except TypeError:
                # Older PyMuPDF versions may not accept colorspace here.
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            pil = _pix_to_pil(pix)
            pw, ph = pil.size
            hits = _decode_variants(pil)
            for r in hits:
                x1 = int(r.get("left", 0))
                # Shift into combined-image space by the height of previous pages.
                y1 = int(r.get("top", 0)) + y_offset
                w = int(r.get("width", 0))
                h = int(r.get("height", 0))
                x2 = x1 + w
                y2 = y1 + h
                b = Box(y1, x1, y2, x2, w * h)
                # Exclude bottom 115mm for combined image if we know full height; else per-page.
                if image_size and _is_in_excluded_bottom_area(b, image_size[1]):
                    continue
                if not image_size and _is_in_excluded_bottom_area(b, ph):
                    continue
                boxes.append(b)
                sym, payload = r.get("type", ""), r.get("data", "")
                infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
            # Advance offset even when a page has no hits so later pages map correctly.
            y_offset += ph
    except Exception:
        # Best-effort: any failure (bad path, render error, decode error)
        # yields empty results rather than propagating to the caller.
        return [], []
    finally:
        # Always release the document handle, even when an exception occurred
        # mid-scan (the original only closed on the fully-successful path).
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
    return boxes, infos
|
| 1368 |
+
|
| 1369 |
+
if __name__ == "__main__":
    # Build the Gradio interface and serve it.
    app = create_demo()
    app.launch(
        server_name="0.0.0.0",  # Allow external access
        share=True,             # Set to True to create a public link
        show_error=True,
    )
|