Upload extract_form_fields.py
Browse files
skills/pdf/tools/extract_form_fields.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Any, List
|
| 2 |
+
def extract_fields(pdf_path: str) -> Dict[str, Any]:
    """Extract form fields from a PDF using PyPDF2.

    The primary path calls ``PdfReader.get_fields()``. If that call raises or
    reports no fields, each page's ``/Annots`` array is scanned for Widget
    annotations instead.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``PdfReader``.

    Returns:
        On success, ``{"ok": True, "fields": [...], "method": m}`` where ``m``
        is ``"get_fields"`` or ``"annots"``. Each field is a dict with
        ``"name"`` and ``"value"``; the ``"annots"`` path also includes a
        1-based ``"page"``. On failure, ``{"ok": False, "error": message}``.
        Import, read and scan errors are reported through the returned dict
        rather than raised.
    """
    # Imported lazily so a missing dependency becomes an error result
    # instead of an ImportError at module import time.
    try:
        from PyPDF2 import PdfReader
    except Exception as e:
        return {"ok": False, "error": f"PyPDF2 not available: {e}"}

    try:
        reader = PdfReader(pdf_path)
    except Exception as e:
        return {"ok": False, "error": f"Unable to read PDF: {e}"}

    # Primary path: get_fields(), available in many PyPDF2 versions.
    try:
        fields = reader.get_fields()  # type: ignore[attr-defined]
        if fields:
            acro_fields: List[Dict[str, Any]] = [
                {
                    "name": key if isinstance(key, str) else str(key),
                    "value": field.get("/V") if isinstance(field, dict) else None,
                }
                for key, field in fields.items()
            ]
            return {"ok": True, "fields": acro_fields, "method": "get_fields"}
    except Exception:
        # Deliberate best-effort: any failure here (including a reader
        # without get_fields) falls through to the annotation scan.
        pass

    # Fallback: scan page annotations for Widget entries. A fresh list is
    # used so nothing from a partially failed primary path leaks into it.
    widget_fields: List[Dict[str, Any]] = []
    try:
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                annots = page.get("/Annots", [])
                # dict.get may hand back an unresolved indirect reference;
                # resolve it so the array can be iterated.
                resolve = getattr(annots, "get_object", None)
                if callable(resolve):
                    annots = resolve()
            except Exception:
                annots = []
            for annot_ref in annots or []:
                try:
                    annot = annot_ref.get_object()
                    subtype = annot.get("/Subtype")
                    # PDF names compare equal to their slash-prefixed string
                    # form; the ``.name`` check is kept for objects that
                    # expose the bare name as an attribute.
                    is_widget = (
                        subtype == "/Widget"
                        or getattr(subtype, "name", None) == "Widget"
                    )
                    if is_widget:
                        name = annot.get("/T")
                        value = annot.get("/V")
                        widget_fields.append(
                            {"page": page_number, "name": name, "value": value}
                        )
                except Exception:
                    # Skip a malformed annotation; keep scanning the rest.
                    continue
        return {"ok": True, "fields": widget_fields, "method": "annots"}
    except Exception as e:
        return {"ok": False, "error": f"Fallback failed: {e}"}
|