mgbam committed on
Commit
a6b3514
·
verified ·
1 Parent(s): 5a365cb

Upload extract_form_fields.py

Browse files
skills/pdf/tools/extract_form_fields.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, List
2
def extract_fields(pdf_path: str) -> Dict[str, Any]:
    """Extract interactive form fields from a PDF file.

    Two strategies are tried in order:

    1. ``PdfReader.get_fields()`` (reported as ``method == "get_fields"``).
    2. A scan of every page's ``/Annots`` array for Widget annotations
       (reported as ``method == "annots"``). This path is used when the
       primary call raises or yields no fields.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        On success: ``{"ok": True, "fields": [...], "method": ...}``.
        Entries from ``get_fields`` look like ``{"name": ..., "value": ...}``;
        entries from the annotation scan also carry a 1-based ``"page"``.
        On failure: ``{"ok": False, "error": "<message>"}``. The function
        reports problems through the returned dict instead of raising.
    """
    # Imported lazily so a missing dependency becomes an error result
    # rather than an ImportError at module import time.
    try:
        from PyPDF2 import PdfReader
    except Exception as e:
        return {"ok": False, "error": f"PyPDF2 not available: {e}"}

    try:
        reader = PdfReader(pdf_path)
    except Exception as e:
        return {"ok": False, "error": f"Unable to read PDF: {e}"}

    # Primary path. The list is built completely before it is used, so a
    # failure half-way through can never leak partial entries into the
    # fallback result.
    try:
        fields = reader.get_fields()  # type: ignore[attr-defined]
        primary = [
            {
                "name": key if isinstance(key, str) else str(key),
                "value": field.get("/V") if isinstance(field, dict) else None,
            }
            for key, field in (fields or {}).items()
        ]
    except Exception:
        # Deliberately best-effort: get_fields() may be missing or may choke
        # on a malformed AcroForm; the annotation scan below can still work.
        primary = []
    if primary:
        return {"ok": True, "fields": primary, "method": "get_fields"}

    # Fallback: scan annotations for Widget entries.
    try:
        widgets = _collect_widget_fields(reader.pages)
    except Exception as e:
        return {"ok": False, "error": f"Fallback failed: {e}"}
    return {"ok": True, "fields": widgets, "method": "annots"}


def _resolve_pdf_object(obj: Any) -> Any:
    """Return the object behind an indirect reference (or ``obj`` itself)."""
    getter = getattr(obj, "get_object", None)
    return getter() if callable(getter) else obj


def _is_widget_subtype(subtype: Any) -> bool:
    """Tell whether an annotation ``/Subtype`` value denotes a Widget."""
    if subtype is None:
        return False
    subtype = _resolve_pdf_object(subtype)
    # Objects exposing a ``name`` attribute are still honoured, as before.
    if getattr(subtype, "name", None) == "Widget":
        return True
    # PDF name objects are str subclasses that keep the leading slash
    # ("/Widget"), so compare on the string value.
    return str(subtype) in ("/Widget", "Widget")


def _collect_widget_fields(pages: Any) -> List[Dict[str, Any]]:
    """Collect ``{"page", "name", "value"}`` for each Widget annotation."""
    found: List[Dict[str, Any]] = []
    for page_number, page in enumerate(pages, start=1):
        try:
            # dict.get() does not dereference indirect objects, so resolve
            # the /Annots entry explicitly before iterating it.
            annots = _resolve_pdf_object(page.get("/Annots"))
        except Exception:
            continue
        if not isinstance(annots, (list, tuple)):
            # Missing, null or malformed /Annots: nothing to scan here.
            continue
        for annot_ref in annots:
            try:
                annot = _resolve_pdf_object(annot_ref)
                if _is_widget_subtype(annot.get("/Subtype")):
                    found.append(
                        {
                            "page": page_number,
                            "name": annot.get("/T"),
                            "value": annot.get("/V"),
                        }
                    )
            except Exception:
                # One broken annotation must not hide the remaining ones.
                continue
    return found