Spaces:

mgbam
/

oluown

Sleeping

App Files Files Community

mgbam committed 23 days ago

Commit

a6b3514

verified ·

1 Parent(s): 5a365cb

Upload extract_form_fields.py

Browse files

Files changed (1) hide show

skills/pdf/tools/extract_form_fields.py +54 -0

skills/pdf/tools/extract_form_fields.py ADDED Viewed

	@@ -0,0 +1,54 @@

from typing import Dict, Any, List


def _resolve(obj: Any) -> Any:
    """Return the object behind an indirect reference, or *obj* unchanged."""
    getter = getattr(obj, "get_object", None)
    return getter() if callable(getter) else obj


def _is_widget(subtype: Any) -> bool:
    """Tell whether an annotation ``/Subtype`` value denotes a form widget."""
    # PyPDF2 name objects are str subclasses spelled with the leading slash
    # ("/Widget"), so plain equality is the reliable test.  The attribute
    # check is kept so anything the previous implementation accepted
    # (an object exposing ``.name == "Widget"``) still matches.
    return subtype == "/Widget" or getattr(subtype, "name", None) == "Widget"


def _fields_via_get_fields(reader: Any) -> List[Dict[str, Any]]:
    """Collect ``{"name", "value"}`` entries from ``reader.get_fields()``."""
    fields = reader.get_fields()  # type: ignore[attr-defined]
    if not fields:
        return []
    return [
        {
            "name": key if isinstance(key, str) else str(key),
            # Only mapping-like field objects carry a "/V" (value) entry.
            "value": field.get("/V") if isinstance(field, dict) else None,
        }
        for key, field in fields.items()
    ]


def _widgets_on_page(page: Any, page_number: int) -> List[Dict[str, Any]]:
    """Collect Widget annotations found in one page's ``/Annots`` array."""
    try:
        # "/Annots" may be stored as an indirect reference to the array;
        # resolve it before iterating.
        annots = _resolve(page.get("/Annots"))
    except Exception:
        annots = None

    found: List[Dict[str, Any]] = []
    for annot_ref in annots or []:
        try:
            annot = _resolve(annot_ref)
            if not _is_widget(_resolve(annot.get("/Subtype"))):
                continue
            found.append(
                {
                    "page": page_number,
                    "name": annot.get("/T"),
                    "value": annot.get("/V"),
                }
            )
        except Exception:
            # Best-effort scan: a malformed annotation must not hide the
            # well-formed ones that follow it.
            continue
    return found


def extract_fields(pdf_path: str) -> Dict[str, Any]:
    """
    Attempt to extract PDF form fields.

    The primary path uses ``PdfReader.get_fields()``; when that is
    unavailable, fails, or yields nothing, the page annotations are scanned
    for Widget entries instead.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        On success ``{"ok": True, "fields": [...], "method": m}`` where ``m``
        is ``"get_fields"`` (entries are ``{"name", "value"}``) or
        ``"annots"`` (entries are ``{"page", "name", "value"}`` with 1-based
        page numbers).  On failure ``{"ok": False, "error": message}``.
        The function reports problems through the result dict rather than
        raising.
    """
    try:
        from PyPDF2 import PdfReader
    except Exception as e:
        return {"ok": False, "error": f"PyPDF2 not available: {e}"}

    try:
        reader = PdfReader(pdf_path)
    except Exception as e:
        return {"ok": False, "error": f"Unable to read PDF: {e}"}

    # Primary path.  Results are built in their own list so that a failure
    # part-way through cannot leak partial entries into the fallback output.
    try:
        primary = _fields_via_get_fields(reader)
    except Exception:
        # Deliberate best-effort: get_fields() is not available in every
        # PyPDF2 version, so any failure just means "use the fallback".
        primary = []
    if primary:
        return {"ok": True, "fields": primary, "method": "get_fields"}

    # Fallback: scan annotations for Widget entries.
    try:
        fields_out: List[Dict[str, Any]] = []
        for index, page in enumerate(reader.pages):
            fields_out.extend(_widgets_on_page(page, index + 1))
        return {"ok": True, "fields": fields_out, "method": "annots"}
    except Exception as e:
        return {"ok": False, "error": f"Fallback failed: {e}"}