#!/usr/bin/env python3
"""Extract form field data from PDFs found recursively under the current directory."""

import sys
from pathlib import Path

from pypdf import PdfReader

# Bit masks for the /Ff field-flags word: ReadOnly is bit position 1 and
# Required is bit position 2 in the PDF specification's field-flag table.
_FLAG_READ_ONLY = 1 << 0
_FLAG_REQUIRED = 1 << 1


def _field_flags(field_info):
    """Return the field's /Ff flag word as an int, or 0 if absent or malformed."""
    raw_flags = field_info.get('/Ff', 0)
    try:
        return int(raw_flags)
    except (TypeError, ValueError):
        # A null or non-numeric /Ff must not discard every other field in the file.
        return 0


def extract_form_fields(pdf_path):
    """Read the interactive form fields of a single PDF.

    Args:
        pdf_path: Path (or path string) of the PDF to inspect.

    Returns:
        ``None`` when the document has no form fields.
        ``{'file', 'error'}`` when the file is encrypted or reading it failed.
        ``{'file', 'field_count', 'fields'}`` otherwise, where each entry of
        ``fields`` holds ``name``, ``value``, ``default_value``, ``type``,
        ``flags``, ``read_only``, ``required`` and, when present, ``tooltip``.
    """
    # The broad catch is deliberate: this runs once per file during a directory
    # scan, and one unreadable PDF should be reported, not abort the whole run.
    try:
        reader = PdfReader(pdf_path)
        if reader.is_encrypted:
            return {'file': str(pdf_path), 'error': 'File is encrypted'}

        fields = reader.get_fields()
        if not fields:
            return None

        form_data = []
        for field_name, field_info in fields.items():
            if not field_info:
                continue

            flags = _field_flags(field_info)
            field_entry = {
                'name': field_name,
                'value': field_info.get('/V', ''),
                'default_value': field_info.get('/DV', ''),
                'type': field_info.get('/FT', ''),
                'flags': flags,
                'read_only': bool(flags & _FLAG_READ_ONLY),
                'required': bool(flags & _FLAG_REQUIRED),
            }

            # /T is the field's own (partial) name; when present it replaces
            # the key that get_fields() returned for this field.
            if '/T' in field_info:
                field_entry['name'] = field_info['/T']

            # /TU is the alternate, user-facing name, typically shown as a tooltip.
            if '/TU' in field_info:
                field_entry['tooltip'] = field_info['/TU']

            form_data.append(field_entry)

        return {
            'file': str(pdf_path),
            'field_count': len(form_data),
            'fields': form_data,
        }
    except Exception as e:
        return {
            'file': str(pdf_path),
            'error': str(e),
        }


def _print_report(result):
    """Print the per-file report for a successful extract_form_fields() result."""
    print(f"\nFile: {result['file']}")
    print(f" Total Fields: {result['field_count']}")
    print("-" * 60)

    for field in result['fields']:
        # Clean up the type string (e.g. '/Tx' -> 'Tx').
        f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'

        val = field['value']
        display_val = f"'{val}'" if val else ""

        status = []
        if field['required']:
            status.append("REQ")
        if field['read_only']:
            status.append("RO")
        status_str = f"[{', '.join(status)}]" if status else ""

        print(f" [{f_type:<3}] {field['name']}")
        print(f" Value: {display_val}")
        if status_str:
            print(f" Flags: {status_str}")
        if field.get('tooltip'):
            print(f" Tip: {field['tooltip']}")

    print("-" * 60)


def main():
    """Scan the current directory tree for PDFs and print every form field found.

    Reports go to stdout. Files that are encrypted or could not be read are
    noted on stderr and excluded from the final count.
    """
    found_count = 0

    print(f"{'SCANNING FOR FORM FIELDS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = extract_form_fields(pdf_file)
        if not result:
            # No form fields in this document.
            continue

        if 'error' in result:
            # Keep stdout limited to successful reports, but do not hide failures.
            print(f"[!] Skipped {result['file']}: {result['error']}", file=sys.stderr)
            continue

        found_count += 1
        _print_report(result)

    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")


if __name__ == '__main__':
    main()