#!/usr/bin/env python3
|
|
"""Extract form field data from PDFs recursively."""
|
|
from pypdf import PdfReader
|
|
from pathlib import Path
|
|
|
|
def extract_form_fields(pdf_path):
    """Read the form fields of a single PDF.

    Args:
        pdf_path: Location of the PDF to inspect; it is handed to
            ``PdfReader`` and echoed back as ``str(pdf_path)``.

    Returns:
        ``None`` when ``PdfReader.get_fields()`` reports no fields.
        Otherwise a dict in one of two shapes:

        * success: ``{'file', 'field_count', 'fields'}`` where ``fields`` is
          a list of dicts with the keys ``name``, ``value``,
          ``default_value``, ``type``, ``flags``, ``read_only``,
          ``required`` and, only when the field has a ``/TU`` entry,
          ``tooltip``;
        * failure: ``{'file', 'error'}`` with a human-readable message.

    Any ``Exception`` raised while opening or parsing the file is converted
    into the failure shape, so one bad document cannot abort a batch scan.
    """
    try:
        reader = PdfReader(pdf_path)

        if reader.is_encrypted:
            return {'file': str(pdf_path), 'error': 'File is encrypted'}

        fields = reader.get_fields()
        if not fields:
            return None

        form_data = []
        for field_name, field_info in fields.items():
            if not field_info:
                continue

            # Look the flag word up once; bit 1 is reported as read-only and
            # bit 2 as required.
            flags = field_info.get('/Ff', 0)

            field_entry = {
                'name': field_name,
                'value': field_info.get('/V', ''),
                'default_value': field_info.get('/DV', ''),
                'type': field_info.get('/FT', ''),
                'flags': flags,
                'read_only': bool(flags & 1),
                'required': bool(flags & 2),
            }

            # The field's own /T entry, when present, replaces the key that
            # get_fields() supplied.
            # NOTE(review): /T is the *partial* field name in the PDF spec, so
            # nested fields presumably lose their parent prefix here --
            # confirm this is intended.
            if '/T' in field_info:
                field_entry['name'] = field_info['/T']

            # /TU carries the tooltip/description text.
            if '/TU' in field_info:
                field_entry['tooltip'] = field_info['/TU']

            form_data.append(field_entry)

        return {
            'file': str(pdf_path),
            'field_count': len(form_data),
            'fields': form_data,
        }

    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # scan, and the caller decides what to do with the error entry.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }
|
|
|
|
def main():
    """Scan the current directory tree for PDFs and print their form fields.

    Every ``*.pdf`` found below the working directory is passed to
    ``extract_form_fields``. Files that yield no result or an error entry are
    skipped without output; each remaining file is printed as a short report
    (type, name, value, flags and tooltip per field). A final line states how
    many PDFs were reported.
    """
    found_count = 0

    print(f"{'SCANNING FOR FORM FIELDS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = extract_form_fields(pdf_file)

        # A falsy result means there were no form fields; an 'error' entry
        # means the file could not be processed. Both are intentionally kept
        # out of the report so the output lists only usable forms.
        if not result or 'error' in result:
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f" Total Fields: {result['field_count']}")
        print("-" * 60)

        for field in result['fields']:
            # Clean up the type string (e.g. '/Tx' -> 'Tx').
            f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'

            val = field['value']
            display_val = f"'{val}'" if val else "<empty>"

            status = []
            if field['required']:
                status.append("REQ")
            if field['read_only']:
                status.append("RO")

            print(f" [{f_type:<3}] {field['name']}")
            print(f" Value: {display_val}")

            if status:
                status_str = f"[{', '.join(status)}]"
                print(f" Flags: {status_str}")
            if field.get('tooltip'):
                print(f" Tip: {field['tooltip']}")

        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")
|
|
|
|
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|