pdf-forensics/extract_form_fields.py
2026-01-17 20:18:15 +00:00

103 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""
Extract form field data from every PDF found recursively under the current directory.
"""
from pypdf import PdfReader
from pathlib import Path
def _coerce_field_flags(raw_flags):
    """Return a field's /Ff entry as a plain int, or 0 if it is unusable."""
    try:
        return int(raw_flags)
    except (TypeError, ValueError, OverflowError):
        # A null or otherwise malformed /Ff must not abort the whole file.
        return 0


def extract_form_fields(pdf_path):
    """Collect the interactive form fields of a single PDF.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        ``None`` when the reader reports no form fields.
        On success, a dict with ``'file'``, ``'field_count'`` and ``'fields'``;
        each entry of ``'fields'`` has ``'name'``, ``'value'``,
        ``'default_value'``, ``'type'``, ``'flags'``, ``'read_only'`` and
        ``'required'``, plus ``'tooltip'`` when the field carries ``/TU``.
        If the file is encrypted or anything raises while reading it, a dict
        with ``'file'`` and ``'error'`` instead.
    """
    try:
        reader = PdfReader(pdf_path)
        if reader.is_encrypted:
            return {'file': str(pdf_path), 'error': 'File is encrypted'}

        fields = reader.get_fields()
        if not fields:
            return None

        form_data = []
        for field_name, field_info in fields.items():
            if not field_info:
                continue

            # Look the flags up once and normalise them so the bit tests
            # below cannot raise on a non-integer value.
            flags = _coerce_field_flags(field_info.get('/Ff', 0))
            field_entry = {
                'name': field_name,
                'value': field_info.get('/V', ''),
                'default_value': field_info.get('/DV', ''),
                'type': field_info.get('/FT', ''),
                'flags': flags,
                'read_only': bool(flags & 1),
                'required': bool(flags & 2),
            }

            # /T is the field's own (partial) name in the PDF field
            # dictionary; when present it replaces the reader's key.
            if '/T' in field_info:
                field_entry['name'] = field_info['/T']

            # /TU is the alternate, user-facing name shown as a tooltip.
            if '/TU' in field_info:
                field_entry['tooltip'] = field_info['/TU']

            form_data.append(field_entry)

        return {
            'file': str(pdf_path),
            'field_count': len(form_data),
            'fields': form_data,
        }
    except Exception as e:
        # Per-file boundary: callers rely on getting an error record back
        # instead of an exception, so one bad PDF does not stop a scan.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }
def _print_field(field):
    """Print one extracted field entry in the report layout."""
    # clean up type string (e.g. '/Tx' -> 'Tx')
    f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'

    val = field['value']
    display_val = f"'{val}'" if val else "<empty>"

    status = []
    if field['required']:
        status.append("REQ")
    if field['read_only']:
        status.append("RO")

    print(f" [{f_type:<3}] {field['name']}")
    print(f" Value: {display_val}")
    if status:
        print(f" Flags: [{', '.join(status)}]")
    if field.get('tooltip'):
        print(f" Tip: {field['tooltip']}")


def main():
    """Scan the current directory tree for PDFs and print their form fields.

    The report goes to stdout.  Files for which ``extract_form_fields``
    returns an error record are listed on stderr, so they stay visible
    without changing the stdout report.
    """
    import sys

    # Compare the suffix case-insensitively so '.PDF' / '.Pdf' are scanned
    # too, and skip directories that merely have a PDF-looking name.
    # Sorting makes the report order reproducible between runs.
    pdf_files = sorted(
        path
        for path in Path('.').rglob('*')
        if path.suffix.lower() == '.pdf' and path.is_file()
    )

    found_count = 0
    skipped_count = 0

    print(f"{'SCANNING FOR FORM FIELDS':=^60}")

    for pdf_file in pdf_files:
        result = extract_form_fields(pdf_file)
        if not result:
            # No form fields in this file.
            continue

        if 'error' in result:
            skipped_count += 1
            print(f"[!] {result['file']}: {result['error']}", file=sys.stderr)
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f" Total Fields: {result['field_count']}")
        print("-" * 60)

        for field in result['fields']:
            _print_field(field)

        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")
    if skipped_count:
        print(
            f"Skipped {skipped_count} PDFs that could not be read.",
            file=sys.stderr,
        )


if __name__ == '__main__':
    main()