Add scripts
This commit is contained in:
commit
cc682741d5
6 changed files with 437 additions and 0 deletions
103
extract_form_fields.py
Normal file
103
extract_form_fields.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
extract form field data from PDFs recursively
|
||||
"""
|
||||
from pypdf import PdfReader
|
||||
from pathlib import Path
|
||||
|
||||
def _flag_bits(raw_flags):
    """Return the field flags as a plain int, or 0 when they are not numeric."""
    try:
        return int(raw_flags)
    except (TypeError, ValueError):
        # A malformed /Ff entry should not discard every other field in the
        # document, so it is treated as "no flags set".
        return 0


def _build_field_entry(field_name, field_info):
    """Flatten one form-field mapping into a plain dict for reporting.

    Args:
        field_name: Key under which the reader reported the field.
        field_info: Mapping of PDF field attributes ('/V', '/DV', '/FT',
            '/Ff', '/T', '/TU').

    Returns:
        A dict with 'name', 'value', 'default_value', 'type', 'flags',
        'read_only' and 'required', plus 'tooltip' when '/TU' is present.
    """
    raw_flags = field_info.get('/Ff', 0)
    bits = _flag_bits(raw_flags)

    entry = {
        'name': field_name,
        'value': field_info.get('/V', ''),
        'default_value': field_info.get('/DV', ''),
        'type': field_info.get('/FT', ''),
        'flags': raw_flags,
        'read_only': bool(bits & 1),
        'required': bool(bits & 2),
    }

    # When the field carries its own '/T' entry, that value replaces the
    # dictionary key as the reported name.
    # NOTE(review): the original comment called '/T' the "alternate field
    # name"; in the PDF specification '/T' is the partial name and '/TU' the
    # alternate one -- confirm that dropping the dictionary key is intended.
    if '/T' in field_info:
        entry['name'] = field_info['/T']

    # Tooltip/description.
    if '/TU' in field_info:
        entry['tooltip'] = field_info['/TU']

    return entry


def extract_form_fields(pdf_path):
    """Collect the form fields of a single PDF.

    Args:
        pdf_path: Path (str or path-like) handed to ``PdfReader``.

    Returns:
        * ``None`` when the reader reports no form fields.
        * ``{'file', 'error'}`` when the file is encrypted or any exception
          is raised while reading it.
        * ``{'file', 'field_count', 'fields'}`` otherwise, where 'fields' is
          a list of per-field dicts (fields with empty info are skipped).
    """
    try:
        reader = PdfReader(pdf_path)

        if reader.is_encrypted:
            return {'file': str(pdf_path), 'error': 'File is encrypted'}

        fields = reader.get_fields()
        if not fields:
            return None

        form_data = [
            _build_field_entry(field_name, field_info)
            for field_name, field_info in fields.items()
            if field_info
        ]

        return {
            'file': str(pdf_path),
            'field_count': len(form_data),
            'fields': form_data,
        }

    except Exception as e:
        # Deliberately best-effort: any failure on one file is reported as
        # data so that a recursive scan can carry on with the next file.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }
|
||||
|
||||
def _print_field(field):
    """Print the report lines for one field entry produced by the extractor."""
    # Clean up the type string (e.g. '/Tx' -> 'Tx').
    f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'

    val = field['value']
    display_val = f"'{val}'" if val else "<empty>"

    status = []
    if field['required']:
        status.append("REQ")
    if field['read_only']:
        status.append("RO")
    status_str = f"[{', '.join(status)}]" if status else ""

    print(f" [{f_type:<3}] {field['name']}")
    print(f" Value: {display_val}")

    if status_str:
        print(f" Flags: {status_str}")
    if field.get('tooltip'):
        print(f" Tip: {field['tooltip']}")


def main():
    """Scan the current directory tree for PDFs and print their form fields.

    Every regular file whose name ends in '.pdf' (any letter case) is passed
    to ``extract_form_fields``. Files with no fields, and files that produced
    an error result, are skipped without output. A summary line with the
    number of PDFs that had form fields is printed at the end.
    """
    # Keep regular files only (a directory may be named 'something.pdf'),
    # match the extension case-insensitively, and sort so that the report
    # order does not depend on filesystem traversal order.
    pdf_files = sorted(
        path
        for path in Path('.').rglob('*')
        if path.name.lower().endswith('.pdf') and path.is_file()
    )
    found_count = 0

    print(f"{'SCANNING FOR FORM FIELDS':=^60}")

    for pdf_file in pdf_files:
        result = extract_form_fields(pdf_file)

        if not result:
            continue

        if 'error' in result:
            # Unreadable or encrypted files are intentionally left out of
            # the report; only PDFs with extractable fields are listed.
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f" Total Fields: {result['field_count']}")
        print("-" * 60)

        for field in result['fields']:
            _print_field(field)
            # NOTE(review): the original indentation was lost, so it is
            # unclear whether this separator follows every field or only the
            # last one of a file -- confirm the intended layout.
            print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")
|
||||
|
||||
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue