pdf-forensics/extract_annots.py
2026-01-17 20:18:15 +00:00

88 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""
Extract all annotations from every PDF found recursively under the current directory.
"""
import sys
from pathlib import Path

import fitz
def _annot_to_dict(page_num, annot):
    """Flatten one annotation into a plain dict of the fields we report."""
    # Read the info mapping once instead of once per field.
    info = annot.info

    def text(key):
        # Tolerate a missing or None entry rather than failing the whole file.
        return (info.get(key) or '').strip()

    return {
        'page': page_num,
        'type': annot.type[1] if annot.type else 'Unknown',
        'content': text('content'),
        'subject': text('subject'),
        'title': text('title'),
        'created': info.get('creationDate', ''),
        'modified': info.get('modDate', ''),
        'rect': list(annot.rect),
    }


def extract_annotations(pdf_path):
    """Collect every annotation found in a single PDF.

    Args:
        pdf_path: Location of the PDF to open (anything ``fitz.open`` and
            ``str()`` accept, e.g. a ``str`` or ``pathlib.Path``).

    Returns:
        ``None`` when the document contains no annotations.  Otherwise a dict
        with the keys ``file``, ``annotation_count`` and ``annotations`` (a
        list of per-annotation dicts with 1-based ``page`` numbers).  If
        opening or reading the document fails, a dict with the keys ``file``
        and ``error`` is returned instead; this function does not raise.
    """
    try:
        annotations = []
        # The context manager guarantees the document is closed even when a
        # page or annotation raises part-way through the scan.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    annotations.append(_annot_to_dict(page_num, annot))
    except Exception as e:
        # Deliberately broad: a damaged file must not abort a batch scan, so
        # the failure is handed back to the caller as data.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }

    if not annotations:
        return None
    return {
        'file': str(pdf_path),
        'annotation_count': len(annotations),
        'annotations': annotations,
    }
def main():
    """Scan the current directory tree for PDFs and print their annotations.

    Every path matching ``*.pdf`` under ``.`` is passed to
    ``extract_annotations``.  Files without annotations are skipped.  Files
    that could not be scanned are reported on stderr, so a failed read is not
    mistaken for a document that simply has no annotations.  The annotation
    report itself is written to stdout.
    """
    found_count = 0
    error_count = 0
    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = extract_annotations(pdf_file)
        if not result:
            # No annotations in this file: nothing to report.
            continue
        if 'error' in result:
            # Keep failures out of the stdout report but make them visible.
            error_count += 1
            print(
                f"Error scanning {result['file']}: {result['error']}",
                file=sys.stderr,
            )
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f" Total Annotations: {result['annotation_count']}")
        print("-" * 60)
        for annot in result['annotations']:
            header = f"[Page {annot['page']} | {annot['type']}]"
            print(f" {header}")
            if annot['content']:
                print(f" Content: {annot['content']}")
            if annot['title']:
                print(f" Author: {annot['title']}")
            # The subject is only shown when it adds something beyond content.
            if annot['subject'] and annot['subject'] != annot['content']:
                print(f" Subject: {annot['subject']}")
            if annot['modified']:
                print(f" Date: {annot['modified']}")
            # NOTE(review): source indentation was lost; this blank line is
            # assumed to separate individual annotations - confirm.
            print("")
        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")
    if error_count:
        print(f"{error_count} PDF(s) could not be scanned.", file=sys.stderr)
# Run the scan only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()