# pdf-forensics/extract_annots.py

#!/usr/bin/env python3
"""
Recursively find PDF files under the current directory and print the
annotations they contain.
"""
import fitz
from pathlib import Path

def extract_annotations(pdf_path):
    """Collect the annotations of a single PDF.

    Args:
        pdf_path: Location of the PDF to open; passed to ``fitz.open`` and
            echoed back (via ``str``) in the result.

    Returns:
        * ``None`` when the document was read and holds no annotations.
        * ``{'file', 'annotation_count', 'annotations'}`` when at least one
          annotation was found. Each entry of ``'annotations'`` is a dict
          with the keys ``page`` (1-based), ``type``, ``content``,
          ``subject``, ``title``, ``created``, ``modified`` and ``rect``.
        * ``{'file', 'error'}`` when opening or reading raised; ``'error'``
          is the exception message. Nothing is raised to the caller.
    """
    annotations = []

    try:
        # The context manager closes the document even when a page or an
        # annotation raises part-way through the scan.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    # Read the info mapping once per annotation rather than
                    # once per field.
                    info = annot.info
                    annot_type = annot.type[1] if annot.type else 'Unknown'

                    annotations.append({
                        'page': page_num,
                        'type': annot_type,
                        # `or ''` keeps .strip() safe if a key is present
                        # but holds None.
                        'content': (info.get('content') or '').strip(),
                        'subject': (info.get('subject') or '').strip(),
                        'title': (info.get('title') or '').strip(),
                        'created': info.get('creationDate', ''),
                        'modified': info.get('modDate', ''),
                        'rect': list(annot.rect),
                    })
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable file must not
        # abort a batch scan, so the failure is returned as data.
        return {
            'file': str(pdf_path),
            'error': str(e)
        }

    if not annotations:
        return None

    return {
        'file': str(pdf_path),
        'annotation_count': len(annotations),
        'annotations': annotations
    }

def main(root='.'):
    """Scan ``root`` recursively for PDFs and print their annotations.

    Every file whose name ends in ``.pdf`` (compared case-insensitively) is
    passed to ``extract_annotations``. Files with annotations are reported on
    stdout; files that could not be scanned are reported on stderr so the
    stdout report stays unchanged.

    Args:
        root: Directory to start the recursive search from. Defaults to the
            current working directory.
    """
    import sys  # local import: only needed here for stderr reporting

    # Compare the extension case-insensitively so names such as REPORT.PDF
    # are not skipped where glob matching is case-sensitive.
    pdf_files = (
        path for path in Path(root).rglob('*')
        if path.name.lower().endswith('.pdf')
    )
    found_count = 0
    error_count = 0

    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")

    for pdf_file in pdf_files:
        result = extract_annotations(pdf_file)

        # None means the file was read and has no annotations.
        if not result:
            continue

        if 'error' in result:
            error_count += 1
            print(
                f"Error scanning {result['file']}: {result['error']}",
                file=sys.stderr,
            )
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f"  Total Annotations: {result['annotation_count']}")
        print("-" * 60)

        for annot in result['annotations']:
            header = f"[Page {annot['page']} | {annot['type']}]"
            print(f"  {header}")

            if annot['content']:
                print(f"    Content: {annot['content']}")

            # The annotation's 'title' field is shown under the label
            # "Author".
            if annot['title']:
                print(f"    Author:  {annot['title']}")

            # Skip the subject when it merely repeats the content.
            if annot['subject'] and annot['subject'] != annot['content']:
                print(f"    Subject: {annot['subject']}")

            if annot['modified']:
                print(f"    Date:    {annot['modified']}")

            print("")

        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")

    if error_count:
        print(f"{error_count} PDF(s) could not be scanned.", file=sys.stderr)

# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()