#!/usr/bin/env python3
"""Extract all annotations from PDFs found recursively under the current directory.

Each PDF is opened with PyMuPDF (``fitz``); every annotation on every page is
collected and a human-readable report is printed to stdout. Files that cannot
be processed are reported on stderr and do not abort the scan.
"""

import sys
from pathlib import Path

import fitz


def extract_annotations(pdf_path):
    """Collect the annotations of a single PDF.

    Args:
        pdf_path: Path (``str`` or ``Path``) of the PDF to open.

    Returns:
        * ``None`` when the document opens fine but has no annotations.
        * ``{'file', 'annotation_count', 'annotations'}`` when at least one
          annotation was found; ``annotations`` is a list of dicts with the
          keys ``page`` (1-based), ``type``, ``content``, ``subject``,
          ``title``, ``created``, ``modified`` and ``rect``.
        * ``{'file', 'error'}`` when opening or reading the document raised.
    """
    annotations = []
    try:
        # The context manager closes the document even if a page or an
        # annotation raises part-way through the scan.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    # Read the info dict once instead of once per field.
                    info = annot.info
                    annotations.append({
                        'page': page_num,
                        'type': annot.type[1] if annot.type else 'Unknown',
                        # "or ''" guards against a key present with a None value.
                        'content': (info.get('content') or '').strip(),
                        'subject': (info.get('subject') or '').strip(),
                        'title': (info.get('title') or '').strip(),
                        'created': info.get('creationDate', ''),
                        'modified': info.get('modDate', ''),
                        'rect': list(annot.rect),
                    })
    except Exception as e:
        # Per-file boundary: one unreadable PDF must not stop the whole scan,
        # so the failure is handed back to the caller as data.
        return {
            'file': str(pdf_path),
            'error': str(e)
        }

    if not annotations:
        return None
    return {
        'file': str(pdf_path),
        'annotation_count': len(annotations),
        'annotations': annotations
    }


def _print_report(result):
    """Print the annotation report for one successfully scanned PDF."""
    print(f"\nFile: {result['file']}")
    print(f" Total Annotations: {result['annotation_count']}")
    print("-" * 60)
    for annot in result['annotations']:
        header = f"[Page {annot['page']} | {annot['type']}]"
        print(f" {header}")
        if annot['content']:
            print(f" Content: {annot['content']}")
        if annot['title']:
            print(f" Author: {annot['title']}")
        # The subject is only shown when it adds something beyond the content.
        if annot['subject'] and annot['subject'] != annot['content']:
            print(f" Subject: {annot['subject']}")
        if annot['modified']:
            print(f" Date: {annot['modified']}")
        print("")
    print("-" * 60)


def main():
    """Scan the current directory tree for PDFs and print their annotations.

    The report goes to stdout; per-file failures and the failure count go to
    stderr so that the report itself stays clean.
    """
    found_count = 0
    error_count = 0

    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        # Skip directories or dangling links that merely match the pattern.
        if not pdf_file.is_file():
            continue

        result = extract_annotations(pdf_file)
        if not result:
            continue

        if 'error' in result:
            error_count += 1
            print(
                f"Error scanning {result['file']}: {result['error']}",
                file=sys.stderr,
            )
            continue

        found_count += 1
        _print_report(result)

    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")
    if error_count:
        print(f"{error_count} PDFs could not be scanned.", file=sys.stderr)


if __name__ == '__main__':
    main()