#!/usr/bin/env python3
"""Extract all annotations from PDFs recursively."""
|
|
import fitz
|
|
from pathlib import Path
|
|
|
|
def extract_annotations(pdf_path):
    """Collect every annotation found in a single PDF.

    Args:
        pdf_path: Location of the PDF (``str`` or ``pathlib.Path``); it is
            passed straight to ``fitz.open``.

    Returns:
        ``None`` if the document contains no annotations.

        Otherwise a dict with the keys ``file``, ``annotation_count`` and
        ``annotations``. Each entry of ``annotations`` is a dict with
        ``page`` (1-based), ``type``, ``content``, ``subject``, ``title``,
        ``created``, ``modified`` and ``rect`` (a list built from
        ``annot.rect``).

        If any ``Exception`` is raised while opening or reading the file, a
        dict with ``file`` and ``error`` (the exception text) is returned
        instead of propagating the exception.
    """
    annotations = []
    try:
        # The context manager closes the document even when a page or an
        # annotation raises part-way through, so the handle is never leaked.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    # Read the info mapping once per annotation rather than
                    # once per field.
                    info = annot.info
                    annotations.append({
                        'page': page_num,
                        'type': annot.type[1] if annot.type else 'Unknown',
                        # `or ''` also covers a key that is present but None.
                        'content': (info.get('content') or '').strip(),
                        'subject': (info.get('subject') or '').strip(),
                        'title': (info.get('title') or '').strip(),
                        'created': info.get('creationDate', ''),
                        'modified': info.get('modDate', ''),
                        'rect': list(annot.rect),
                    })
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not abort a recursive
        # scan, so the failure is handed back to the caller as data.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }

    if not annotations:
        return None

    return {
        'file': str(pdf_path),
        'annotation_count': len(annotations),
        'annotations': annotations,
    }
|
|
|
|
def main():
    """Scan the current directory tree for PDFs and print their annotations.

    Every ``*.pdf`` found by ``Path('.').rglob`` is passed to
    ``extract_annotations``. Files without annotations are skipped. Files
    that could not be scanned are reported on stderr, so the report on
    stdout contains annotation data only. The closing line states how many
    PDFs had at least one annotation.
    """
    import sys  # local import: only needed for the stderr diagnostics

    found_count = 0

    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = extract_annotations(pdf_file)

        # None means the PDF opened fine but holds no annotations.
        if not result:
            continue

        if 'error' in result:
            # Surface the failure instead of dropping it silently; stderr
            # keeps the stdout report unchanged and can be redirected.
            print(
                f"Error scanning {result['file']}: {result['error']}",
                file=sys.stderr,
            )
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f"  Total Annotations: {result['annotation_count']}")
        print("-" * 60)

        for annot in result['annotations']:
            header = f"[Page {annot['page']} | {annot['type']}]"
            print(f"  {header}")

            if annot['content']:
                print(f"    Content: {annot['content']}")

            if annot['title']:
                print(f"    Author:  {annot['title']}")

            # A subject that merely repeats the content adds nothing.
            if annot['subject'] and annot['subject'] != annot['content']:
                print(f"    Subject: {annot['subject']}")

            if annot['modified']:
                print(f"    Date:    {annot['modified']}")

            print("")

        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")
|
|
|
|
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|