Add scripts
This commit is contained in:
commit
cc682741d5
6 changed files with 437 additions and 0 deletions
88
extract_annots.py
Normal file
88
extract_annots.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract all annotations from PDF files found recursively under the current directory.
|
||||
"""
|
||||
import fitz
|
||||
from pathlib import Path
|
||||
|
||||
def _annotation_record(page_num, annot):
    """Build the plain-dict record for one annotation on page *page_num*."""
    # Read the info mapping once instead of re-fetching it for every field.
    info = annot.info
    return {
        'page': page_num,
        # annot.type is indexed at [1] for the type label; fall back to
        # 'Unknown' when the type is missing/empty.
        'type': annot.type[1] if annot.type else 'Unknown',
        'content': info.get('content', '').strip(),
        'subject': info.get('subject', '').strip(),
        'title': info.get('title', '').strip(),
        'created': info.get('creationDate', ''),
        'modified': info.get('modDate', ''),
        'rect': list(annot.rect),
    }


def extract_annotations(pdf_path):
    """Collect every annotation found in a single PDF.

    Args:
        pdf_path: Location of the PDF; passed to ``fitz.open`` and rendered
            with ``str()`` in the result.

    Returns:
        ``None`` when the document was read successfully but contains no
        annotations. Otherwise a dict with the keys ``file``,
        ``annotation_count`` and ``annotations`` (a list of per-annotation
        dicts with ``page``, ``type``, ``content``, ``subject``, ``title``,
        ``created``, ``modified`` and ``rect``; pages are numbered from 1).
        If anything goes wrong while opening or reading the document, a
        dict with the keys ``file`` and ``error`` (the exception message)
        is returned instead of raising.
    """
    try:
        annotations = []

        # The context manager closes the document even if reading a page or
        # an annotation raises part-way through the scan.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    annotations.append(_annotation_record(page_num, annot))

        if not annotations:
            return None

        return {
            'file': str(pdf_path),
            'annotation_count': len(annotations),
            'annotations': annotations,
        }

    except Exception as e:
        # Deliberate catch-all boundary: one unreadable PDF must not abort a
        # recursive scan, so the failure is handed back to the caller as data.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }
|
||||
|
||||
def main():
    """Scan the current directory tree for PDFs and print their annotations.

    Every ``*.pdf`` found recursively below ``.`` is passed to
    ``extract_annotations``. PDFs without annotations are skipped silently.
    PDFs that could not be read are reported on stderr, so the report on
    stdout is unaffected. A summary line with the number of annotated PDFs
    is printed at the end.
    """
    # Local import: only needed here for reporting failures on stderr.
    import sys

    found_count = 0

    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = extract_annotations(pdf_file)

        # None means the PDF was readable but carries no annotations.
        if not result:
            continue

        if 'error' in result:
            # Surface the failure instead of dropping it, but keep going so
            # one broken file does not stop the scan.
            print(
                f"Error scanning {result['file']}: {result['error']}",
                file=sys.stderr,
            )
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f" Total Annotations: {result['annotation_count']}")
        print("-" * 60)

        for annot in result['annotations']:
            header = f"[Page {annot['page']} | {annot['type']}]"
            print(f" {header}")

            if annot['content']:
                print(f" Content: {annot['content']}")

            if annot['title']:
                print(f" Author: {annot['title']}")

            # Skip the subject when it merely repeats the content.
            if annot['subject'] and annot['subject'] != annot['content']:
                print(f" Subject: {annot['subject']}")

            if annot['modified']:
                print(f" Date: {annot['modified']}")

            print("")

        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")
|
||||
|
||||
# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue