From cc682741d5e5f8b9f57736d5592a19d72ce57ce8 Mon Sep 17 00:00:00 2001
From: libroot
Date: Sat, 17 Jan 2026 20:18:15 +0000
Subject: [PATCH] Add scripts

---
 README                    |   1 +
 extract_annots.py         |  88 +++++++++++++++++++++++++
 extract_embedded_files.sh |  45 +++++++++++++
 extract_form_fields.py    | 103 ++++++++++++++++++++++++++++++
 extract_layers.py         |  69 ++++++++++++++++++++
 extract_version_diffs.sh  | 131 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 437 insertions(+)
 create mode 100644 README
 create mode 100644 extract_annots.py
 create mode 100644 extract_embedded_files.sh
 create mode 100644 extract_form_fields.py
 create mode 100644 extract_layers.py
 create mode 100644 extract_version_diffs.sh

diff --git a/README b/README
new file mode 100644
index 0000000..a2b974d
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+Scripts for PDF forensics
diff --git a/extract_annots.py b/extract_annots.py
new file mode 100644
index 0000000..72ea619
--- /dev/null
+++ b/extract_annots.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+extract all annotations from PDFs recursively
+"""
+import fitz
+from pathlib import Path
+
+def extract_annotations(pdf_path):
+    try:
+        doc = fitz.open(pdf_path)
+        annotations = []
+
+        for page_num, page in enumerate(doc, start=1):
+            for annot in page.annots():
+                annot_type = annot.type[1] if annot.type else 'Unknown'
+
+                annot_data = {
+                    'page': page_num,
+                    'type': annot_type,
+                    'content': annot.info.get('content', '').strip(),
+                    'subject': annot.info.get('subject', '').strip(),
+                    'title': annot.info.get('title', '').strip(),
+                    'created': annot.info.get('creationDate', ''),
+                    'modified': annot.info.get('modDate', ''),
+                    'rect': list(annot.rect),
+                }
+
+                annotations.append(annot_data)
+
+        doc.close()  # NOTE(review): skipped if iteration raises; doc then closes only at GC
+
+        if annotations:
+            return {
+                'file': str(pdf_path),
+                'annotation_count': len(annotations),
+                'annotations': annotations
+            }
+        return None
+
+    except Exception as e:
+        return {
+            'file': str(pdf_path),
+            'error': str(e)
+        }
+
+def main():
+    pdf_files = Path('.').rglob('*.pdf')
+    found_count = 0
+
+    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")
+
+    for pdf_file in pdf_files:
+        result = extract_annotations(pdf_file)
+
+        if result:
+            if 'error' in result:
+                # print(f"Error scanning {result['file']}: {result['error']}")
+                continue
+
+            found_count += 1
+            print(f"\nFile: {result['file']}")
+            print(f"  Total Annotations: {result['annotation_count']}")
+            print("-" * 60)
+
+            for annot in result['annotations']:
+                header = f"[Page {annot['page']} | {annot['type']}]"
+                print(f"  {header}")
+
+                if annot['content']:
+                    print(f"    Content: {annot['content']}")
+
+                if annot['title']:
+                    print(f"    Author: {annot['title']}")
+
+                if annot['subject'] and annot['subject'] != annot['content']:
+                    print(f"    Subject: {annot['subject']}")
+
+                if annot['modified']:
+                    print(f"    Date: {annot['modified']}")
+
+                print("")
+
+            print("-" * 60)
+
+    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")
+
+if __name__ == '__main__':
+    main()
diff --git a/extract_embedded_files.sh b/extract_embedded_files.sh
new file mode 100644
index 0000000..6c17751
--- /dev/null
+++ b/extract_embedded_files.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# extracts embedded files from all PDFs recursively to a $OUTPUT_DIR dir
+#
+# deps: pdfdetach (poppler utils)
+
+OUTPUT_DIR="./extracted_attachments"
+LOG_FILE="attachment_extraction.log"
+
+mkdir -p "$OUTPUT_DIR"
+> "$LOG_FILE"
+
+echo "----------------------------------------"
+echo "Starting attachment extraction scan..."
+echo "Log: $LOG_FILE"
+echo "Output directory: $OUTPUT_DIR"
+echo "----------------------------------------"
+
+find . -type f -iname "*.pdf" | while read -r PDF_FILE; do
+    REL_PATH="${PDF_FILE#./}"
+    SAFE_NAME=$(echo "$REL_PATH" | tr '/' '_' | sed 's/\.pdf$//')
+    FILE_OUTPUT_DIR="$OUTPUT_DIR/$SAFE_NAME"
+
+    echo "Checking: $PDF_FILE" | tee -a "$LOG_FILE"
+
+    ATTACHMENT_LIST=$(pdfdetach -list "$PDF_FILE" 2>/dev/null)
+
+    if [ $? -eq 0 ] && [ -n "$ATTACHMENT_LIST" ]; then
+        ATTACH_COUNT=$(echo "$ATTACHMENT_LIST" | tail -n +2 | wc -l)
+
+        if [ "$ATTACH_COUNT" -gt 0 ]; then
+            echo "  -> Found $ATTACH_COUNT attachment(s)" | tee -a "$LOG_FILE"
+
+            mkdir -p "$FILE_OUTPUT_DIR"
+
+            pdfdetach -saveall "$PDF_FILE" -o "$FILE_OUTPUT_DIR" 2>&1 | tee -a "$LOG_FILE"
+
+            echo "--- Attachments in: $PDF_FILE ---" >> "$LOG_FILE"
+            echo "$ATTACHMENT_LIST" >> "$LOG_FILE"
+            echo "" >> "$LOG_FILE"
+        fi
+    fi
+done
+
+echo "----------------------------------------"
+echo "Extraction complete. Check $LOG_FILE for details."
diff --git a/extract_form_fields.py b/extract_form_fields.py
new file mode 100644
index 0000000..cbb97bf
--- /dev/null
+++ b/extract_form_fields.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+extract form field data from PDFs recursively
+"""
+from pypdf import PdfReader
+from pathlib import Path
+
+def extract_form_fields(pdf_path):
+    try:
+        reader = PdfReader(pdf_path)
+
+        if reader.is_encrypted:
+            return {'file': str(pdf_path), 'error': 'File is encrypted'}
+
+        fields = reader.get_fields()
+
+        if fields:
+            form_data = []
+
+            for field_name, field_info in fields.items():
+                if not field_info:
+                    continue
+
+                field_entry = {
+                    'name': field_name,
+                    'value': field_info.get('/V', ''),
+                    'default_value': field_info.get('/DV', ''),
+                    'type': field_info.get('/FT', ''),
+                    'flags': field_info.get('/Ff', 0),
+                    'read_only': bool(field_info.get('/Ff', 0) & 1),
+                    'required': bool(field_info.get('/Ff', 0) & 2),
+                }
+
+                # prefer the partial /T name (note: this drops the fully-qualified key)
+                if '/T' in field_info:
+                    field_entry['name'] = field_info['/T']
+
+                # tooltip/description
+                if '/TU' in field_info:
+                    field_entry['tooltip'] = field_info['/TU']
+
+                form_data.append(field_entry)
+
+            return {
+                'file': str(pdf_path),
+                'field_count': len(form_data),
+                'fields': form_data
+            }
+
+        return None
+
+    except Exception as e:
+        return {
+            'file': str(pdf_path),
+            'error': str(e)
+        }
+
+def main():
+    pdf_files = Path('.').rglob('*.pdf')
+    found_count = 0
+
+    print(f"{'SCANNING FOR FORM FIELDS':=^60}")
+
+    for pdf_file in pdf_files:
+        result = extract_form_fields(pdf_file)
+
+        if result:
+            if 'error' in result:
+                # print(f"\nFile: {result['file']}")
+                # print(f"  [!] Error: {result['error']}")
+                continue
+
+            found_count += 1
+            print(f"\nFile: {result['file']}")
+            print(f"  Total Fields: {result['field_count']}")
+            print("-" * 60)
+
+            for field in result['fields']:
+                # clean up type string (e.g. '/Tx' -> 'Tx')
+                f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'
+
+                val = field['value']
+                display_val = f"'{val}'" if val else ""
+
+                status = []
+                if field['required']: status.append("REQ")
+                if field['read_only']: status.append("RO")
+                status_str = f"[{', '.join(status)}]" if status else ""
+
+                print(f"  [{f_type:<3}] {field['name']}")
+                print(f"       Value: {display_val}")
+
+                if status_str:
+                    print(f"       Flags: {status_str}")
+                if field.get('tooltip'):
+                    print(f"       Tip: {field['tooltip']}")
+
+            print("-" * 60)
+
+    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")
+
+if __name__ == '__main__':
+    main()
diff --git a/extract_layers.py b/extract_layers.py
new file mode 100644
index 0000000..8bcc7a5
--- /dev/null
+++ b/extract_layers.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+analyze Optional Content Groups (Layers) in PDFs recursively
+"""
+import fitz
+from pathlib import Path
+
+def analyze_ocg(pdf_path):
+    try:
+        doc = fitz.open(pdf_path)
+        ocgs = []
+
+        if doc.is_pdf and hasattr(doc, 'get_ocgs'):
+            ocg_list = doc.get_ocgs()
+            if ocg_list:
+                for ocg in ocg_list.values():  # get_ocgs() maps xref -> info dict
+                    ocgs.append({
+                        'name': ocg.get('name', 'Unknown'),
+                        'on': ocg.get('on', None),
+                        'intent': ocg.get('intent', []),
+                        'usage': ocg.get('usage', {})
+                    })
+
+        doc.close()
+
+        if ocgs:
+            return {
+                'file': str(pdf_path),
+                'ocg_count': len(ocgs),
+                'layers': ocgs
+            }
+        return None
+
+    except Exception as e:
+        return {
+            'file': str(pdf_path),
+            'error': str(e)
+        }
+
+def main():
+    pdf_files = Path('.').rglob('*.pdf')
+    found_count = 0
+
+    print(f"{'SCANNING FOR LAYERS':=^60}")
+
+    for pdf_file in pdf_files:
+        result = analyze_ocg(pdf_file)
+
+        if result:
+            found_count += 1
+            print(f"\nFile: {result['file']}")
+
+            if 'error' in result:
+                print(f"  [!] Error: {result['error']}")
+                print("-" * 60)
+                continue
+
+            print(f"  Layer Count: {result['ocg_count']}")
+            print("  Layers:")
+
+            for layer in result['layers']:
+                state = "ON " if layer['on'] else "OFF"
+                print(f"    [{state}] {layer['name']}")
+            print("-" * 60)
+
+    print(f"\nAnalysis complete. Found {found_count} PDFs with layers or errors.")
+
+if __name__ == '__main__':
+    main()
diff --git a/extract_version_diffs.sh b/extract_version_diffs.sh
new file mode 100644
index 0000000..4e41751
--- /dev/null
+++ b/extract_version_diffs.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+#
+# note that this tool uses pdfresurrect -w, which creates version directories
+# and extracts version files. to avoid modifying your directory structure, it's
+# a good idea to run this script in a copy of your project directory.
+#
+# deps: qpdf, pdfresurrect, GNU grep (uses grep -P)
+
+LOG_FILE="./resurrection_log.txt"
+FAILURE_LOG="./resurrection_failures.log"
+
+echo "--- Starting PDF resurrection scan ---" | tee "$LOG_FILE"
+echo "Diffs of modified objects will be logged directly to: $LOG_FILE" | tee -a "$LOG_FILE"
+echo "--------------------------------------" | tee -a "$LOG_FILE"
+
+# reset failure log
+> "$FAILURE_LOG"
+
+find . -type f -iname "*.pdf" -not -iname "*_text.pdf" -not -path "*/*-versions/*" | while read -r FULL_PATH; do
+
+    FILE_DIR=$(dirname "$FULL_PATH")
+    FILE_NAME=$(basename "$FULL_PATH")
+    BASE_NAME="${FILE_NAME%.*}"  # filename w/o ext
+
+    echo "Checking: $FULL_PATH"
+
+    (
+        cd "$FILE_DIR" || { echo "ERROR: Cannot access $FILE_DIR" >> "$FAILURE_LOG"; exit 1; }
+
+        VERSION_COUNT_OUTPUT=$(pdfresurrect -q "$FILE_NAME" 2>/dev/null)
+        EXIT_CODE=$?
+
+        if [ "$EXIT_CODE" -ne 0 ]; then
+            echo "Failed to run pdfresurrect -q on $FULL_PATH (Exit $EXIT_CODE)." >> "$FAILURE_LOG"
+            exit 1
+        fi
+
+        VERSION_COUNT=$(echo "$VERSION_COUNT_OUTPUT" | grep -oP ': \K\d+' | tail -n 1)
+
+        if [ "${VERSION_COUNT:-1}" -le 1 ]; then
+            echo "  -> No history found (${VERSION_COUNT:-1} version(s))."
+            exit 0
+        fi
+
+        echo "  -> History detected ($VERSION_COUNT versions). Extracting..."
+
+        pdfresurrect -w "$FILE_NAME" > /dev/null 2>&1
+
+        SUMMARY_FILE="$BASE_NAME-versions/$BASE_NAME-versions.summary"
+
+        if [ ! -f "$SUMMARY_FILE" ]; then
+            echo "  -> CRITICAL ERROR: Summary file not created at $SUMMARY_FILE, skipping diff."
+            exit 1
+        fi
+
+        RAW_MODIFIED_OBJECTS=$(awk '/: --[DM]--/ { print $4, $7 }' "$SUMMARY_FILE" | sort -u)
+
+        echo "  -> Raw D/M object list (V OBJ):"
+        echo "$RAW_MODIFIED_OBJECTS" | while read -r line; do
+            [ -n "$line" ] && echo "     - $line"
+        done
+
+        while IFS= read -r line ; do
+            CURRENT_VERSION=$(echo "$line" | awk '{print $1}')
+            OBJ_ID=$(echo "$line" | awk '{print $2}')
+
+            if [ -z "$CURRENT_VERSION" ] || [ -z "$OBJ_ID" ]; then
+                [ -n "$line" ] && echo "  -> WARNING: Parsed empty version/object pair from line: \"$line\". Skipping."
+                continue
+            fi
+
+            if ! [[ "$CURRENT_VERSION" =~ ^[0-9]+$ ]] || ! [[ "$OBJ_ID" =~ ^[0-9]+$ ]]; then
+                echo "  -> ERROR: Parsed invalid numbers ($CURRENT_VERSION : $OBJ_ID) from summary. Skipping."
+                continue
+            fi
+
+            if [ "$CURRENT_VERSION" -eq 1 ]; then
+                echo "  -> Skipping Object $OBJ_ID: Current version is V1, no V0 exists to diff against."
+                continue
+            fi
+
+            PREV_VERSION=$((CURRENT_VERSION - 1))
+
+            V1_FILE="$BASE_NAME-versions/$BASE_NAME-version-$PREV_VERSION.pdf"
+            V2_FILE="$BASE_NAME-versions/$BASE_NAME-version-$CURRENT_VERSION.pdf"
+
+            TEMP_V1=$(mktemp)
+            TEMP_V2=$(mktemp)
+
+            qpdf --show-object="$OBJ_ID" --unfilter -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
+            qpdf --show-object="$OBJ_ID" --unfilter -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
+
+            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
+                if [ ! -s "$TEMP_V1" ]; then
+                    qpdf --show-object="$OBJ_ID" -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
+                fi
+
+                if [ ! -s "$TEMP_V2" ]; then
+                    qpdf --show-object="$OBJ_ID" -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
+                fi
+            fi
+            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
+                echo "  -> NOTE: Object $OBJ_ID (V$PREV_VERSION or V$CURRENT_VERSION) extraction failed or produced empty output. Skipping diff."
+                rm -f "$TEMP_V1" "$TEMP_V2"
+                continue
+            fi
+
+            if diff -u "$TEMP_V1" "$TEMP_V2" >/dev/null ; then
+                echo "  -> Object $OBJ_ID: No difference detected (V$PREV_VERSION vs V$CURRENT_VERSION)."
+            else
+                echo -e "\n\n--- DIFF START: $FULL_PATH | Object $OBJ_ID (V$PREV_VERSION vs V$CURRENT_VERSION) ---"
+                echo "Note: Output extracted as raw PDF content for reliable comparison."
+                diff -u "$TEMP_V1" "$TEMP_V2"
+                echo -e "--- DIFF END: Object $OBJ_ID ---\n"
+                DIFF_FOUND=1
+            fi
+
+
+            rm -f "$TEMP_V1" "$TEMP_V2"
+        done < <(echo "$RAW_MODIFIED_OBJECTS")
+
+
+    ) | tee -a "$LOG_FILE"
+
+    if [ ${PIPESTATUS[0]} -ne 0 ]; then
+        echo "Failed to process $FULL_PATH (See $FAILURE_LOG and $LOG_FILE)"
+    fi
done
+echo "----------------------------------------------------" | tee -a "$LOG_FILE"
+echo "Scan complete. Check $LOG_FILE for inline object diffs." | tee -a "$LOG_FILE"
+echo "Check $FAILURE_LOG for files that failed processing." | tee -a "$LOG_FILE"