Add scripts

2026-01-17 20:18:15 +00:00 · 2026-01-17 20:18:15 +00:00 · cc682741d5
commit cc682741d5
6 changed files with 437 additions and 0 deletions
--- a/1
+++ b/1
@ -0,0 +1 @@
 Scripts for PDF forensics
--- a/extract_annots.py
+++ b/extract_annots.py
@ -0,0 +1,88 @@
 #!/usr/bin/env python3
 """
 extract all annotations from PDFs recursively
 """
 import fitz
 from pathlib import Path
 def extract_annotations(pdf_path):
    """Collect the annotations of one PDF into plain Python data.

    Args:
        pdf_path: Location of the PDF; passed to ``fitz.open`` and
            stringified for the 'file' key of the result.

    Returns:
        ``None`` when the document opens but carries no annotations.
        Otherwise a dict: on success it has 'file', 'annotation_count'
        and 'annotations' (one dict per annotation with page number,
        type, content, subject, title, dates and rectangle); on any
        exception it has 'file' and 'error' (the exception text).
    """
    try:
        annotations = []
        # Opening through a context manager closes the document even when
        # reading a page or annotation raises part-way through the scan.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    info = annot.info
                    annotations.append({
                        'page': page_num,
                        'type': annot.type[1] if annot.type else 'Unknown',
                        'content': info.get('content', '').strip(),
                        'subject': info.get('subject', '').strip(),
                        'title': info.get('title', '').strip(),
                        'created': info.get('creationDate', ''),
                        'modified': info.get('modDate', ''),
                        'rect': list(annot.rect),
                    })
        if not annotations:
            return None
        return {
            'file': str(pdf_path),
            'annotation_count': len(annotations),
            'annotations': annotations,
        }
    except Exception as e:
        # Best-effort scan: hand the failure back to the caller as data so
        # one broken file does not abort a recursive run.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }
 def main():
    """Walk the current directory tree and print a report of PDF annotations.

    PDFs without annotations, and PDFs whose scan returned an error, are
    left out of the report and of the final count.
    """
    divider = "-" * 60
    hits = 0
    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")
    for path in Path('.').rglob('*.pdf'):
        report = extract_annotations(path)
        # Files that failed to scan are skipped silently, exactly like
        # files that simply have no annotations.
        if not report or 'error' in report:
            continue
        hits += 1
        print(f"\nFile: {report['file']}")
        print(f"  Total Annotations: {report['annotation_count']}")
        print(divider)
        for entry in report['annotations']:
            content = entry['content']
            subject = entry['subject']
            print(f"  [Page {entry['page']} | {entry['type']}]")
            if content:
                print(f"    Content: {content}")
            if entry['title']:
                print(f"    Author:  {entry['title']}")
            # The subject is only shown when it differs from the content.
            if subject and subject != content:
                print(f"    Subject: {subject}")
            if entry['modified']:
                print(f"    Date:    {entry['modified']}")
            print("")
        print(divider)
    print(f"\nExtraction complete. Found {hits} PDFs with annotations.")
 if __name__ == '__main__':
    main()
--- a/extract_embedded_files.sh
+++ b/extract_embedded_files.sh
@ -0,0 +1,45 @@
 #!/bin/bash
 # Extracts embedded files (attachments) from all PDFs found recursively under
 # the current directory. Each PDF with attachments gets its own subdirectory
 # inside $OUTPUT_DIR, named after the PDF's relative path.
 #
 # deps: pdfdetach (poppler utils)
 OUTPUT_DIR="./extracted_attachments"
 LOG_FILE="attachment_extraction.log"
 mkdir -p "$OUTPUT_DIR"
 # start every run with an empty log
 > "$LOG_FILE"
 echo "----------------------------------------"
 echo "Starting attachment extraction scan..."
 echo "Log: $LOG_FILE"
 echo "Output directory: $OUTPUT_DIR"
 echo "----------------------------------------"
 # $OUTPUT_DIR lives under the scanned tree, so it is excluded from the search:
 # otherwise PDFs extracted by this run (or an earlier one) would be rescanned
 # depending on traversal order, making results differ between runs.
 find . -type f -iname "*.pdf" -not -path "$OUTPUT_DIR/*" | while IFS= read -r PDF_FILE; do
    REL_PATH="${PDF_FILE#./}"
    # Flatten the relative path into one directory name. The extension is
    # stripped in any letter case because the search above uses -iname.
    SAFE_NAME=$(printf '%s\n' "$REL_PATH" | tr '/' '_' | sed 's/\.[pP][dD][fF]$//')
    FILE_OUTPUT_DIR="$OUTPUT_DIR/$SAFE_NAME"
    echo "Checking: $PDF_FILE" | tee -a "$LOG_FILE"
    ATTACHMENT_LIST=$(pdfdetach -list "$PDF_FILE" 2>/dev/null)
    if [ $? -eq 0 ] && [ -n "$ATTACHMENT_LIST" ]; then
        # the first line of the listing is skipped; the rest is counted as attachments
        ATTACH_COUNT=$(printf '%s\n' "$ATTACHMENT_LIST" | tail -n +2 | wc -l)
        if [ "$ATTACH_COUNT" -gt 0 ]; then
            echo "  -> Found $ATTACH_COUNT attachment(s)" | tee -a "$LOG_FILE"
            mkdir -p "$FILE_OUTPUT_DIR"
            pdfdetach -saveall "$PDF_FILE" -o "$FILE_OUTPUT_DIR" 2>&1 | tee -a "$LOG_FILE"
            echo "--- Attachments in: $PDF_FILE ---" >> "$LOG_FILE"
            printf '%s\n' "$ATTACHMENT_LIST" >> "$LOG_FILE"
            echo "" >> "$LOG_FILE"
        fi
    fi
 done
 echo "----------------------------------------"
 echo "Extraction complete. Check $LOG_FILE for details."
--- a/extract_form_fields.py
+++ b/extract_form_fields.py
@ -0,0 +1,103 @@
 #!/usr/bin/env python3
 """
 extract form field data from PDFs recursively
 """
 from pypdf import PdfReader
 from pathlib import Path
 def extract_form_fields(pdf_path):
    """Read the interactive form fields of one PDF.

    Args:
        pdf_path: Location of the PDF; handed to ``PdfReader`` and
            stringified for the 'file' key of the result.

    Returns:
        ``None`` when the reader reports no fields. Otherwise a dict:
        on success it has 'file', 'field_count' and 'fields'; for an
        encrypted file or any exception it has 'file' and 'error'.
    """
    try:
        reader = PdfReader(pdf_path)
        if reader.is_encrypted:
            return {'file': str(pdf_path), 'error': 'File is encrypted'}
        fields = reader.get_fields()
        if not fields:
            return None
        collected = []
        for key, info in fields.items():
            if not info:
                continue
            # /Ff is the field-flags integer: bit 1 is tested for
            # read-only and bit 2 for required.
            flags = info.get('/Ff', 0)
            entry = {
                'name': key,
                'value': info.get('/V', ''),
                'default_value': info.get('/DV', ''),
                'type': info.get('/FT', ''),
                'flags': flags,
                'read_only': bool(flags & 1),
                'required': bool(flags & 2),
            }
            # the field's own /T entry, when present, replaces the key
            # under which the reader listed it
            if '/T' in info:
                entry['name'] = info['/T']
            # /TU is surfaced as the tooltip/description
            if '/TU' in info:
                entry['tooltip'] = info['/TU']
            collected.append(entry)
        return {
            'file': str(pdf_path),
            'field_count': len(collected),
            'fields': collected
        }
    except Exception as exc:
        return {
            'file': str(pdf_path),
            'error': str(exc)
        }
 def main():
    """Walk the current directory tree and print a report of PDF form fields.

    PDFs without fields, and PDFs whose scan returned an error (including
    encrypted files), are left out of the report and of the final count.
    """
    divider = "-" * 60
    hits = 0
    print(f"{'SCANNING FOR FORM FIELDS':=^60}")
    for path in Path('.').rglob('*.pdf'):
        report = extract_form_fields(path)
        # Files that failed to scan are skipped silently, exactly like
        # files that simply have no form fields.
        if not report or 'error' in report:
            continue
        hits += 1
        print(f"\nFile: {report['file']}")
        print(f"  Total Fields: {report['field_count']}")
        print(divider)
        for field in report['fields']:
            # clean up type string (e.g. '/Tx' -> 'Tx')
            if field['type']:
                kind = str(field['type']).replace('/', '')
            else:
                kind = 'UNK'
            value = field['value']
            shown = f"'{value}'" if value else "<empty>"
            markers = [
                label
                for label, key in (("REQ", 'required'), ("RO", 'read_only'))
                if field[key]
            ]
            print(f"  [{kind:<3}] {field['name']}")
            print(f"        Value: {shown}")
            if markers:
                print(f"        Flags: [{', '.join(markers)}]")
            if field.get('tooltip'):
                print(f"        Tip:   {field['tooltip']}")
        print(divider)
    print(f"\nExtraction complete. Found {hits} PDFs with form fields.")
 if __name__ == '__main__':
    main()
--- a/extract_layers.py
+++ b/extract_layers.py
@ -0,0 +1,69 @@
 #!/usr/bin/env python3
 """
 analyze Optional Content Groups (Layers) in PDFs recursively
 """
 import fitz
 from pathlib import Path
 def analyze_ocg(pdf_path):
    """List the Optional Content Groups (layers) defined in one PDF.

    Args:
        pdf_path: Location of the document; passed to ``fitz.open`` and
            stringified for the 'file' key of the result.

    Returns:
        ``None`` when no layers are found. Otherwise a dict: on success
        it has 'file', 'ocg_count' and 'layers' (one dict per layer with
        'name', 'on', 'intent' and 'usage'); on any exception it has
        'file' and 'error' (the exception text).
    """
    try:
        ocgs = []
        # The context manager closes the document even if reading the
        # layer table raises.
        with fitz.open(pdf_path) as doc:
            if doc.is_pdf and hasattr(doc, 'get_ocgs'):
                ocg_map = doc.get_ocgs()
                if ocg_map:
                    # get_ocgs() returns a dict keyed by xref, so iterating
                    # it directly yields integers rather than layer records.
                    # Read the values; a plain sequence is still accepted.
                    if isinstance(ocg_map, dict):
                        entries = ocg_map.values()
                    else:
                        entries = ocg_map
                    ocgs = [
                        {
                            'name': ocg.get('name', 'Unknown'),
                            'on': ocg.get('on', None),
                            'intent': ocg.get('intent', []),
                            'usage': ocg.get('usage', {}),
                        }
                        for ocg in entries
                    ]
        if not ocgs:
            return None
        return {
            'file': str(pdf_path),
            'ocg_count': len(ocgs),
            'layers': ocgs,
        }
    except Exception as e:
        # Best-effort scan: hand the failure back to the caller as data.
        return {
            'file': str(pdf_path),
            'error': str(e),
        }
 def main():
    """Walk the current directory tree and print a report of PDF layers.

    Both PDFs with layers and PDFs whose analysis returned an error are
    printed and included in the final count; PDFs without layers are not.
    """
    divider = "-" * 60
    hits = 0
    print(f"{'SCANNING FOR LAYERS':=^60}")
    for path in Path('.').rglob('*.pdf'):
        report = analyze_ocg(path)
        if not report:
            continue
        # errors count as findings here, hence the wording of the summary line
        hits += 1
        print(f"\nFile: {report['file']}")
        if 'error' in report:
            print(f"  [!] Error: {report['error']}")
            print(divider)
            continue
        print(f"  Layer Count: {report['ocg_count']}")
        print("  Layers:")
        for layer in report['layers']:
            # a missing/None 'on' value is displayed as OFF
            if layer['on']:
                state = "ON "
            else:
                state = "OFF"
            print(f"    [{state}] {layer['name']}")
        print(divider)
    print(f"\nAnalysis complete. Found {hits} PDFs with layers or errors.")
 if __name__ == '__main__':
    main()
--- a/extract_version_diffs.sh
+++ b/extract_version_diffs.sh
@ -0,0 +1,131 @@
 #!/bin/bash
 #
 # Scans PDFs recursively for multiple stored versions and logs unified diffs of
 # the objects marked --D-- or --M-- in the pdfresurrect summary, comparing each
 # affected version against the one before it.
 #
 # note that this tool uses pdfresurrect -w, which creates version directories
 # and extracts version files. to avoid modifying your directory structure, it's
 # a good idea to run this script in a copy of your project directory.
 #
 # deps: qpdf, pdfresurrect, a grep that supports -P (e.g. GNU grep)
 LOG_FILE="./resurrection_log.txt"
 # Absolute path on purpose: the per-file subshell below changes directory
 # before appending to this log, so a relative path would scatter failure logs
 # across the scanned directories.
 FAILURE_LOG="$PWD/resurrection_failures.log"
 echo "--- Starting PDF resurrection scan ---" | tee "$LOG_FILE"
 echo "Diffs of modified objects will be logged directly to: $LOG_FILE" | tee -a "$LOG_FILE"
 echo "--------------------------------------" | tee -a "$LOG_FILE"
 # reset failure log
 > "$FAILURE_LOG"
 # *_text.pdf files and anything inside a *-versions directory (the output of
 # pdfresurrect -w) are excluded from the scan.
 find . -type f -iname "*.pdf" -not -iname "*_text.pdf" -not -path "*/*-versions/*" | while IFS= read -r FULL_PATH; do
    FILE_DIR=$(dirname "$FULL_PATH")
    FILE_NAME=$(basename "$FULL_PATH")
    BASE_NAME="${FILE_NAME%.*}" # filename w/o ext
    echo "Checking: $FULL_PATH"
    # Each file is handled in a subshell so that cd and exit stay local to it;
    # everything the subshell prints is appended to $LOG_FILE by tee.
    (
        cd "$FILE_DIR" || { echo "ERROR: Cannot access $FILE_DIR" >> "$FAILURE_LOG"; exit 1; }
        VERSION_COUNT_OUTPUT=$(pdfresurrect -q "$FILE_NAME" 2>/dev/null)
        EXIT_CODE=$?
        if [ "$EXIT_CODE" -ne 0 ]; then
            echo "Failed to run pdfresurrect -q on $FULL_PATH (Exit $EXIT_CODE)." >> "$FAILURE_LOG"
            exit 1
        fi
        # take the last number that follows ": " in the pdfresurrect -q output
        VERSION_COUNT=$(echo "$VERSION_COUNT_OUTPUT" | grep -oP ': \K\d+' | tail -n 1)
        # Without this guard an empty count makes the numeric test below error
        # out and the file would wrongly be treated as having history.
        if ! [[ "$VERSION_COUNT" =~ ^[0-9]+$ ]]; then
            echo "Could not parse a version count for $FULL_PATH from pdfresurrect -q output." >> "$FAILURE_LOG"
            exit 1
        fi
        if [ "$VERSION_COUNT" -le 1 ]; then
            echo "  -> No history found ($VERSION_COUNT version(s))."
            exit 0
        fi
        echo "  -> History detected ($VERSION_COUNT versions). Extracting..."
        pdfresurrect -w "$FILE_NAME" > /dev/null 2>&1
        SUMMARY_FILE="$BASE_NAME-versions/$BASE_NAME-versions.summary"
        if [ ! -f "$SUMMARY_FILE" ]; then
            echo "  -> CRITICAL ERROR: Summary file not created at $SUMMARY_FILE, skipping diff."
            exit 1
        fi
        # fields 4 and 7 of the matching summary lines are used as "version object-id"
        RAW_MODIFIED_OBJECTS=$(awk '/: --[DM]--/ { print $4, $7 }' "$SUMMARY_FILE" | sort -u)
        echo "  -> Raw D/M object list (V OBJ):"
        echo "$RAW_MODIFIED_OBJECTS" | while IFS= read -r line; do
            [ -n "$line" ] && echo "     - $line"
        done
        while IFS= read -r line ; do
            CURRENT_VERSION=$(echo "$line" | awk '{print $1}')
            OBJ_ID=$(echo "$line" | awk '{print $2}')
            if [ -z "$CURRENT_VERSION" ] || [ -z "$OBJ_ID" ]; then
                [ -n "$line" ] && echo "   -> WARNING: Parsed empty version/object pair from line: \"$line\". Skipping."
                continue
            fi
            if ! [[ "$CURRENT_VERSION" =~ ^[0-9]+$ ]] || ! [[ "$OBJ_ID" =~ ^[0-9]+$ ]]; then
                echo "   -> ERROR: Parsed invalid numbers ($CURRENT_VERSION : $OBJ_ID) from summary. Skipping."
                continue
            fi
            if [ "$CURRENT_VERSION" -eq 1 ]; then
                echo "   -> Skipping Object $OBJ_ID: Current version is V1, no V0 exists to diff against."
                continue
            fi
            PREV_VERSION=$((CURRENT_VERSION - 1))
            V1_FILE="$BASE_NAME-versions/$BASE_NAME-version-$PREV_VERSION.pdf"
            V2_FILE="$BASE_NAME-versions/$BASE_NAME-version-$CURRENT_VERSION.pdf"
            TEMP_V1=$(mktemp)
            TEMP_V2=$(mktemp)
            qpdf --show-object="$OBJ_ID" --unfilter -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
            qpdf --show-object="$OBJ_ID" --unfilter -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
            # if the --unfilter dump came back empty, retry without --unfilter
            if [ ! -s "$TEMP_V1" ]; then
                qpdf --show-object="$OBJ_ID" -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
            fi
            if [ ! -s "$TEMP_V2" ]; then
                qpdf --show-object="$OBJ_ID" -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
            fi
            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
                echo "   -> NOTE: Object $OBJ_ID (V$PREV_VERSION or V$CURRENT_VERSION) extraction failed or produced empty output. Skipping diff."
                rm -f "$TEMP_V1" "$TEMP_V2"
                continue
            fi
            if diff -u "$TEMP_V1" "$TEMP_V2" >/dev/null ; then
                echo "   -> Object $OBJ_ID: No difference detected (V$PREV_VERSION vs V$CURRENT_VERSION)."
            else
                # printf keeps the blank lines around the markers without
                # interpreting backslashes that may occur in the file path
                printf '\n\n--- DIFF START: %s | Object %s (V%s vs V%s) ---\n' "$FULL_PATH" "$OBJ_ID" "$PREV_VERSION" "$CURRENT_VERSION"
                echo "Note: Output extracted as raw PDF content for reliable comparison."
                diff -u "$TEMP_V1" "$TEMP_V2"
                printf -- '--- DIFF END: Object %s ---\n\n' "$OBJ_ID"
            fi
            rm -f "$TEMP_V1" "$TEMP_V2"
        done < <(echo "$RAW_MODIFIED_OBJECTS")
    ) | tee -a "$LOG_FILE"
    # PIPESTATUS[0] is the subshell's exit status, not tee's
    if [ ${PIPESTATUS[0]} -ne 0 ]; then
        echo "Failed to process $FULL_PATH (See $FAILURE_LOG and $LOG_FILE)"
    fi
 done
 echo "----------------------------------------------------" | tee -a "$LOG_FILE"
 echo "Scan complete. Check $LOG_FILE for inline object diffs." | tee -a "$LOG_FILE"
 echo "Check $FAILURE_LOG for files that failed processing." | tee -a "$LOG_FILE"