From cc682741d5e5f8b9f57736d5592a19d72ce57ce8 Mon Sep 17 00:00:00 2001
From: libroot
Date: Sat, 17 Jan 2026 20:18:15 +0000
Subject: [PATCH] Add scripts

---
 README                    |   1 +
 extract_annots.py         |  88 +++++++++++++++++++++++++
 extract_embedded_files.sh |  45 +++++++++++++
 extract_form_fields.py    | 103 ++++++++++++++++++++++++++++++
 extract_layers.py         |  69 ++++++++++++++++++++
 extract_version_diffs.sh  | 131 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 437 insertions(+)
 create mode 100644 README
 create mode 100644 extract_annots.py
 create mode 100644 extract_embedded_files.sh
 create mode 100644 extract_form_fields.py
 create mode 100644 extract_layers.py
 create mode 100644 extract_version_diffs.sh

diff --git a/README b/README
new file mode 100644
index 0000000..a2b974d
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+Scripts for PDF forensics
diff --git a/extract_annots.py b/extract_annots.py
new file mode 100644
index 0000000..72ea619
--- /dev/null
+++ b/extract_annots.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+extract all annotations from PDFs recursively
+"""
+import fitz
+from pathlib import Path
+
+def extract_annotations(pdf_path):
+    try:
+        doc = fitz.open(pdf_path)
+        annotations = []
+
+        for page_num, page in enumerate(doc, start=1):
+            for annot in page.annots():
+                annot_type = annot.type[1] if annot.type else 'Unknown'
+
+                annot_data = {
+                    'page': page_num,
+                    'type': annot_type,
+                    'content': annot.info.get('content', '').strip(),
+                    'subject': annot.info.get('subject', '').strip(),
+                    'title': annot.info.get('title', '').strip(),
+                    'created': annot.info.get('creationDate', ''),
+                    'modified': annot.info.get('modDate', ''),
+                    'rect': list(annot.rect),
+                }
+
+                annotations.append(annot_data)
+
+        doc.close()  # NOTE(review): skipped if iteration raises; doc then closes only at GC
+
+        if annotations:
+            return {
+                'file': str(pdf_path),
+                'annotation_count': len(annotations),
+                'annotations': annotations
+            }
+        return None
+
+    except Exception as e:
+        return {
+            'file': str(pdf_path),
+            'error': str(e)
+        }
+
+def main():
+    pdf_files = Path('.').rglob('*.pdf')
+    found_count = 0
+
+    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")
+
+    for pdf_file in pdf_files:
+        result = extract_annotations(pdf_file)
+
+        if result:
+            if 'error' in result:
+                # print(f"Error scanning {result['file']}: {result['error']}")
+                continue
+
+            found_count += 1
+            print(f"\nFile: {result['file']}")
+            print(f"  Total Annotations: {result['annotation_count']}")
+            print("-" * 60)
+
+            for annot in result['annotations']:
+                header = f"[Page {annot['page']} | {annot['type']}]"
+                print(f"  {header}")
+
+                if annot['content']:
+                    print(f"    Content: {annot['content']}")
+
+                if annot['title']:
+                    print(f"    Author: {annot['title']}")
+
+                if annot['subject'] and annot['subject'] != annot['content']:
+                    print(f"    Subject: {annot['subject']}")
+
+                if annot['modified']:
+                    print(f"    Date: {annot['modified']}")
+
+                print("")
+
+            print("-" * 60)
+
+    print(f"\nExtraction complete. Found {found_count} PDFs with annotations.")
+
+if __name__ == '__main__':
+    main()
diff --git a/extract_embedded_files.sh b/extract_embedded_files.sh
new file mode 100644
index 0000000..6c17751
--- /dev/null
+++ b/extract_embedded_files.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# extracts embedded files from all PDFs recursively to a $OUTPUT_DIR dir
+#
+# deps: pdfdetach (poppler utils)
+
+OUTPUT_DIR="./extracted_attachments"
+LOG_FILE="attachment_extraction.log"
+
+mkdir -p "$OUTPUT_DIR"
+> "$LOG_FILE"
+
+echo "----------------------------------------"
+echo "Starting attachment extraction scan..."
+echo "Log: $LOG_FILE"
+echo "Output directory: $OUTPUT_DIR"
+echo "----------------------------------------"
+
+find . -type f -iname "*.pdf" | while read -r PDF_FILE; do
+    REL_PATH="${PDF_FILE#./}"
+    SAFE_NAME=$(echo "$REL_PATH" | tr '/' '_' | sed 's/\.pdf$//')
+    FILE_OUTPUT_DIR="$OUTPUT_DIR/$SAFE_NAME"
+
+    echo "Checking: $PDF_FILE" | tee -a "$LOG_FILE"
+
+    ATTACHMENT_LIST=$(pdfdetach -list "$PDF_FILE" 2>/dev/null)
+
+    if [ $? -eq 0 ] && [ -n "$ATTACHMENT_LIST" ]; then
+        ATTACH_COUNT=$(echo "$ATTACHMENT_LIST" | tail -n +2 | wc -l)
+
+        if [ "$ATTACH_COUNT" -gt 0 ]; then
+            echo "  -> Found $ATTACH_COUNT attachment(s)" | tee -a "$LOG_FILE"
+
+            mkdir -p "$FILE_OUTPUT_DIR"
+
+            pdfdetach -saveall "$PDF_FILE" -o "$FILE_OUTPUT_DIR" 2>&1 | tee -a "$LOG_FILE"
+
+            echo "--- Attachments in: $PDF_FILE ---" >> "$LOG_FILE"
+            echo "$ATTACHMENT_LIST" >> "$LOG_FILE"
+            echo "" >> "$LOG_FILE"
+        fi
+    fi
+done
+
+echo "----------------------------------------"
+echo "Extraction complete. Check $LOG_FILE for details."
diff --git a/extract_form_fields.py b/extract_form_fields.py
new file mode 100644
index 0000000..cbb97bf
--- /dev/null
+++ b/extract_form_fields.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+extract form field data from PDFs recursively
+"""
+from pypdf import PdfReader
+from pathlib import Path
+
+def extract_form_fields(pdf_path):
+    try:
+        reader = PdfReader(pdf_path)
+
+        if reader.is_encrypted:
+            return {'file': str(pdf_path), 'error': 'File is encrypted'}
+
+        fields = reader.get_fields()
+
+        if fields:
+            form_data = []
+
+            for field_name, field_info in fields.items():
+                if not field_info:
+                    continue
+
+                field_entry = {
+                    'name': field_name,
+                    'value': field_info.get('/V', ''),
+                    'default_value': field_info.get('/DV', ''),
+                    'type': field_info.get('/FT', ''),
+                    'flags': field_info.get('/Ff', 0),
+                    'read_only': bool(field_info.get('/Ff', 0) & 1),
+                    'required': bool(field_info.get('/Ff', 0) & 2),
+                }
+
+                # prefer the partial /T name (note: this drops the fully-qualified key)
+                if '/T' in field_info:
+                    field_entry['name'] = field_info['/T']
+
+                # tooltip/description
+                if '/TU' in field_info:
+                    field_entry['tooltip'] = field_info['/TU']
+
+                form_data.append(field_entry)
+
+            return {
+                'file': str(pdf_path),
+                'field_count': len(form_data),
+                'fields': form_data
+            }
+
+        return None
+
+    except Exception as e:
+        return {
+            'file': str(pdf_path),
+            'error': str(e)
+        }
+
+def main():
+    pdf_files = Path('.').rglob('*.pdf')
+    found_count = 0
+
+    print(f"{'SCANNING FOR FORM FIELDS':=^60}")
+
+    for pdf_file in pdf_files:
+        result = extract_form_fields(pdf_file)
+
+        if result:
+            if 'error' in result:
+                # print(f"\nFile: {result['file']}")
+                # print(f"  [!] Error: {result['error']}")
+                continue
+
+            found_count += 1
+            print(f"\nFile: {result['file']}")
+            print(f"  Total Fields: {result['field_count']}")
+            print("-" * 60)
+
+            for field in result['fields']:
+                # clean up type string (e.g. '/Tx' -> 'Tx')
+                f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'
+
+                val = field['value']
+                display_val = f"'{val}'" if val else ""
+
+                status = []
+                if field['required']: status.append("REQ")
+                if field['read_only']: status.append("RO")
+                status_str = f"[{', '.join(status)}]" if status else ""
+
+                print(f"  [{f_type:<3}] {field['name']}")
+                print(f"       Value: {display_val}")
+
+                if status_str:
+                    print(f"       Flags: {status_str}")
+                if field.get('tooltip'):
+                    print(f"       Tip: {field['tooltip']}")
+
+            print("-" * 60)
+
+    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")
+
+if __name__ == '__main__':
+    main()
diff --git a/extract_layers.py b/extract_layers.py
new file mode 100644
index 0000000..8bcc7a5
--- /dev/null
+++ b/extract_layers.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+analyze Optional Content Groups (Layers) in PDFs recursively
+"""
+import fitz
+from pathlib import Path
+
+def analyze_ocg(pdf_path):
+    try:
+        doc = fitz.open(pdf_path)
+        ocgs = []
+
+        if doc.is_pdf and hasattr(doc, 'get_ocgs'):
+            ocg_list = doc.get_ocgs()
+            if ocg_list:
+                for ocg in ocg_list.values():  # get_ocgs() maps xref -> info dict
+                    ocgs.append({
+                        'name': ocg.get('name', 'Unknown'),
+                        'on': ocg.get('on', None),
+                        'intent': ocg.get('intent', []),
+                        'usage': ocg.get('usage', {})
+                    })
+
+        doc.close()
+
+        if ocgs:
+            return {
+                'file': str(pdf_path),
+                'ocg_count': len(ocgs),
+                'layers': ocgs
+            }
+        return None
+
+    except Exception as e:
+        return {
+            'file': str(pdf_path),
+            'error': str(e)
+        }
+
+def main():
+    pdf_files = Path('.').rglob('*.pdf')
+    found_count = 0
+
+    print(f"{'SCANNING FOR LAYERS':=^60}")
+
+    for pdf_file in pdf_files:
+        result = analyze_ocg(pdf_file)
+
+        if result:
+            found_count += 1
+            print(f"\nFile: {result['file']}")
+
+            if 'error' in result:
+                print(f"  [!] Error: {result['error']}")
+                print("-" * 60)
+                continue
+
+            print(f"  Layer Count: {result['ocg_count']}")
+            print("  Layers:")
+
+            for layer in result['layers']:
+                state = "ON " if layer['on'] else "OFF"
+                print(f"    [{state}] {layer['name']}")
+            print("-" * 60)
+
+    print(f"\nAnalysis complete. Found {found_count} PDFs with layers or errors.")
+
+if __name__ == '__main__':
+    main()
diff --git a/extract_version_diffs.sh b/extract_version_diffs.sh
new file mode 100644
index 0000000..4e41751
--- /dev/null
+++ b/extract_version_diffs.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+#
+# note that this tool uses pdfresurrect -w, which creates version directories
+# and extracts version files. to avoid modifying your directory structure, it's
+# a good idea to run this script in a copy of your project directory.
+#
+# deps: qpdf, pdfresurrect, GNU grep (uses grep -P)
+
+LOG_FILE="./resurrection_log.txt"
+FAILURE_LOG="./resurrection_failures.log"
+
+echo "--- Starting PDF resurrection scan ---" | tee "$LOG_FILE"
+echo "Diffs of modified objects will be logged directly to: $LOG_FILE" | tee -a "$LOG_FILE"
+echo "--------------------------------------" | tee -a "$LOG_FILE"
+
+# reset failure log
+> "$FAILURE_LOG"
+
+find . -type f -iname "*.pdf" -not -iname "*_text.pdf" -not -path "*/*-versions/*" | while read -r FULL_PATH; do
+
+    FILE_DIR=$(dirname "$FULL_PATH")
+    FILE_NAME=$(basename "$FULL_PATH")
+    BASE_NAME="${FILE_NAME%.*}"  # filename w/o ext
+
+    echo "Checking: $FULL_PATH"
+
+    (
+        cd "$FILE_DIR" || { echo "ERROR: Cannot access $FILE_DIR" >> "$FAILURE_LOG"; exit 1; }
+
+        VERSION_COUNT_OUTPUT=$(pdfresurrect -q "$FILE_NAME" 2>/dev/null)
+        EXIT_CODE=$?
+
+        if [ "$EXIT_CODE" -ne 0 ]; then
+            echo "Failed to run pdfresurrect -q on $FULL_PATH (Exit $EXIT_CODE)." >> "$FAILURE_LOG"
+            exit 1
+        fi
+
+        VERSION_COUNT=$(echo "$VERSION_COUNT_OUTPUT" | grep -oP ': \K\d+' | tail -n 1)
+
+        if [ "${VERSION_COUNT:-1}" -le 1 ]; then
+            echo "  -> No history found (${VERSION_COUNT:-1} version(s))."
+            exit 0
+        fi
+
+        echo "  -> History detected ($VERSION_COUNT versions). Extracting..."
+
+        pdfresurrect -w "$FILE_NAME" > /dev/null 2>&1
+
+        SUMMARY_FILE="$BASE_NAME-versions/$BASE_NAME-versions.summary"
+
+        if [ ! -f "$SUMMARY_FILE" ]; then
+            echo "  -> CRITICAL ERROR: Summary file not created at $SUMMARY_FILE, skipping diff."
+            exit 1
+        fi
+
+        RAW_MODIFIED_OBJECTS=$(awk '/: --[DM]--/ { print $4, $7 }' "$SUMMARY_FILE" | sort -u)
+
+        echo "  -> Raw D/M object list (V OBJ):"
+        echo "$RAW_MODIFIED_OBJECTS" | while read -r line; do
+            [ -n "$line" ] && echo "     - $line"
+        done
+
+        while IFS= read -r line ; do
+            CURRENT_VERSION=$(echo "$line" | awk '{print $1}')
+            OBJ_ID=$(echo "$line" | awk '{print $2}')
+
+            if [ -z "$CURRENT_VERSION" ] || [ -z "$OBJ_ID" ]; then
+                [ -n "$line" ] && echo "  -> WARNING: Parsed empty version/object pair from line: \"$line\". Skipping."
+                continue
+            fi
+
+            if ! [[ "$CURRENT_VERSION" =~ ^[0-9]+$ ]] || ! [[ "$OBJ_ID" =~ ^[0-9]+$ ]]; then
+                echo "  -> ERROR: Parsed invalid numbers ($CURRENT_VERSION : $OBJ_ID) from summary. Skipping."
+                continue
+            fi
+
+            if [ "$CURRENT_VERSION" -eq 1 ]; then
+                echo "  -> Skipping Object $OBJ_ID: Current version is V1, no V0 exists to diff against."
+                continue
+            fi
+
+            PREV_VERSION=$((CURRENT_VERSION - 1))
+
+            V1_FILE="$BASE_NAME-versions/$BASE_NAME-version-$PREV_VERSION.pdf"
+            V2_FILE="$BASE_NAME-versions/$BASE_NAME-version-$CURRENT_VERSION.pdf"
+
+            TEMP_V1=$(mktemp)
+            TEMP_V2=$(mktemp)
+
+            qpdf --show-object="$OBJ_ID" --unfilter -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
+            qpdf --show-object="$OBJ_ID" --unfilter -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
+
+            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
+                if [ ! -s "$TEMP_V1" ]; then
+                    qpdf --show-object="$OBJ_ID" -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
+                fi
+
+                if [ ! -s "$TEMP_V2" ]; then
+                    qpdf --show-object="$OBJ_ID" -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
+                fi
+            fi
+            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
+                echo "  -> NOTE: Object $OBJ_ID (V$PREV_VERSION or V$CURRENT_VERSION) extraction failed or produced empty output. Skipping diff."
+                rm -f "$TEMP_V1" "$TEMP_V2"
+                continue
+            fi
+
+            if diff -u "$TEMP_V1" "$TEMP_V2" >/dev/null ; then
+                echo "  -> Object $OBJ_ID: No difference detected (V$PREV_VERSION vs V$CURRENT_VERSION)."
+            else
+                echo -e "\n\n--- DIFF START: $FULL_PATH | Object $OBJ_ID (V$PREV_VERSION vs V$CURRENT_VERSION) ---"
+                echo "Note: Output extracted as raw PDF content for reliable comparison."
+                diff -u "$TEMP_V1" "$TEMP_V2"
+                echo -e "--- DIFF END: Object $OBJ_ID ---\n"
+                DIFF_FOUND=1
+            fi
+
+
+            rm -f "$TEMP_V1" "$TEMP_V2"
+        done < <(echo "$RAW_MODIFIED_OBJECTS")
+
+
+    ) | tee -a "$LOG_FILE"
+
+    if [ ${PIPESTATUS[0]} -ne 0 ]; then
+        echo "Failed to process $FULL_PATH (See $FAILURE_LOG and $LOG_FILE)"
+    fi
done
+echo "----------------------------------------------------" | tee -a "$LOG_FILE"
+echo "Scan complete. Check $LOG_FILE for inline object diffs." | tee -a "$LOG_FILE"
+echo "Check $FAILURE_LOG for files that failed processing." | tee -a "$LOG_FILE"