Add scripts
This commit is contained in:
commit
cc682741d5
6 changed files with 437 additions and 0 deletions
1
README
Normal file
1
README
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
Scripts for PDF forensics
|
||||||
88
extract_annots.py
Normal file
88
extract_annots.py
Normal file
|
|
@ -0,0 +1,88 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
extract all annotations from PDFs recursively
|
||||||
|
"""
|
||||||
|
import fitz
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def extract_annotations(pdf_path):
    """Extract every annotation from a single PDF.

    Parameters:
        pdf_path: path to the PDF (str or pathlib.Path).

    Returns:
        dict with 'file', 'annotation_count' and 'annotations' when the
        PDF contains at least one annotation; dict with 'file' and
        'error' when the file cannot be processed; None when the PDF
        opens cleanly but has no annotations.
    """
    try:
        # Context manager guarantees the document is closed even if an
        # annotation raises mid-iteration (the original leaked the handle
        # on any exception between open() and close()).
        with fitz.open(pdf_path) as doc:
            annotations = []

            for page_num, page in enumerate(doc, start=1):
                for annot in page.annots():
                    # annot.type is (code, name, ...); index 1 is the readable name.
                    annot_type = annot.type[1] if annot.type else 'Unknown'

                    annotations.append({
                        'page': page_num,
                        'type': annot_type,
                        'content': annot.info.get('content', '').strip(),
                        'subject': annot.info.get('subject', '').strip(),
                        # 'title' conventionally holds the annotation author.
                        'title': annot.info.get('title', '').strip(),
                        'created': annot.info.get('creationDate', ''),
                        'modified': annot.info.get('modDate', ''),
                        'rect': list(annot.rect),
                    })

        if annotations:
            return {
                'file': str(pdf_path),
                'annotation_count': len(annotations),
                'annotations': annotations
            }
        return None

    except Exception as e:
        # Best-effort scan: report the failure per-file instead of aborting
        # the whole directory walk.
        return {
            'file': str(pdf_path),
            'error': str(e)
        }
|
||||||
|
|
||||||
|
def main():
    """Walk the current directory tree and print every annotated PDF found."""
    hits = 0

    print(f"{'SCANNING FOR ANNOTATIONS':=^60}")

    for pdf_path in Path('.').rglob('*.pdf'):
        report = extract_annotations(pdf_path)
        if not report:
            continue
        if 'error' in report:
            # Unreadable files are skipped silently; uncomment to surface them.
            # print(f"Error scanning {report['file']}: {report['error']}")
            continue

        hits += 1
        print(f"\nFile: {report['file']}")
        print(f" Total Annotations: {report['annotation_count']}")
        print("-" * 60)

        for entry in report['annotations']:
            print(f" [Page {entry['page']} | {entry['type']}]")

            if entry['content']:
                print(f" Content: {entry['content']}")
            if entry['title']:
                print(f" Author: {entry['title']}")
            if entry['subject'] and entry['subject'] != entry['content']:
                print(f" Subject: {entry['subject']}")
            if entry['modified']:
                print(f" Date: {entry['modified']}")
            print("")

        print("-" * 60)

    print(f"\nExtraction complete. Found {hits} PDFs with annotations.")


if __name__ == '__main__':
    main()
|
||||||
45
extract_embedded_files.sh
Normal file
45
extract_embedded_files.sh
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
#!/bin/bash
# extracts embedded files from all PDFs recursively to a $OUTPUT_DIR dir
#
# deps: pdfdetach (poppler utils)

OUTPUT_DIR="./extracted_attachments"
LOG_FILE="attachment_extraction.log"

mkdir -p "$OUTPUT_DIR"
# truncate any previous log
> "$LOG_FILE"

echo "----------------------------------------"
echo "Starting attachment extraction scan..."
echo "Log: $LOG_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "----------------------------------------"

# -print0 / read -d '' keeps filenames with spaces or newlines intact.
find . -type f -iname "*.pdf" -print0 | while IFS= read -r -d '' PDF_FILE; do
    REL_PATH="${PDF_FILE#./}"
    # Flatten the relative path into a single safe directory name.
    SAFE_NAME=$(echo "$REL_PATH" | tr '/' '_' | sed 's/\.pdf$//')
    FILE_OUTPUT_DIR="$OUTPUT_DIR/$SAFE_NAME"

    echo "Checking: $PDF_FILE" | tee -a "$LOG_FILE"

    # Test the command's exit status directly instead of inspecting $?
    # after the assignment (fragile if lines are inserted in between).
    if ATTACHMENT_LIST=$(pdfdetach -list "$PDF_FILE" 2>/dev/null) && [ -n "$ATTACHMENT_LIST" ]; then
        # First line of -list output is a summary; the rest are attachments.
        ATTACH_COUNT=$(echo "$ATTACHMENT_LIST" | tail -n +2 | wc -l)

        if [ "$ATTACH_COUNT" -gt 0 ]; then
            echo " -> Found $ATTACH_COUNT attachment(s)" | tee -a "$LOG_FILE"

            mkdir -p "$FILE_OUTPUT_DIR"

            pdfdetach -saveall "$PDF_FILE" -o "$FILE_OUTPUT_DIR" 2>&1 | tee -a "$LOG_FILE"

            echo "--- Attachments in: $PDF_FILE ---" >> "$LOG_FILE"
            echo "$ATTACHMENT_LIST" >> "$LOG_FILE"
            echo "" >> "$LOG_FILE"
        fi
    fi
done

echo "----------------------------------------"
echo "Extraction complete. Check $LOG_FILE for details."
|
||||||
103
extract_form_fields.py
Normal file
103
extract_form_fields.py
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
extract form field data from PDFs recursively
|
||||||
|
"""
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def extract_form_fields(pdf_path):
    """Pull AcroForm field data out of one PDF.

    Returns a dict with 'file', 'field_count' and 'fields' when form
    fields exist, an error dict when the file cannot be read (or is
    encrypted), or None when the PDF has no form fields.
    """
    try:
        reader = PdfReader(pdf_path)

        if reader.is_encrypted:
            return {'file': str(pdf_path), 'error': 'File is encrypted'}

        fields = reader.get_fields()
        if not fields:
            return None

        form_data = []
        for field_name, field_info in fields.items():
            if not field_info:
                continue

            # Hoist the flag word; bit 1 = ReadOnly, bit 2 = Required.
            flags = field_info.get('/Ff', 0)

            entry = {
                # Prefer the field's own /T (alternate name) when present.
                'name': field_info['/T'] if '/T' in field_info else field_name,
                'value': field_info.get('/V', ''),
                'default_value': field_info.get('/DV', ''),
                'type': field_info.get('/FT', ''),
                'flags': flags,
                'read_only': bool(flags & 1),
                'required': bool(flags & 2),
            }

            # tooltip/description
            if '/TU' in field_info:
                entry['tooltip'] = field_info['/TU']

            form_data.append(entry)

        return {
            'file': str(pdf_path),
            'field_count': len(form_data),
            'fields': form_data
        }

    except Exception as e:
        return {
            'file': str(pdf_path),
            'error': str(e)
        }
|
||||||
|
|
||||||
|
def main():
    """Scan the working tree for PDFs containing form fields and print them."""
    found_count = 0

    print(f"{'SCANNING FOR FORM FIELDS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = extract_form_fields(pdf_file)
        if not result:
            continue
        if 'error' in result:
            # Errors are suppressed by default; uncomment to surface them.
            # print(f"\nFile: {result['file']}")
            # print(f" [!] Error: {result['error']}")
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")
        print(f" Total Fields: {result['field_count']}")
        print("-" * 60)

        for field in result['fields']:
            # clean up type string (e.g. '/Tx' -> 'Tx')
            f_type = str(field['type']).replace('/', '') if field['type'] else 'UNK'

            val = field['value']
            display_val = f"'{val}'" if val else "<empty>"

            status = []
            if field['required']:
                status.append("REQ")
            if field['read_only']:
                status.append("RO")
            status_str = f"[{', '.join(status)}]" if status else ""

            print(f" [{f_type:<3}] {field['name']}")
            print(f" Value: {display_val}")

            if status_str:
                print(f" Flags: {status_str}")
            if field.get('tooltip'):
                print(f" Tip: {field['tooltip']}")

        print("-" * 60)

    print(f"\nExtraction complete. Found {found_count} PDFs with form fields.")


if __name__ == '__main__':
    main()
|
||||||
69
extract_layers.py
Normal file
69
extract_layers.py
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
analyze Optional Content Groups (Layers) in PDFs recursively
|
||||||
|
"""
|
||||||
|
import fitz
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def analyze_ocg(pdf_path):
    """Collect Optional Content Group (layer) info from one PDF.

    Parameters:
        pdf_path: path to the PDF (str or pathlib.Path).

    Returns:
        dict with 'file', 'ocg_count' and 'layers' when layers exist,
        dict with 'file' and 'error' on failure, or None when the PDF
        defines no layers.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            ocgs = []

            if doc.is_pdf and hasattr(doc, 'get_ocgs'):
                # BUG FIX: get_ocgs() returns a dict keyed by xref
                # ({xref: {'name': ..., 'on': ..., ...}}), not a list.
                # Iterating the dict itself yields integer xrefs, so the
                # original `ocg.get(...)` raised AttributeError and every
                # layered PDF fell into the error path. Iterate values().
                ocg_map = doc.get_ocgs()
                if ocg_map:
                    for ocg in ocg_map.values():
                        ocgs.append({
                            'name': ocg.get('name', 'Unknown'),
                            'on': ocg.get('on', None),
                            'intent': ocg.get('intent', []),
                            'usage': ocg.get('usage', {})
                        })
        finally:
            # Close the handle even if layer extraction raises.
            doc.close()

        if ocgs:
            return {
                'file': str(pdf_path),
                'ocg_count': len(ocgs),
                'layers': ocgs
            }
        return None

    except Exception as e:
        return {
            'file': str(pdf_path),
            'error': str(e)
        }
|
||||||
|
|
||||||
|
def main():
    """Report every PDF under the current directory that declares layers."""
    found_count = 0

    print(f"{'SCANNING FOR LAYERS':=^60}")

    for pdf_file in Path('.').rglob('*.pdf'):
        result = analyze_ocg(pdf_file)
        if not result:
            continue

        found_count += 1
        print(f"\nFile: {result['file']}")

        if 'error' in result:
            print(f" [!] Error: {result['error']}")
            print("-" * 60)
            continue

        print(f" Layer Count: {result['ocg_count']}")
        print(" Layers:")

        for layer in result['layers']:
            state = "ON " if layer['on'] else "OFF"
            print(f" [{state}] {layer['name']}")
        print("-" * 60)

    print(f"\nAnalysis complete. Found {found_count} PDFs with layers or errors.")


if __name__ == '__main__':
    main()
|
||||||
131
extract_version_diffs.sh
Normal file
131
extract_version_diffs.sh
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
#!/bin/bash
#
# Diff object-level changes between the incremental-update versions hidden
# inside PDFs (edited PDFs often retain their previous revisions).
#
# note that this tool uses pdfresurrect -w, which creates version directories
# and extracts version files. to avoid modifying your directory structure, it's
# a good idea to run this script in a copy of your project directory.
#
# deps: qpdf, pdfresurrect

LOG_FILE="./resurrection_log.txt"
FAILURE_LOG="./resurrection_failures.log"

echo "--- Starting PDF resurrection scan ---" | tee "$LOG_FILE"
echo "Diffs of modified objects will be logged directly to: $LOG_FILE" | tee -a "$LOG_FILE"
echo "--------------------------------------" | tee -a "$LOG_FILE"

# reset failure log
> "$FAILURE_LOG"

# Skip files that look like extracted text copies and anything already inside
# a pdfresurrect "*-versions" output directory from a previous run.
find . -type f -iname "*.pdf" -not -iname "*_text.pdf" -not -path "*/*-versions/*" | while read -r FULL_PATH; do

    FILE_DIR=$(dirname "$FULL_PATH")
    FILE_NAME=$(basename "$FULL_PATH")
    BASE_NAME="${FILE_NAME%.*}" # filename w/o ext

    echo "Checking: $FULL_PATH"

    # Work inside a subshell cd'd to the PDF's directory so pdfresurrect's
    # "<name>-versions" output lands next to the file. The subshell's exit
    # status is recovered via PIPESTATUS[0] below (tee would otherwise mask it).
    (
        cd "$FILE_DIR" || { echo "ERROR: Cannot access $FILE_DIR" >> "$FAILURE_LOG"; exit 1; }

        # -q prints a short version-count line for the file.
        VERSION_COUNT_OUTPUT=$(pdfresurrect -q "$FILE_NAME" 2>/dev/null)
        EXIT_CODE=$?

        if [ "$EXIT_CODE" -ne 0 ]; then
            echo "Failed to run pdfresurrect -q on $FULL_PATH (Exit $EXIT_CODE)." >> "$FAILURE_LOG"
            exit 1
        fi

        # Take the number after the last ": " in the -q output.
        VERSION_COUNT=$(echo "$VERSION_COUNT_OUTPUT" | grep -oP ': \K\d+' | tail -n 1)

        if [ "$VERSION_COUNT" -le 1 ]; then
            echo " -> No history found ($VERSION_COUNT version(s))."
            exit 0
        fi

        echo " -> History detected ($VERSION_COUNT versions). Extracting..."

        # -w writes each historical version plus a .summary file into
        # "<BASE_NAME>-versions/".
        pdfresurrect -w "$FILE_NAME" > /dev/null 2>&1

        SUMMARY_FILE="$BASE_NAME-versions/$BASE_NAME-versions.summary"

        if [ ! -f "$SUMMARY_FILE" ]; then
            echo " -> CRITICAL ERROR: Summary file not created at $SUMMARY_FILE, skipping diff."
            exit 1
        fi

        # Summary lines flagged --D-- (deleted) or --M-- (modified); fields
        # 4 and 7 are taken as the version number and the object id.
        # NOTE(review): these column positions depend on pdfresurrect's
        # summary format — confirm against the installed version.
        RAW_MODIFIED_OBJECTS=$(awk '/: --[DM]--/ { print $4, $7 }' "$SUMMARY_FILE" | sort -u)

        echo " -> Raw D/M object list (V OBJ):"
        echo "$RAW_MODIFIED_OBJECTS" | while read -r line; do
            [ -n "$line" ] && echo " - $line"
        done

        # Each line is "<version> <object-id>".
        while IFS= read -r line ; do
            CURRENT_VERSION=$(echo "$line" | awk '{print $1}')
            OBJ_ID=$(echo "$line" | awk '{print $2}')

            if [ -z "$CURRENT_VERSION" ] || [ -z "$OBJ_ID" ]; then
                [ -n "$line" ] && echo " -> WARNING: Parsed empty version/object pair from line: \"$line\". Skipping."
                continue
            fi

            # Both fields must be purely numeric before the arithmetic below.
            if ! [[ "$CURRENT_VERSION" =~ ^[0-9]+$ ]] || ! [[ "$OBJ_ID" =~ ^[0-9]+$ ]]; then
                echo " -> ERROR: Parsed invalid numbers ($CURRENT_VERSION : $OBJ_ID) from summary. Skipping."
                continue
            fi

            if [ "$CURRENT_VERSION" -eq 1 ]; then
                echo " -> Skipping Object $OBJ_ID: Current version is V1, no V0 exists to diff against."
                continue
            fi

            PREV_VERSION=$((CURRENT_VERSION - 1))

            V1_FILE="$BASE_NAME-versions/$BASE_NAME-version-$PREV_VERSION.pdf"
            V2_FILE="$BASE_NAME-versions/$BASE_NAME-version-$CURRENT_VERSION.pdf"

            TEMP_V1=$(mktemp)
            TEMP_V2=$(mktemp)

            # Dump the object from each adjacent version; --unfilter
            # decompresses streams so the diff compares readable content.
            qpdf --show-object="$OBJ_ID" --unfilter -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
            qpdf --show-object="$OBJ_ID" --unfilter -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null

            # Fallback: retry without --unfilter for streams qpdf cannot decode.
            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
                if [ ! -s "$TEMP_V1" ]; then
                    qpdf --show-object="$OBJ_ID" -- "$V1_FILE" > "$TEMP_V1" 2>/dev/null
                fi

                if [ ! -s "$TEMP_V2" ]; then
                    qpdf --show-object="$OBJ_ID" -- "$V2_FILE" > "$TEMP_V2" 2>/dev/null
                fi
            fi
            # Still empty after the fallback: give up on this object.
            if [ ! -s "$TEMP_V1" ] || [ ! -s "$TEMP_V2" ]; then
                echo " -> NOTE: Object $OBJ_ID (V$PREV_VERSION or V$CURRENT_VERSION) extraction failed or produced empty output. Skipping diff."
                rm "$TEMP_V1" "$TEMP_V2"
                continue
            fi

            # diff exits 0 when the files are identical.
            if diff -u "$TEMP_V1" "$TEMP_V2" >/dev/null ; then
                echo " -> Object $OBJ_ID: No difference detected (V$PREV_VERSION vs V$CURRENT_VERSION)."
            else
                echo -e "\n\n--- DIFF START: $FULL_PATH | Object $OBJ_ID (V$PREV_VERSION vs V$CURRENT_VERSION) ---"
                echo "Note: Output extracted as raw PDF content for reliable comparison."
                diff -u "$TEMP_V1" "$TEMP_V2"
                echo "--- DIFF END: Object $OBJ_ID ---\n"
                # NOTE(review): DIFF_FOUND is set but never read anywhere.
                DIFF_FOUND=1
            fi


            rm "$TEMP_V1" "$TEMP_V2"
        # Process substitution (not a pipe) keeps the loop in this shell, so
        # the variables above behave as expected.
        done < <(echo "$RAW_MODIFIED_OBJECTS")


    ) | tee -a "$LOG_FILE"

    # PIPESTATUS[0] is the subshell's exit code, not tee's.
    if [ ${PIPESTATUS[0]} -ne 0 ]; then
        echo "Failed to process $FULL_PATH (See $FAILURE_LOG and $LOG_FILE)"
    fi
done
echo "----------------------------------------------------" | tee -a "$LOG_FILE"
echo "Scan complete. Check $LOG_FILE for inline object diffs." | tee -a "$LOG_FILE"
echo "Check $FAILURE_LOG for files that failed processing." | tee -a "$LOG_FILE"
|
||||||
Loading…
Add table
Add a link
Reference in a new issue