131 lines
5.1 KiB
Bash
131 lines
5.1 KiB
Bash
#!/bin/bash
#
# Scans the current directory tree for PDF files for which `pdfresurrect -q`
# reports more than one version, extracts those versions, and logs a unified
# diff of every object flagged --D-- or --M-- in the pdfresurrect summary,
# comparing each flagged version with the version before it.
#
# note that this tool uses pdfresurrect -w, which creates version directories
# and extracts version files. to avoid modifying your directory structure, it's
# a good idea to run this script in a copy of your project directory.
#
# deps: qpdf, pdfresurrect
#
# Usage: run without arguments from the directory that should be scanned.
# Both log files are written to that directory.

LOG_FILE="./resurrection_log.txt"
FAILURE_LOG="./resurrection_failures.log"

#######################################
# Reads `pdfresurrect -q` output on stdin and prints the number following the
# last ": " separator. Prints nothing when no such number is present.
#######################################
parse_version_count() {
  sed -n 's/.*: \([0-9][0-9]*\).*/\1/p' | tail -n 1
}

#######################################
# Dumps one object of a PDF into a file.
# Arguments: $1 - object id, $2 - PDF path, $3 - output file
# Returns:   0 if the output file is non-empty afterwards, 1 otherwise
#######################################
extract_object() {
  local obj_id=$1 pdf=$2 out=$3

  # NOTE(review): confirm --unfilter against the installed qpdf; if the flag
  # is rejected, only the plain fallback below ever produces output.
  qpdf --show-object="$obj_id" --unfilter -- "$pdf" > "$out" 2>/dev/null
  if [[ ! -s $out ]]; then
    qpdf --show-object="$obj_id" -- "$pdf" > "$out" 2>/dev/null
  fi
  [[ -s $out ]]
}

#######################################
# Processes a single PDF: checks its version count, extracts the versions and
# prints diffs of the flagged objects to stdout.
# The body is a subshell, so the `cd`, the EXIT trap and every `exit` below
# stay confined to this one file.
# Globals:   FAILURE_LOG_ABS / FAILURE_LOG (read)
# Arguments: $1 - path of the PDF as printed by find
# Returns:   0 on success (including "no history"), 1 on failure
#######################################
process_pdf() (
  full_path=$1
  file_dir=$(dirname -- "$full_path")
  file_name=$(basename -- "$full_path")
  base_name=${file_name%.*} # filename w/o ext

  # The failure log must be addressed by absolute path: after the cd below a
  # relative path would create a separate log inside every PDF directory.
  failure_log=${FAILURE_LOG_ABS:-$FAILURE_LOG}

  cd -- "$file_dir" || {
    echo "ERROR: Cannot access $file_dir" >> "$failure_log"
    exit 1
  }

  version_output=$(pdfresurrect -q "$file_name" 2>/dev/null)
  status=$?
  if [ "$status" -ne 0 ]; then
    echo "Failed to run pdfresurrect -q on $full_path (Exit $status)." >> "$failure_log"
    exit 1
  fi

  version_count=$(parse_version_count <<<"$version_output")
  # Without this guard an empty count would make the numeric test below error
  # out and the file would wrongly be treated as having history.
  if ! [[ $version_count =~ ^[0-9]+$ ]]; then
    echo "Could not parse a version count for $full_path from pdfresurrect -q output." >> "$failure_log"
    exit 1
  fi

  if [ "$version_count" -le 1 ]; then
    echo " -> No history found ($version_count version(s))."
    exit 0
  fi

  echo " -> History detected ($version_count versions). Extracting..."

  # Exit status is not checked here; the summary-file test below detects a
  # failed extraction.
  pdfresurrect -w "$file_name" > /dev/null 2>&1

  versions_dir="$base_name-versions"
  summary_file="$versions_dir/$base_name-versions.summary"

  if [ ! -f "$summary_file" ]; then
    echo " -> CRITICAL ERROR: Summary file not created at $summary_file, skipping diff."
    echo "Summary file not created for $full_path (expected $summary_file)." >> "$failure_log"
    exit 1
  fi

  # Drop everything up to the --D--/--M-- marker before picking fields, so a
  # file name containing spaces cannot shift the version/object columns.
  modified_objects=$(
    awk '/: --[DM]--/ { sub(/^.*: --[DM]--/, ""); print $2, $5 }' "$summary_file" \
      | sort -u \
      | sort -k1,1n -k2,2n
  )

  echo " -> Raw D/M object list (V OBJ):"
  while read -r line; do
    [ -n "$line" ] && echo " - $line"
  done <<<"$modified_objects"

  # One scratch directory per PDF, removed on every exit path of the subshell.
  tmp_dir=$(mktemp -d) || {
    echo "ERROR: mktemp failed while processing $full_path" >> "$failure_log"
    exit 1
  }
  trap 'rm -rf -- "${tmp_dir:?}"' EXIT
  tmp_prev="$tmp_dir/prev"
  tmp_cur="$tmp_dir/cur"

  while IFS= read -r line; do
    read -r current_version obj_id _ <<<"$line"

    if [[ -z $current_version || -z $obj_id ]]; then
      [[ -n $line ]] && echo " -> WARNING: Parsed empty version/object pair from line: \"$line\". Skipping."
      continue
    fi

    if ! [[ $current_version =~ ^[0-9]+$ && $obj_id =~ ^[0-9]+$ ]]; then
      echo " -> ERROR: Parsed invalid numbers ($current_version : $obj_id) from summary. Skipping."
      continue
    fi

    if [ "$current_version" -eq 1 ]; then
      echo " -> Skipping Object $obj_id: Current version is V1, no V0 exists to diff against."
      continue
    fi

    # 10# forces base 10 so a zero-padded value such as 08 is not read as octal.
    prev_version=$((10#$current_version - 1))

    prev_pdf="$versions_dir/$base_name-version-$prev_version.pdf"
    cur_pdf="$versions_dir/$base_name-version-$current_version.pdf"

    # Both extractions are always attempted before deciding to skip.
    extract_object "$obj_id" "$prev_pdf" "$tmp_prev"
    prev_ok=$?
    extract_object "$obj_id" "$cur_pdf" "$tmp_cur"
    cur_ok=$?

    if [ "$prev_ok" -ne 0 ] || [ "$cur_ok" -ne 0 ]; then
      echo " -> NOTE: Object $obj_id (V$prev_version or V$current_version) extraction failed or produced empty output. Skipping diff."
      continue
    fi

    # Labels replace the scratch-file names and timestamps in the diff header.
    if diff_output=$(diff -u \
      -L "$base_name-version-$prev_version.pdf (object $obj_id)" \
      -L "$base_name-version-$current_version.pdf (object $obj_id)" \
      "$tmp_prev" "$tmp_cur"); then
      echo " -> Object $obj_id: No difference detected (V$prev_version vs V$current_version)."
    else
      printf '\n\n--- DIFF START: %s | Object %s (V%s vs V%s) ---\n' \
        "$full_path" "$obj_id" "$prev_version" "$current_version"
      echo "Note: Output extracted as raw PDF content for reliable comparison."
      printf '%s\n' "$diff_output"
      printf -- '--- DIFF END: Object %s ---\n\n' "$obj_id"
    fi
  done <<<"$modified_objects"
)

#######################################
# Entry point: resets the logs, walks the tree and processes every PDF found.
# Globals: LOG_FILE, FAILURE_LOG (read), FAILURE_LOG_ABS (written)
#######################################
main() {
  local full_path

  # Resolved once, before any cd, so every failure ends up in a single file.
  FAILURE_LOG_ABS="$PWD/${FAILURE_LOG#./}"

  echo "--- Starting PDF resurrection scan ---" | tee "$LOG_FILE"
  echo "Diffs of modified objects will be logged directly to: $LOG_FILE" | tee -a "$LOG_FILE"
  echo "--------------------------------------" | tee -a "$LOG_FILE"

  # reset failure log
  : > "$FAILURE_LOG"

  # NUL-delimited so paths with spaces, backslashes or newlines survive intact.
  while IFS= read -r -d '' full_path; do
    echo "Checking: $full_path"

    # stdin is detached so the tools cannot swallow the remaining find output.
    process_pdf "$full_path" < /dev/null | tee -a "$LOG_FILE"

    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
      echo "Failed to process $full_path (See $FAILURE_LOG and $LOG_FILE)"
    fi
  done < <(find . -type f -iname "*.pdf" ! -iname "*_text.pdf" ! -path "*/*-versions/*" -print0)

  echo "----------------------------------------------------" | tee -a "$LOG_FILE"
  echo "Scan complete. Check $LOG_FILE for inline object diffs." | tee -a "$LOG_FILE"
  echo "Check $FAILURE_LOG for files that failed processing." | tee -a "$LOG_FILE"
}

main "$@"