Add scripts

This commit is contained in:
libroot 2026-01-17 20:18:15 +00:00
commit cc682741d5
6 changed files with 437 additions and 0 deletions

45
extract_embedded_files.sh Normal file
View file

@ -0,0 +1,45 @@
#!/bin/bash
# Recursively scans the current directory for PDF files and extracts every
# embedded file (attachment) into a per-PDF subdirectory of $OUTPUT_DIR.
#
# Usage: run from the directory to scan; takes no arguments.
# deps:  pdfdetach (poppler utils)
#
# Output:
#   $OUTPUT_DIR/<name>/  attachments of one PDF, where <name> is the PDF's
#                        relative path with '/' replaced by '_' and a
#                        trailing ".pdf" removed
#   $LOG_FILE            log of the run (truncated at start)
#
# Exit status: 0 on success; non-zero if pdfdetach is not installed, the
# output directory or log file cannot be created, or an extraction fails.
OUTPUT_DIR="./extracted_attachments"
LOG_FILE="attachment_extraction.log"

# Prints its arguments as one line on stdout and appends it to $LOG_FILE.
log_line() {
  printf '%s\n' "$*" | tee -a "$LOG_FILE"
}

#######################################
# Lists and, if present, extracts the attachments of a single PDF.
# Globals:   OUTPUT_DIR (read), LOG_FILE (appended to)
# Arguments: $1 - path of the PDF as printed by find (e.g. ./dir/file.pdf)
# Returns:   0 if nothing had to be extracted or extraction succeeded,
#            1 if the output directory could not be created or
#            pdfdetach -saveall failed
#######################################
extract_from_pdf() {
  local pdf_file=$1
  local rel_path safe_name file_output_dir
  local attachment_list attach_count save_status

  log_line "Checking: $pdf_file"

  # stderr is discarded on purpose: pdfdetach's own diagnostics for files it
  # cannot parse are replaced by the single skip line below. stdin is
  # redirected so the tool can never consume the caller's file list.
  if ! attachment_list=$(pdfdetach -list "$pdf_file" 2>/dev/null </dev/null); then
    log_line "  -> Skipped: pdfdetach could not read this file"
    return 0
  fi
  [[ -n "$attachment_list" ]] || return 0

  # The first line of the listing is a summary header; every following line
  # describes one attachment. The arithmetic expansion strips the padding
  # some wc implementations put around the number.
  attach_count=$(( $(printf '%s\n' "$attachment_list" | tail -n +2 | wc -l) ))
  (( attach_count > 0 )) || return 0

  log_line "  -> Found $attach_count attachment(s)"

  # Flatten the relative path into a single directory name.
  rel_path=${pdf_file#./}
  safe_name=${rel_path//\//_}
  file_output_dir="$OUTPUT_DIR/${safe_name%.pdf}"

  if ! mkdir -p -- "$file_output_dir"; then
    log_line "  -> ERROR: cannot create $file_output_dir"
    return 1
  fi

  pdfdetach -saveall "$pdf_file" -o "$file_output_dir" </dev/null 2>&1 \
    | tee -a "$LOG_FILE"
  # Must be read immediately: the pipeline's own status is tee's, not
  # pdfdetach's.
  save_status=${PIPESTATUS[0]}

  {
    printf '%s\n' "--- Attachments in: $pdf_file ---"
    printf '%s\n' "$attachment_list"
    printf '\n'
  } >>"$LOG_FILE"

  if (( save_status != 0 )); then
    log_line "  -> ERROR: pdfdetach -saveall exited with status $save_status"
    return 1
  fi
  return 0
}

#######################################
# Entry point: validates prerequisites, scans the tree, reports the result.
# Globals:   OUTPUT_DIR (read), LOG_FILE (truncated, then appended to)
# Returns:   0 if every PDF was handled, non-zero otherwise
#######################################
main() {
  local pdf_file
  local failures=0

  # Without this check a missing pdfdetach would make every PDF look as if
  # it simply had no attachments.
  if ! command -v pdfdetach >/dev/null 2>&1; then
    printf 'error: pdfdetach not found; install poppler utils\n' >&2
    return 1
  fi
  if ! mkdir -p -- "$OUTPUT_DIR"; then
    printf 'error: cannot create output directory %s\n' "$OUTPUT_DIR" >&2
    return 1
  fi
  if ! : >"$LOG_FILE"; then
    printf 'error: cannot write log file %s\n' "$LOG_FILE" >&2
    return 1
  fi

  echo "----------------------------------------"
  echo "Starting attachment extraction scan..."
  echo "Log: $LOG_FILE"
  echo "Output directory: $OUTPUT_DIR"
  echo "----------------------------------------"

  # $OUTPUT_DIR is pruned so PDFs extracted by this or an earlier run are not
  # scanned again. NUL-delimited names keep paths containing newlines intact.
  # Process substitution (rather than a pipe) keeps the loop in the current
  # shell so the failure counter survives it.
  while IFS= read -r -d '' pdf_file; do
    extract_from_pdf "$pdf_file" || failures=$((failures + 1))
  done < <(find . -path "$OUTPUT_DIR" -prune -o -type f -iname '*.pdf' -print0)

  echo "----------------------------------------"
  echo "Extraction complete. Check $LOG_FILE for details."

  if (( failures > 0 )); then
    printf 'warning: extraction failed for %d PDF file(s)\n' "$failures" >&2
    return 1
  fi
  return 0
}

# Last command on purpose: the script's exit status is main's return status.
main "$@"