#!/bin/bash
#
# Recursively scans the current directory for PDF files and extracts every
# embedded file (attachment) into a per-PDF sub-directory of $OUTPUT_DIR.
# A summary of what was found is written to $LOG_FILE (truncated each run).
#
# Usage: run from the root of the tree to scan; takes no arguments.
# Deps:  pdfdetach (poppler utils)
# Exit:  non-zero if pdfdetach is missing or the output dir/log cannot be
#        created; 0 otherwise (per-file extraction failures are only logged).

set -uo pipefail

readonly OUTPUT_DIR="./extracted_attachments"
readonly LOG_FILE="attachment_extraction.log"

# Prints the horizontal rule used around the start/end banners.
print_rule() {
  echo "----------------------------------------"
}

#######################################
# Maps a PDF path to the flat directory name used under $OUTPUT_DIR.
# Arguments: $1 - path as printed by find (a leading "./" is dropped)
# Outputs:   the path with "/" replaced by "_" and a trailing ".pdf" removed
# NOTE: distinct inputs can collide (e.g. "a/b.pdf" and "a_b.pdf"); only a
#       lowercase ".pdf" suffix is stripped, so "X.PDF" keeps its extension.
#######################################
attachment_dir_name() {
  local rel_path=${1#./}
  local flat=${rel_path//\//_}
  printf '%s\n' "${flat%.pdf}"
}

#######################################
# Counts the attachments in a `pdfdetach -list` listing.
# Arguments: $1 - full listing text; its first line is treated as a header
# Outputs:   number of lines after the header (0 if there are none)
#######################################
count_attachments() {
  # awk prints a bare integer; `wc -l` pads with spaces on some platforms.
  printf '%s\n' "$1" | awk 'END { print (NR > 1 ? NR - 1 : 0) }'
}

#######################################
# Extracts the attachments of one PDF and records the listing in the log.
# Globals:   OUTPUT_DIR (read), LOG_FILE (appended)
# Arguments: $1 - path to the PDF
# Returns:   0 always; problems are reported on stdout and in the log
#######################################
process_pdf() {
  local pdf_file=$1
  local listing count target_dir

  echo "Checking: $pdf_file" | tee -a "$LOG_FILE"

  # PDFs that pdfdetach cannot list (corrupt, encrypted, ...) are skipped
  # quietly; stderr is discarded on purpose to keep the scan output readable.
  if ! listing=$(pdfdetach -list "$pdf_file" 2>/dev/null); then
    return 0
  fi
  [[ -n "$listing" ]] || return 0

  count=$(count_attachments "$listing")
  (( count > 0 )) || return 0

  echo " -> Found $count attachment(s)" | tee -a "$LOG_FILE"

  target_dir="$OUTPUT_DIR/$(attachment_dir_name "$pdf_file")"
  if ! mkdir -p "$target_dir"; then
    echo " -> WARNING: cannot create $target_dir" | tee -a "$LOG_FILE"
    return 0
  fi

  # pipefail (set above) lets the `if` see pdfdetach's status through tee.
  if ! pdfdetach -saveall "$pdf_file" -o "$target_dir" 2>&1 \
    | tee -a "$LOG_FILE"; then
    echo " -> WARNING: extraction failed for $pdf_file" | tee -a "$LOG_FILE"
  fi

  {
    echo "--- Attachments in: $pdf_file ---"
    printf '%s\n' "$listing"
    echo ""
  } >> "$LOG_FILE"
}

#######################################
# Entry point: checks the dependency, prepares output/log, scans the tree.
# Globals:   OUTPUT_DIR (read), LOG_FILE (truncated, then appended)
# Returns:   1 on setup failure, 0 otherwise
#######################################
main() {
  local pdf_file

  # Without this check a missing pdfdetach would look like "no attachments
  # anywhere", because the listing call discards stderr.
  if ! command -v pdfdetach > /dev/null 2>&1; then
    echo "Error: pdfdetach not found (install poppler utils)." >&2
    return 1
  fi

  if ! mkdir -p "$OUTPUT_DIR"; then
    echo "Error: cannot create output directory $OUTPUT_DIR" >&2
    return 1
  fi
  if ! : > "$LOG_FILE"; then
    echo "Error: cannot write log file $LOG_FILE" >&2
    return 1
  fi

  print_rule
  echo "Starting attachment extraction scan..."
  echo "Log: $LOG_FILE"
  echo "Output directory: $OUTPUT_DIR"
  print_rule

  # NUL-delimited paths survive spaces/newlines in names. $OUTPUT_DIR is
  # pruned so PDFs extracted by this (or an earlier) run are not rescanned.
  while IFS= read -r -d '' pdf_file; do
    process_pdf "$pdf_file"
  done < <(find . -path "$OUTPUT_DIR" -prune -o \
    -type f -iname '*.pdf' -print0)

  print_rule
  echo "Extraction complete. Check $LOG_FILE for details."
}

main "$@"