#!/bin/bash
#
# Recursively scans the current directory for PDF files and extracts every
# embedded file (attachment) into a per-PDF subdirectory of $OUTPUT_DIR.
#
# Usage:  run from the root of the tree to scan; takes no arguments.
# Deps:   pdfdetach (poppler utils)
#
# Outputs:
#   $OUTPUT_DIR/<name>/  attachments of one PDF, where <name> is the PDF's
#                        relative path with '/' replaced by '_' and the
#                        .pdf extension removed
#   $LOG_FILE            progress lines, pdfdetach output and the attachment
#                        listing of every PDF that had attachments
#
# `set -e` is deliberately not enabled: a PDF that cannot be read or
# extracted is reported and skipped so the scan can continue.
# `pipefail` lets a pdfdetach failure show through `pdfdetach ... | tee`.
set -uo pipefail

readonly OUTPUT_DIR="./extracted_attachments"
readonly LOG_FILE="attachment_extraction.log"

#######################################
# Derives the output subdirectory name for a PDF.
# Arguments: $1 - path of the PDF as printed by find (e.g. ./a/b.pdf)
# Outputs:   the name on stdout: leading "./" dropped, every '/' turned
#            into '_', trailing .pdf extension (any letter case) removed
# NOTE: distinct paths can still collide (a/b.pdf vs a_b.pdf).
#######################################
attachment_dir_name() {
  local name=${1#./}
  name=${name//\//_}
  # find matches with -iname, so the extension must be stripped
  # case-insensitively as well (.pdf, .PDF, .Pdf, ...).
  name=${name%.[Pp][Dd][Ff]}
  printf '%s\n' "$name"
}

#######################################
# Writes one message line to stdout and appends it to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
#######################################
log() {
  printf '%s\n' "$*" | tee -a "$LOG_FILE"
}

#######################################
# Lists the attachments of one PDF and, if there are any, extracts them.
# Globals:   OUTPUT_DIR, LOG_FILE (read)
# Arguments: $1 - path of the PDF
# Returns:   0 if the PDF was skipped or extracted, 1 on extraction errors
#######################################
process_pdf() {
  local pdf_file=$1
  local listing attach_count out_dir

  log "Checking: $pdf_file"

  # stderr is discarded on purpose: damaged or unreadable PDFs are common in
  # bulk scans and are reported with a single "Skipped" line instead.
  # stdin is detached so the tool can never eat the file list being read
  # by the caller's loop.
  if ! listing=$(pdfdetach -list "$pdf_file" 2>/dev/null </dev/null) \
    || [[ -z "$listing" ]]; then
    log " -> Skipped: could not list attachments"
    return 0
  fi

  # The first line of the listing is a header, every further line is one
  # attachment. $(( )) also normalises the padded output of BSD wc.
  attach_count=$(( $(printf '%s\n' "$listing" | tail -n +2 | wc -l) ))
  if (( attach_count == 0 )); then
    return 0
  fi

  log " -> Found $attach_count attachment(s)"

  out_dir="$OUTPUT_DIR/$(attachment_dir_name "$pdf_file")"
  if ! mkdir -p -- "$out_dir"; then
    log " -> ERROR: cannot create directory: $out_dir"
    return 1
  fi

  local status=0
  if ! pdfdetach -saveall -o "$out_dir" "$pdf_file" </dev/null 2>&1 \
    | tee -a "$LOG_FILE"; then
    log " -> ERROR: extraction failed for: $pdf_file"
    status=1
  fi

  # The full listing goes to the log only, to keep the console output short.
  {
    printf '%s\n' "--- Attachments in: $pdf_file ---"
    printf '%s\n' "$listing"
    printf '\n'
  } >> "$LOG_FILE"

  return "$status"
}

#######################################
# Entry point: checks the dependency, prepares output dir and log, then
# processes every PDF below the current directory.
# Globals:   OUTPUT_DIR, LOG_FILE (read)
# Returns:   127 if pdfdetach is missing, 1 if setup fails, 0 otherwise
#######################################
main() {
  if ! command -v pdfdetach >/dev/null 2>&1; then
    printf 'Error: pdfdetach not found (install poppler utils).\n' >&2
    return 127
  fi

  if ! mkdir -p -- "$OUTPUT_DIR"; then
    printf 'Error: cannot create output directory: %s\n' "$OUTPUT_DIR" >&2
    return 1
  fi
  # Start every run with an empty log.
  if ! : > "$LOG_FILE"; then
    printf 'Error: cannot write log file: %s\n' "$LOG_FILE" >&2
    return 1
  fi

  echo "----------------------------------------"
  echo "Starting attachment extraction scan..."
  echo "Log: $LOG_FILE"
  echo "Output directory: $OUTPUT_DIR"
  echo "----------------------------------------"

  # - $OUTPUT_DIR is pruned so PDFs that were themselves extracted as
  #   attachments are not scanned again on this or a later run.
  # - NUL-delimited names survive spaces, backslashes and newlines.
  # - Process substitution keeps the loop in the current shell.
  local pdf_file
  while IFS= read -r -d '' pdf_file; do
    process_pdf "$pdf_file"
  done < <(find . -path "$OUTPUT_DIR" -prune -o \
    -type f -iname '*.pdf' -print0)

  echo "----------------------------------------"
  echo "Extraction complete. Check $LOG_FILE for details."
}

main "$@"