Add scripts
This commit is contained in:
commit
cc682741d5
6 changed files with 437 additions and 0 deletions
45
extract_embedded_files.sh
Normal file
45
extract_embedded_files.sh
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
#!/bin/bash
#
# Recursively scans the current directory for PDF files and extracts every
# embedded file (attachment) into a per-PDF subdirectory of $OUTPUT_DIR.
# Progress is printed to stdout and mirrored into $LOG_FILE, together with
# the attachment listing of every PDF that contains embedded files.
#
# Usage: run from the directory tree that should be scanned.
#
# deps: pdfdetach (poppler utils)

# No `set -e` on purpose: a PDF that pdfdetach cannot read must not abort the
# whole scan. pipefail lets us see pdfdetach failures through `| tee`.
set -uo pipefail

OUTPUT_DIR="./extracted_attachments"
LOG_FILE="attachment_extraction.log"

# Without this check a missing pdfdetach would be indistinguishable from
# "no PDF has attachments", because its stderr is discarded below.
if ! command -v pdfdetach >/dev/null 2>&1; then
  echo "Error: pdfdetach not found (install poppler utils)." >&2
  exit 1
fi

mkdir -p "$OUTPUT_DIR" || exit 1
# Start every run with an empty log.
: > "$LOG_FILE" || exit 1

echo "----------------------------------------"
echo "Starting attachment extraction scan..."
echo "Log: $LOG_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "----------------------------------------"

# The output directory is pruned so that PDFs extracted by this (or an
# earlier) run are not scanned again. Paths are NUL-delimited and read with
# an empty IFS so names with spaces or newlines are kept intact.
while IFS= read -r -d '' pdf_file; do
  # Flatten the relative path into a single directory name:
  # "./a/b/Doc.PDF" -> "a_b_Doc". The extension is stripped in any letter
  # case because find matches it case-insensitively.
  rel_path="${pdf_file#./}"
  safe_name="${rel_path//\//_}"
  safe_name="${safe_name%.[Pp][Dd][Ff]}"
  file_output_dir="$OUTPUT_DIR/$safe_name"

  echo "Checking: $pdf_file" | tee -a "$LOG_FILE"

  # stdin is redirected so pdfdetach can never consume the path list that
  # feeds this loop.
  if ! attachment_list=$(pdfdetach -list "$pdf_file" 2>/dev/null </dev/null); then
    echo "  -> Skipped: pdfdetach could not read this file" | tee -a "$LOG_FILE"
    continue
  fi
  [[ -n "$attachment_list" ]] || continue

  # The first line of the listing is a summary header; every following line
  # is one attachment. awk prints a bare integer on every platform.
  attach_count=$(awk 'NR > 1 { n++ } END { print n + 0 }' <<<"$attachment_list")
  (( attach_count > 0 )) || continue

  echo "  -> Found $attach_count attachment(s)" | tee -a "$LOG_FILE"

  if ! mkdir -p "$file_output_dir"; then
    echo "  -> Error: cannot create $file_output_dir" | tee -a "$LOG_FILE"
    continue
  fi

  # With -saveall, -o names the directory the attachments are written to.
  if ! pdfdetach -saveall -o "$file_output_dir" "$pdf_file" 2>&1 </dev/null \
    | tee -a "$LOG_FILE"; then
    echo "  -> Warning: extraction reported errors for $pdf_file" \
      | tee -a "$LOG_FILE"
  fi

  {
    echo "--- Attachments in: $pdf_file ---"
    echo "$attachment_list"
    echo ""
  } >> "$LOG_FILE"
done < <(find . -path "$OUTPUT_DIR" -prune -o -type f -iname '*.pdf' -print0)

echo "----------------------------------------"
echo "Extraction complete. Check $LOG_FILE for details."
|
||||
Loading…
Add table
Add a link
Reference in a new issue