#!/bin/bash
# MICT.GOUV.HT WordPress Uploads Downloader
# Compatible with Git Bash (no grep -P)

# ---- Configuration ----
# Root URL of the uploads tree; year/month sub-directories are appended to it.
BASE_URL="https://mict.gouv.ht/wp-content/uploads"
# Local directory that receives the downloaded files.
OUTPUT_DIR="C:/Users/Squir/Desktop/HAITI/DUMP/MICT-GOUV/uploads"
# Text report: one entry per file seen, plus a summary at the end.
MANIFEST="C:/Users/Squir/Desktop/HAITI/DUMP/MICT-GOUV/manifest.txt"
# User-Agent header value sent with every HTTP request.
UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Counters
TOTAL_FILES=0
DOWNLOADED=0
SKIPPED=0
FAILED=0
IMAGES_NOTED=0

# Initialize manifest
# The manifest's directory must exist before the first write. Nothing else
# creates it this early, so on a fresh machine the header below (and every
# later append) would otherwise fail with "No such file or directory".
if ! mkdir -p -- "$(dirname -- "$MANIFEST")"; then
    echo "[WARN] Cannot create manifest directory for: $MANIFEST" >&2
fi

# Quoted delimiter: the header is written literally, with no expansion.
cat > "$MANIFEST" <<'HEADER'
=============================================
MICT.GOUV.HT WordPress Uploads Manifest
Haiti Ministry of Interior (MICT)
=============================================

HEADER
echo "Generated: $(date)" >> "$MANIFEST"
echo "" >> "$MANIFEST"

# Fetch a URL and print the response body on stdout.
# Arguments: $1 - URL to request
# Globals:   UA (read) - sent as the User-Agent header
# Notes:     silent mode, follows redirects, 15 s connect limit, 60 s overall
#            limit; anything curl writes to stderr is discarded.
fetch_listing() {
    local target_url=$1
    local -a curl_opts=(
        -sL
        --connect-timeout 15
        --max-time 60
        -H "User-Agent: $UA"
    )

    curl "${curl_opts[@]}" "$target_url" 2>/dev/null
}

# Extract file names from the href attributes of a listing page
# (no grep -P needed).
# Arguments: $1 - HTML text of the listing page
# Outputs:   one href value per line on stdout, in page order
# Skipped:   - hrefs containing "?" (query links)
#            - hrefs containing a path separator. Directories, absolute
#              paths, full URLs and "../" references all contain one. Names
#              from the remote page are later joined onto the local output
#              directory, so a separator must never get through.
parse_files_from_html() {
    local html="$1"

    # grep -o emits every matching href, so a line holding several links
    # yields all of them rather than only the last one.
    # The filter works on the href value, not on the link text, so a file
    # whose name merely contains "Parent" is kept; the parent-directory link
    # itself is a path and is removed by the separator filter.
    printf '%s\n' "$html" \
        | grep -o 'href="[^"?]*"' \
        | sed -e 's/^href="//' -e 's/"$//' \
        | grep -v -F -e '/' -e '\'
}

# Look up the size shown for a file in a listing page.
# Arguments: $1 - HTML text of the listing page
#            $2 - file name exactly as it appears, double-quoted, in the page
# Outputs:   the size token (digits/dots plus optional K, M or G letters)
#            taken from the last matching row; nothing when no row matches.
get_file_size() {
    local listing=$1
    local name=$2
    local rows

    # Step 1: keep only the lines that mention "<name>" as a fixed string.
    rows=$(grep -F -- "\"${name}\"" <<< "$listing")

    # Step 2: the leading .* is greedy, so the capture is the text following
    # the last align="right"> cell on each line; keep the final result only.
    sed -n 's/.*align="right">\s*\([0-9.]*[KMG]*\).*/\1/p' <<< "$rows" | tail -n 1
}

# Report whether a file extension denotes a document type to download.
# Arguments: $1 - extension without the leading dot, in any letter case
# Returns:   0 for a known document extension, 1 otherwise
is_document() {
    # Fold case in the C locale so the result does not depend on the user's
    # language settings; only ASCII letters are affected.
    local LC_ALL=C
    # Lower-casing first means mixed-case spellings such as "Pdf" or "DocX"
    # are recognised, not only the all-lower and all-upper forms.
    local ext="${1,,}"

    case "$ext" in
        pdf | doc | docx | xls | xlsx | csv | txt | ppt | pptx | odt | ods | rtf)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

# Report whether a file extension denotes an image type.
# Arguments: $1 - extension without the leading dot, in any letter case
# Returns:   0 for a known image extension, 1 otherwise
is_image() {
    # Fold case in the C locale so the result does not depend on the user's
    # language settings; only ASCII letters are affected.
    local LC_ALL=C
    # Lower-casing first means mixed-case spellings such as "Jpg" or "Png"
    # are recognised, not only the all-lower and all-upper forms.
    local ext="${1,,}"

    case "$ext" in
        jpg | jpeg | png | gif | bmp | webp | svg)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

# Download one URL to a local path, treating HTTP errors, transfer errors
# (including timeouts) and empty bodies as failures.
# Arguments: $1 - source URL
#            $2 - destination file path
#            $3 - overall time limit in seconds (curl --max-time)
# Globals:   UA (read)
# Returns:   0 when the transfer completed and produced a non-empty file,
#            1 otherwise; on failure nothing is written to the destination.
_download_file() {
    local src_url="$1"
    local dest="$2"
    local max_time="$3"
    # Transfer into a side file so an interrupted or failed download can
    # never be mistaken for a finished one on a later run.
    local partial="${dest}.part"

    # -f makes curl report HTTP errors (>= 400) through its exit status
    # instead of saving the server's error page as if it were the file.
    if curl -fsL --connect-timeout 15 --max-time "$max_time" \
            -H "User-Agent: $UA" \
            -o "$partial" \
            "$src_url" \
        && [ -s "$partial" ] \
        && mv -f -- "$partial" "$dest"; then
        return 0
    fi

    rm -f -- "$partial"
    return 1
}

# Crawl one year/month uploads directory.
# Fetches the listing page, records every file in the manifest, downloads
# documents, and downloads non-thumbnail images whose listed size is in K.
# Arguments: $1 - year (e.g. 2023)
#            $2 - month, two digits (e.g. 01)
# Globals:   BASE_URL, OUTPUT_DIR, MANIFEST, UA (read);
#            TOTAL_FILES, DOWNLOADED, SKIPPED, FAILED, IMAGES_NOTED (updated)
# Outputs:   progress messages on stdout; entries appended to MANIFEST
process_month() {
    local year="$1"
    local month="$2"
    local url="${BASE_URL}/${year}/${month}/"
    local outdir="${OUTPUT_DIR}/${year}/${month}"

    echo ""
    echo "=========================================="
    echo "Processing: ${year}/${month}"
    echo "=========================================="

    local html
    html=$(fetch_listing "$url")

    if [ -z "$html" ]; then
        echo "  [WARN] Empty response"
        return
    fi

    # Anything without "Index of" is not treated as a listing page.
    if ! grep -q "Index of" <<< "$html"; then
        echo "  [WARN] Not a directory listing"
        return
    fi

    echo "" >> "$MANIFEST"
    echo "## ${year}/${month}" >> "$MANIFEST"

    local files
    files=$(parse_files_from_html "$html")

    if [ -z "$files" ]; then
        echo "  [INFO] No files found"
        echo "  (empty)" >> "$MANIFEST"
        return
    fi

    local month_doc_count=0
    local month_img_count=0
    local filename file_url ext size_info outfile actual_size
    # WordPress thumbnail variants carry a dimension suffix like -150x150.
    local thumb_re='-[0-9]+x[0-9]+\.'
    # A listed size such as "45K" or "3.2K".
    local small_re='^[0-9.]+K$'

    while IFS= read -r filename; do
        [ -z "$filename" ] && continue
        TOTAL_FILES=$((TOTAL_FILES + 1))

        file_url="${url}${filename}"
        outfile="${outdir}/${filename}"

        # A name without a dot has no extension. "${filename##*.}" alone
        # would return the whole name, so a file literally called "pdf"
        # would be classified as a document.
        case "$filename" in
            *.*) ext="${filename##*.}" ;;
            *) ext="" ;;
        esac

        # Get size from listing
        size_info=$(get_file_size "$html" "$filename")
        [ -z "$size_info" ] && size_info="?"

        if is_document "$ext"; then
            month_doc_count=$((month_doc_count + 1))
            echo "  [DOC] $filename ($size_info)"
            echo "  [DOC] $filename | $size_info | $file_url" >> "$MANIFEST"

            mkdir -p "$outdir"

            if [ -f "$outfile" ]; then
                echo "    -> Already exists"
                SKIPPED=$((SKIPPED + 1))
            elif _download_file "$file_url" "$outfile" 120; then
                actual_size=$(du -h "$outfile" | cut -f1)
                echo "    -> Downloaded ($actual_size)"
                echo "    DOWNLOADED ($actual_size)" >> "$MANIFEST"
                DOWNLOADED=$((DOWNLOADED + 1))
            else
                echo "    -> FAILED"
                echo "    FAILED" >> "$MANIFEST"
                FAILED=$((FAILED + 1))
            fi

        elif is_image "$ext"; then
            IMAGES_NOTED=$((IMAGES_NOTED + 1))
            month_img_count=$((month_img_count + 1))
            echo "  [IMG] $filename ($size_info)" >> "$MANIFEST"

            # Skip WordPress thumbnail variants (noted above, not downloaded).
            if [[ "$filename" =~ $thumb_re ]]; then
                continue
            fi

            # Download small images only (K-sized)
            if [[ "$size_info" =~ $small_re ]]; then
                mkdir -p "$outdir"
                # Image failures are not counted; the helper removes any
                # partial file so a later run can retry.
                if [ ! -f "$outfile" ] \
                    && _download_file "$file_url" "$outfile" 60; then
                    echo "    -> Downloaded small image"
                fi
            fi
        else
            echo "  [OTHER] $filename ($size_info)" >> "$MANIFEST"
        fi
    done <<< "$files"

    echo "  Summary: ${month_doc_count} documents, ${month_img_count} images"
}

# ---------------------------------------------------------------------------
# Main flow: print the banner, crawl every month of each target year, then
# report the totals on the console and append them to the manifest.
# ---------------------------------------------------------------------------
printf '%s\n' \
    "================================================" \
    "MICT.GOUV.HT WordPress Uploads Crawler" \
    "Target: 2023, 2024, 2025" \
    "================================================"

for year in 2023 2024 2025; do
    printf '%s\n' \
        "" \
        "###############################################" \
        "# YEAR: $year" \
        "###############################################"
    for month in 01 02 03 04 05 06 07 08 09 10 11 12; do
        process_month "$year" "$month"
    done
done

# Console report.
printf '%s\n' \
    "" \
    "=============================================" \
    "CRAWL COMPLETE" \
    "=============================================" \
    "Total files found:    $TOTAL_FILES" \
    "Documents downloaded: $DOWNLOADED" \
    "Skipped (existing):   $SKIPPED" \
    "Failed downloads:     $FAILED" \
    "Images noted:         $IMAGES_NOTED" \
    "============================================="

# The same totals, appended to the manifest under a SUMMARY heading.
printf '%s\n' \
    "" \
    "=============================================" \
    "SUMMARY" \
    "=============================================" \
    "Total files found:    $TOTAL_FILES" \
    "Documents downloaded: $DOWNLOADED" \
    "Skipped (existing):   $SKIPPED" \
    "Failed downloads:     $FAILED" \
    "Images noted:         $IMAGES_NOTED" \
    "=============================================" >> "$MANIFEST"

printf '%s\n' \
    "Manifest: $MANIFEST" \
    "Downloads: $OUTPUT_DIR"
