#!/bin/bash
# MICT.GOUV.HT — Full Uploads Crawler (ALL YEARS)
# Git Bash compatible (no grep -P)
#
# Walks the Apache auto-index pages under ${BASE_URL}/<year>/<month>/,
# records every file in a manifest, and downloads documents (plus small
# original images) into OUTPUT_DIR.
#
# NOTE: deliberately no 'set -e' — individual fetch/download failures are
# expected during a long crawl and must not abort the whole run.

BASE_URL="https://mict.gouv.ht/wp-content/uploads"
OUTPUT_DIR="C:/Users/Squir/Desktop/HAITI/DUMP/MICT-GOUV/uploads"
MANIFEST="C:/Users/Squir/Desktop/HAITI/DUMP/MICT-GOUV/manifest.txt"
UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Running totals, updated inside process_month
TOTAL_FILES=0
DOC_DOWNLOADED=0
DOC_SKIPPED=0
DOC_FAILED=0
IMG_NOTED=0
IMG_DOWNLOADED=0

# Create the destination tree up front — otherwise the manifest redirect
# below fails on a fresh machine where the folders do not exist yet.
mkdir -p "$OUTPUT_DIR" "${MANIFEST%/*}"

cat > "$MANIFEST" <<HEADER
=============================================
MICT.GOUV.HT — Complete WordPress Uploads Manifest
Haiti Ministry of Interior (MICT)
Generated: $(date)
Source: ${BASE_URL}
Years: 2013-2025 (all available)
=============================================

HEADER

# Fetch one URL and print its body to stdout.
# $1 - URL. Follows redirects (-L), browser-like User-Agent, 15s connect /
# 60s total timeout. All errors are silenced — callers treat empty output
# as "unreachable".
fetch() {
    curl -sL --connect-timeout 15 --max-time 60 -H "User-Agent: $UA" "$1" 2>/dev/null
}

# Pull href targets out of an Apache index page read from stdin, keeping
# only plain file entries. sed captures each href value whose first char
# is not '?' (drops the column-sort links); the grep then discards
# directories (trailing '/'), absolute paths (leading '/'), and the
# parent-directory link. sed only — no grep -P needed.
extract_files() {
    sed -n 's/.*href="\([^"?][^"]*\)".*/\1/p' \
        | grep -v -e '/$' -e '^/' -e 'Parent'
}

# Extract full table rows with filename, date, and size
# Print the directory-listing row(s) mentioning one file, with HTML tags
# stripped and runs of spaces squeezed to single spaces.
# $1 - full HTML of the index page
# $2 - exact filename, matched literally inside href="..."
extract_file_row() {
    local html="$1"
    local fname="$2"
    # printf, not echo: HTML that happens to start with "-n"/"-e" would be
    # swallowed as echo flags
    printf '%s\n' "$html" | grep -F "\"${fname}\"" | sed 's/<[^>]*>//g' | tr -s ' '
}

# Succeed (return 0) when $1 is a document-type file extension,
# compared case-insensitively.
is_doc_ext() {
    local ext=${1,,}   # bash lowercase expansion — no tr subshell needed
    case "$ext" in
        pdf|doc|docx|xls|xlsx|csv|txt|ppt|pptx|odt|ods|rtf) return 0 ;;
        *) return 1 ;;
    esac
}

# Succeed (return 0) when $1 is an image-type file extension,
# compared case-insensitively.
is_img_ext() {
    local ext=${1,,}   # bash lowercase expansion — no tr subshell needed
    case "$ext" in
        jpg|jpeg|png|gif|bmp|webp|svg) return 0 ;;
        *) return 1 ;;
    esac
}

# Check if image is a WordPress thumbnail (e.g., image-150x150.jpg):
# a "-<width>x<height>." pattern anywhere in the name.
is_thumbnail() {
    [[ "$1" =~ -[0-9]+x[0-9]+\. ]]
}

# Crawl one month directory: record every file in the manifest, download
# document files, and download small original images.
# Globals read:    BASE_URL, OUTPUT_DIR, MANIFEST, UA
# Globals written: TOTAL_FILES, DOC_DOWNLOADED, DOC_SKIPPED, DOC_FAILED,
#                  IMG_NOTED, IMG_DOWNLOADED
# Arguments: $1 - year (e.g. 2021), $2 - zero-padded month (e.g. 03)
process_month() {
    local year="$1" month="$2"
    local url="${BASE_URL}/${year}/${month}/"
    local outdir="${OUTPUT_DIR}/${year}/${month}"

    local html
    html=$(fetch "$url")
    [ -z "$html" ] && return
    # Only real Apache auto-index pages carry an "Index of" title
    echo "$html" | grep -q "Index of" || return

    local files
    files=$(echo "$html" | extract_files)
    [ -z "$files" ] && return

    local doc_count=0 img_count=0

    # Here-string (not a pipe) so counter updates survive the loop
    while IFS= read -r fname; do
        [ -z "$fname" ] && continue
        TOTAL_FILES=$((TOTAL_FILES + 1))

        local furl="${url}${fname}"
        local ext="${fname##*.}"

        # Human-readable size from the listing row; reuse extract_file_row
        # instead of duplicating its grep/sed/tr pipeline inline
        local size
        size=$(extract_file_row "$html" "$fname" | grep -oE '[0-9.]+[KMG]' | tail -1)
        [ -z "$size" ] && size="?"

        if is_doc_ext "$ext"; then
            doc_count=$((doc_count + 1))
            echo "  [DOC] $fname ($size)"
            echo "  [DOC] $fname | $size | $furl" >> "$MANIFEST"

            mkdir -p "$outdir"
            local outfile="${outdir}/${fname}"
            if [ -f "$outfile" ] && [ -s "$outfile" ]; then
                # Already downloaded on a previous run — keep it
                echo "    -> exists"
                DOC_SKIPPED=$((DOC_SKIPPED + 1))
            else
                curl -sL --connect-timeout 15 --max-time 180 \
                    -H "User-Agent: $UA" \
                    -o "$outfile" "$furl"
                # A missing or zero-byte result counts as failure; remove it
                # so the next run retries the download
                if [ -f "$outfile" ] && [ -s "$outfile" ]; then
                    local sz
                    sz=$(du -h "$outfile" | cut -f1)
                    echo "    -> OK ($sz)"
                    echo "    DOWNLOADED ($sz)" >> "$MANIFEST"
                    DOC_DOWNLOADED=$((DOC_DOWNLOADED + 1))
                else
                    echo "    -> FAIL"
                    echo "    FAILED" >> "$MANIFEST"
                    rm -f "$outfile" 2>/dev/null
                    DOC_FAILED=$((DOC_FAILED + 1))
                fi
            fi

        elif is_img_ext "$ext"; then
            # Thumbnails are counted below but neither logged nor downloaded
            IMG_NOTED=$((IMG_NOTED + 1))
            img_count=$((img_count + 1))

            # Skip WordPress-generated resized copies (name-WxH.ext)
            is_thumbnail "$fname" && continue

            echo "  [IMG] $fname ($size)" >> "$MANIFEST"

            # Download only small (K-sized) original images
            if echo "$size" | grep -qE '^[0-9.]+K$'; then
                mkdir -p "$outdir"
                local outfile="${outdir}/${fname}"
                if [ ! -f "$outfile" ]; then
                    curl -sL --connect-timeout 15 --max-time 60 \
                        -H "User-Agent: $UA" \
                        -o "$outfile" "$furl"
                    if [ -f "$outfile" ] && [ -s "$outfile" ]; then
                        IMG_DOWNLOADED=$((IMG_DOWNLOADED + 1))
                    fi
                fi
            fi
        else
            echo "  [OTHER] $fname ($size)" >> "$MANIFEST"
        fi
    done <<< "$files"

    # Explicit 'if' instead of the original '[ a ] || [ b ] && echo' chain,
    # which relied on left-associative && / || and read ambiguously
    if [ "$doc_count" -gt 0 ] || [ "$img_count" -gt 0 ]; then
        echo "  => ${year}/${month}: ${doc_count} docs, ${img_count} images"
    fi
}

echo "================================================"
echo "MICT.GOUV.HT Full Uploads Crawler"
echo "================================================"

# Crawl ALL available years
for year in 2013 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025; do
    echo ""
    echo "=== YEAR: $year ==="
    echo "" >> "$MANIFEST"
    echo "========== YEAR: $year ==========" >> "$MANIFEST"

    # Fetch the year's index to discover which month subdirectories exist.
    # (Plain variable — 'local' only works inside functions.)
    year_html=$(fetch "${BASE_URL}/${year}/")
    if [ -z "$year_html" ]; then
        echo "  [WARN] Cannot reach ${year}/"
        continue
    fi

    # Month links look like href="03/" — capture just the digits
    months=$(echo "$year_html" | sed -n 's/.*href="\([0-9][0-9]*\)\/".*/\1/p')

    if [ -z "$months" ]; then
        echo "  No months found"
        continue
    fi

    for month in $months; do
        # Zero-pad via forced base-10: a bare printf "%02d" "08" treats the
        # leading zero as octal, errors out, and yields "00" — silently
        # skipping August and September
        month=$(printf "%02d" "$((10#$month))")
        process_month "$year" "$month"
    done
done

# Final run summary: once to the console, once appended to the manifest.
printf '%s\n' \
    "" \
    "=============================================" \
    "COMPLETE" \
    "=============================================" \
    "Total files scanned:    $TOTAL_FILES" \
    "Documents downloaded:   $DOC_DOWNLOADED" \
    "Documents skipped:      $DOC_SKIPPED" \
    "Documents failed:       $DOC_FAILED" \
    "Images noted:           $IMG_NOTED" \
    "Images downloaded:      $IMG_DOWNLOADED" \
    "============================================="

cat >> "$MANIFEST" <<EOF

=============================================
FINAL SUMMARY
=============================================
Total files scanned:    $TOTAL_FILES
Documents downloaded:   $DOC_DOWNLOADED
Documents skipped:      $DOC_SKIPPED
Documents failed:       $DOC_FAILED
Images noted:           $IMG_NOTED
Images downloaded:      $IMG_DOWNLOADED
=============================================
EOF

printf '%s\n' "" "Manifest: $MANIFEST" "Downloads: $OUTPUT_DIR"