#!/bin/bash
# Complete dump of repodatos.atdt.gob.mx: every file in every directory.
# Recursively crawls the server's JSON directory listings and downloads each file.
# Usage: bash repodatos-dump-all.sh [output_dir]
# Skips files that already exist locally with a matching size.
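#
# Example invocations (the /tmp path is illustrative; any writable directory works):
#   bash repodatos-dump-all.sh                  # default Desktop vault path below
#   bash repodatos-dump-all.sh /tmp/repodatos   # custom output directory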

DEST="${1:-/c/Users/Squir/Desktop/MEXICO/V A U L T/repodatos.atdt.gob.mx}"
BASE="https://repodatos.atdt.gob.mx"
LOG="$DEST/download.log"

echo "=== FULL DUMP: $BASE ===" | tee "$LOG"
echo "=== Dest: $DEST ===" | tee -a "$LOG"
echo "=== Started: $(date) ===" | tee -a "$LOG"

downloaded=0
skipped=0
failed=0
total_bytes=0

download_dir() {
    local url="$1"
    local local_dir="$2"
    local depth="$3"

    if [ "$depth" -gt 10 ]; then
        echo "MAX_DEPTH: $url" >> "$LOG"
        return
    fi

    mkdir -p "$local_dir"

    local json
    json=$(curl -sfkL --max-time 30 "$url")   # -f: treat HTTP errors as fetch failures
    if [ -z "$json" ]; then
        echo "FETCH_FAIL: $url" | tee -a "$LOG"
        failed=$((failed + 1))
        return
    fi

    # Parse JSON with python3
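    # The listing is assumed to be an autoindex-style JSON array such as
    #   [{"type":"directory","name":"sub"},{"type":"file","name":"a.csv","size":123}]
    # (shape inferred from this parser, not from any documented API)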
    local entries
    entries=$(printf '%s' "$json" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
        for item in data:
            # emit type|name|size per entry (names containing '|' would break this)
            print(item.get('type','') + '|' + item.get('name','') + '|' + str(item.get('size',0)))
except Exception:
    pass
" 2>/dev/null)

    if [ -z "$entries" ]; then
        echo "PARSE_FAIL: $url" >> "$LOG"
        failed=$((failed + 1))
        return
    fi

    # Read the parsed entries; the here-string at 'done' keeps this loop in the
    # current shell, so the counter updates below survive after the loop
    while IFS='|' read -r ftype fname fsize; do
        [ -z "$fname" ] && continue

        if [ "$ftype" = "directory" ]; then
            echo "[DIR] $local_dir/$fname/" | tee -a "$LOG"
            download_dir "${url}${fname}/" "$local_dir/$fname" $((depth + 1))
        else
            local target="$local_dir/$fname"
            # Skip if the file already exists with the same size (GNU stat first,
            # wc -c as a portable fallback)
            if [ -f "$target" ]; then
                local existing_size
                existing_size=$(stat --printf="%s" "$target" 2>/dev/null || wc -c < "$target" 2>/dev/null || echo 0)
                if [ "$existing_size" = "$fsize" ]; then
                    echo "[SKIP] $target (${fsize}B exists)" >> "$LOG"
                    skipped=$((skipped + 1))
                    continue
                fi
            fi
            local hr_size
            if [ "$fsize" -gt 1073741824 ] 2>/dev/null; then
                hr_size="$(echo "scale=1; $fsize / 1073741824" | bc 2>/dev/null || echo "$fsize")GB"
            elif [ "$fsize" -gt 1048576 ] 2>/dev/null; then
                hr_size="$(echo "scale=1; $fsize / 1048576" | bc 2>/dev/null || echo "$fsize")MB"
            else
                hr_size="${fsize}B"
            fi
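            # Worked example: fsize=1610612736 gives 1610612736/1073741824 = 1.5, so hr_size="1.5GB"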
            echo "[DL] $fname ($hr_size) -> $local_dir/" | tee -a "$LOG"
            if curl -sfkL --max-time 1800 -o "$target" "${url}${fname}"; then
                downloaded=$((downloaded + 1))
                total_bytes=$((total_bytes + fsize))
            else
                echo "[FAIL] $target" | tee -a "$LOG"
                rm -f "$target"   # drop partial/error output so the next run retries it
                failed=$((failed + 1))
            fi
        fi
    done <<< "$entries"
}

# Crawl everything from root
download_dir "$BASE/" "$DEST" 0

echo "" | tee -a "$LOG"
echo "===============================" | tee -a "$LOG"
echo "=== DUMP COMPLETE ===" | tee -a "$LOG"
echo "=== Finished: $(date) ===" | tee -a "$LOG"
echo "Downloaded: $downloaded files" | tee -a "$LOG"
echo "Skipped:    $skipped files (already exist)" | tee -a "$LOG"
echo "Failed:     $failed" | tee -a "$LOG"
echo "===============================" | tee -a "$LOG"
echo "" | tee -a "$LOG"
echo "Disk usage:" | tee -a "$LOG"
du -sh "$DEST" | tee -a "$LOG"
echo "" | tee -a "$LOG"
echo "By directory:" | tee -a "$LOG"
du -sh "$DEST"/*/ 2>/dev/null | tee -a "$LOG"
