#!/bin/bash
# Full recursive mirror of repodatos.atdt.gob.mx
# WARNING: This will download ~50 GB of data
# Run from toolbox (CT 105) or any Linux box with curl + python3
# Usage: bash repodatos-full-mirror.sh [output_dir]

# Destination directory: first CLI argument, or the default mirror location.
OUT="${1:-/opt/repodatos/mirror}"

# Abort early if the destination cannot be created; otherwise every later
# write (log and downloads) would fail while the crawl keeps running.
if ! mkdir -p "$OUT"; then
    echo "ERROR: cannot create output directory: $OUT" >&2
    exit 1
fi

# Root URL of the remote tree and the log file kept inside the mirror.
BASE="https://repodatos.atdt.gob.mx"
LOG="$OUT/mirror.log"

# Start-of-run banner. The first tee has no -a, so it truncates the log from
# any previous run; the following lines append to it.
echo "=== Full mirror of $BASE ===" | tee "$LOG"
echo "=== $(date) ===" | tee -a "$LOG"
echo "=== WARNING: Estimated 50+ GB ===" | tee -a "$LOG"

#######################################
# Recursively mirror one remote directory into a local directory.
#
# Fetches the JSON listing at a URL (expected: a list of objects carrying
# "type", "name" and optionally "size"), recurses into entries whose type is
# "directory", and downloads every other entry as a file. A file that already
# exists locally with the advertised size is skipped.
#
# Globals:
#   LOG (read) - path of the log file that status lines are appended to
# Arguments:
#   $1 - directory URL, including the trailing slash
#   $2 - local directory that receives the contents
#   $3 - current recursion depth (defaults to 0); recursion stops beyond 8
# Outputs:
#   [DIR], [DL] and [FAIL] lines on stdout and in LOG; everything else
#   (MAX_DEPTH, MKDIR_FAIL, FETCH_FAIL, PARSE_FAIL, [SKIP], [OK]) in LOG only
#######################################
mirror_dir() {
    local url="$1"
    local local_path="$2"
    local depth="${3:-0}"
    local json listing ftype fsize fenc fname target existing_size size_mb

    if [ "$depth" -gt 8 ]; then
        echo "MAX_DEPTH: $url" >> "$LOG"
        return
    fi

    if ! mkdir -p "$local_path"; then
        echo "MKDIR_FAIL: $local_path" >> "$LOG"
        return
    fi

    # -f makes HTTP errors (404, 500, ...) fail instead of returning the
    # error page, which would otherwise be parsed as a listing.
    # NOTE(review): -k disables TLS certificate verification; kept from the
    # original behaviour - confirm whether the server really needs it.
    json=$(curl -skfL --max-time 30 "$url" 2>/dev/null)
    if [ -z "$json" ]; then
        echo "FETCH_FAIL: $url" >> "$LOG"
        return
    fi

    # Turn the listing into "type|size|url-encoded-name|name" lines. The raw
    # name goes last so a "|" inside it survives the field split below. The
    # size is coerced to an integer here (-1 when unusable) so that untrusted
    # text never reaches shell arithmetic.
    listing=$(python3 -c '
import json
import sys
from urllib.parse import quote

try:
    data = json.load(sys.stdin.buffer)
except ValueError:
    sys.exit(1)
if not isinstance(data, list):
    sys.exit(1)

out = sys.stdout.buffer
for item in data:
    if not isinstance(item, dict):
        continue
    name = item.get("name")
    # Names that are missing, empty or multi-line cannot be represented.
    if not isinstance(name, str) or not name or "\n" in name or "\r" in name:
        continue
    try:
        size = int(item.get("size", 0))
    except (TypeError, ValueError, OverflowError):
        size = -1
    kind = "directory" if item.get("type") == "directory" else "file"
    try:
        line = "|".join((kind, str(size), quote(name, safe=""), name))
        out.write(line.encode("utf-8") + b"\n")
    except UnicodeError:
        continue
' 2>/dev/null <<< "$json") || {
        echo "PARSE_FAIL: $url" >> "$LOG"
        return
    }

    while IFS='|' read -r ftype fsize fenc fname; do
        [ -z "$fname" ] && continue

        # Names come from the server: never let one escape local_path.
        case "$fname" in
            . | .. | */*)
                echo "[SKIP] unsafe name from $url: $fname" >> "$LOG"
                continue
                ;;
        esac

        if [ "$ftype" = "directory" ]; then
            echo "[DIR] $local_path/$fname/" | tee -a "$LOG"
            mirror_dir "${url}${fenc}/" "$local_path/$fname" "$((depth + 1))"
            continue
        fi

        target="$local_path/$fname"
        size_mb=0
        # Only a plain non-negative integer is trusted for arithmetic and for
        # the "already downloaded" comparison.
        if [[ "$fsize" =~ ^[0-9]+$ ]]; then
            size_mb=$((fsize / 1048576))
            if [ -f "$target" ]; then
                existing_size=$(stat -c%s "$target" 2>/dev/null) || existing_size=""
                if [ "$existing_size" = "$fsize" ]; then
                    echo "[SKIP] $target (already exists, same size)" >> "$LOG"
                    continue
                fi
            fi
        fi

        echo "[DL] $target (${size_mb}MB)" | tee -a "$LOG"
        # Download to a side file and rename on success, so a failed or
        # interrupted transfer never leaves a truncated file under its final
        # name. stdin is detached so curl cannot consume the listing.
        if curl -skfL --max-time 1200 -o "$target.part" "${url}${fenc}" \
            < /dev/null 2> /dev/null; then
            # Some curl versions create no output file for an empty body.
            [ -e "$target.part" ] || : > "$target.part"
            if mv -f -- "$target.part" "$target"; then
                echo "[OK] $target" >> "$LOG"
                continue
            fi
        fi
        rm -f -- "$target.part"
        echo "[FAIL] $target" | tee -a "$LOG"
    done <<< "$listing"
}

# Start the crawl at the site root (depth 0), writing into the output directory.
mirror_dir "${BASE}/" "${OUT}" 0

# Closing banner, timestamp and total on-disk size: shown on the console and
# appended to the log.
printf '\n' | tee -a "${LOG}"
printf '%s\n' "=== MIRROR COMPLETE ===" | tee -a "${LOG}"
printf '=== %s ===\n' "$(date)" | tee -a "${LOG}"
du -sh "${OUT}" | tee -a "${LOG}"
