#!/bin/bash
# Download PII-critical datasets from repodatos.atdt.gob.mx
# Run from toolbox (CT 105) or any Linux box with curl
# Usage: bash repodatos-download-pii.sh [output_dir]

# Behaviour summary:
#   * Each file is downloaded to "<dest>.part" and renamed into place only
#     when curl succeeds, so a truncated or failed transfer never looks like
#     a finished CSV.
#   * Downloads are best-effort: a failure is reported on stderr, recorded,
#     and the script moves on to the next file.
#   * Exit status is 0 when every file was fetched, 1 otherwise.
#
# Environment:
#   VERIFY_TLS=1   enforce TLS certificate verification (drops curl's -k).

# Unset variables are bugs; a failing stage must fail the whole pipeline.
set -uo pipefail

OUT="${1:-/opt/repodatos/downloads}"
if ! mkdir -p "$OUT/sinac" "$OUT/defunciones" "$OUT/educacion" "$OUT/inm" \
              "$OUT/gobernacion" "$OUT/sesnsp" "$OUT/misc"; then
    echo "ERROR: cannot create output directories under $OUT" >&2
    exit 1
fi

BASE="https://repodatos.atdt.gob.mx"

# Dataset path prefixes below $BASE, spelled once to avoid copy/paste typos.
SINAC_PATH="all_data/secretaria_salud/77c166cc-bcbf-4b28-806e-f2a60c3de821"
DEATHS_PATH="all_data/secretaria_salud/6fecbbb3-afd9-44a1-8665-679a80ce4a15"
EDU_PATH="all_data/secretaria_educacion/2a1d047c-546b-4293-971a-c835689a37a5"
CENSIDA_PATH="CENSIDA/activas_con_tratamiento"

# -s/-S : no progress meter, but still print error messages
# -L    : follow redirects
# --fail: treat HTTP >= 400 as an error instead of saving the error page
CURL_OPTS=(-sS -L --fail)
# NOTE(review): -k disables TLS certificate verification, so the downloaded
# data is not protected against tampering in transit. It is kept as the
# default only to preserve the script's existing behaviour; run with
# VERIFY_TLS=1 once the server certificate chain is confirmed to validate.
if [[ "${VERIFY_TLS:-0}" != "1" ]]; then
    CURL_OPTS+=(-k)
fi

# Destinations (relative to $OUT) of every download that failed.
failed=()

#######################################
# Download one file, atomically, and report the outcome.
# Globals:   OUT, BASE, CURL_OPTS (read); failed (appended on error)
# Arguments: $1 - curl --max-time limit in seconds
#            $2 - destination path relative to $OUT
#            $3 - URL path relative to $BASE
#            $4 - optional label for the progress line
#                 (defaults to the destination file name)
# Outputs:   progress lines on stdout, error details on stderr
# Returns:   0 on success, 1 if the download or the final rename failed
#######################################
fetch() {
    local max_time=$1
    local dest="$OUT/$2"
    local url="$BASE/$3"
    local label=${4:-${dest##*/}}
    local tmp="$dest.part"
    local rc=0

    echo "  Downloading $label..."
    # A transfer that hits --max-time ends with curl exit code 28 and is
    # reported as a failure instead of being left behind as a short file.
    curl "${CURL_OPTS[@]}" --max-time "$max_time" -o "$tmp" "$url" || rc=$?
    if (( rc == 0 )); then
        mv -f -- "$tmp" "$dest" || rc=$?
    fi

    if (( rc != 0 )); then
        rm -f -- "$tmp"
        echo "  FAILED (exit code $rc): $url" >&2
        failed+=("$2")
        return 1
    fi

    echo "  Done: $(du -h "$dest" 2>/dev/null | cut -f1)"
}

echo "=== Downloading PII-critical datasets from repodatos.atdt.gob.mx ==="
echo "=== $(date) ==="
echo "=== Output: $OUT ==="

# --- SINAC Birth Records (12.3 GB) ---
echo ""
echo ">>> SINAC Birth Records (16 files, ~12.3 GB) <<<"
for year in {2008..2019}; do
    file="${year}_sinac${year}DatosAbiertos.csv"
    fetch 600 "sinac/$file" "$SINAC_PATH/$file"
done
# 2020-2023 have different naming
fetch 600 "sinac/2020_sinac_2020.csv" "$SINAC_PATH/2020_sinac_2020.csv"
for year in 2021 2022 2023; do
    file="${year}_Nacimientos_${year}.csv"
    fetch 600 "sinac/$file" "$SINAC_PATH/$file"
done

# --- Death Records (6.1 GB) ---
echo ""
echo ">>> Death Records (26 files, ~6.1 GB) <<<"
for year in {1998..2023}; do
    file="defunciones_registradas_${year}.csv"
    fetch 600 "defunciones/$file" "$DEATHS_PATH/$file"
done

# --- Education Centers with CURP/RFC (1.7 GB) ---
echo ""
echo ">>> Education Centers - CURP/RFC/PII (~1.7 GB) <<<"
# NOTE(review): the remote file is named "..._01_16_..." but is stored locally
# as "..._ALL.csv" -- confirm it really is the complete national file.
fetch 600 "educacion/CATALOGO_CENTRO_TRABAJO_ALL.csv" \
    "$EDU_PATH/CATALOGO_CENTRO_TRABAJO_01_16_CSV.csv" \
    "combined national file"

# --- INM Migration Records (257 MB) ---
echo ""
echo ">>> INM Migration Records (~257 MB) <<<"
fetch 300 "inm/Tramites_Migratorios.csv" "INM/regulacion_migratoria/Tramites_Migratorios.csv"
fetch 300 "inm/Documentos_Migratorios.csv" "INM/regulacion_migratoria/Documentos_Migratorios.csv"

# --- Irregular Migration (175 MB) ---
echo ""
echo ">>> Irregular Migration Events (~175 MB) <<<"
fetch 300 "gobernacion/situ_irregular_2023.csv" \
    "all_data/secretaria_gobernacion/eventos_migratoria_irregular_2023/situ_irregular_2023.csv"

# --- SESNSP Crime Data (424 MB) ---
echo ""
echo ">>> SESNSP Crime Incidence (~424 MB) <<<"
fetch 300 "sesnsp/IDM_NM_ene25.csv" "SESNSP/incidencia_delictiva/IDM_NM_ene25.csv"
fetch 300 "sesnsp/INM_estatal_ene.csv" "SESNSP/incidencia_delictiva/INM_estatal_ene.csv"

# --- Compranet Procurement (907 MB) ---
echo ""
echo ">>> Compranet Procurement (~907 MB) <<<"
fetch 600 "misc/compranet_historico.csv" "compranet_historico.csv"

# --- CENSIDA HIV/AIDS (22 MB) ---
echo ""
echo ">>> CENSIDA HIV Treatment (~22 MB) <<<"
fetch 60 "misc/arv_jul2023_dic2024.csv" \
    "$CENSIDA_PATH/Personas_Tratamiento_Antirretroviral_julio_2023_diciembre_2024.csv"
fetch 60 "misc/arv_jun2022_jun2023.csv" \
    "$CENSIDA_PATH/Personas_Tratamiento_Antirretroviral_junio_2022_junio_2023.csv"

echo ""
echo "=== DOWNLOAD COMPLETE ==="
echo "=== $(date) ==="
du -sh "$OUT"
echo ""
echo "File counts:"
find "$OUT" -type f | wc -l
echo ""
echo "By directory:"
du -sh "$OUT"/*/

# Surface failures at the very end, where they are hard to miss, and make the
# exit status reflect them so wrappers (cron, CI) can react.
if (( ${#failed[@]} > 0 )); then
    echo "" >&2
    echo "WARNING: ${#failed[@]} download(s) failed:" >&2
    printf '  %s\n' "${failed[@]}" >&2
    exit 1
fi
