"""
Smart Parallel Scanner - Focuses on likely ID ranges and uses HEAD requests first
"""
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import sys

def print(*args, **kwargs):  # noqa: A001 - deliberate shadow of builtins.print
    """Write *args* space-joined to stdout and flush immediately.

    Replacement for the builtin so progress lines appear promptly even when
    stdout is block-buffered (e.g. piped). Keyword arguments (sep/end/file/
    flush) are accepted but intentionally ignored, matching the behavior of
    the lambda this replaces; returns None like the real print.
    """
    sys.stdout.write(' '.join(map(str, args)) + '\n')
    sys.stdout.flush()

# --- Configuration -------------------------------------------------------
# Root of the remote API under which candidate files are probed.
BASE_URL = "https://repodatos.atdt.gob.mx/api_update/inm/"
# Local destination for downloaded files (Windows-specific absolute path).
DOWNLOAD_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look\inm_downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)  # NOTE: parents=False — the parent folder must already exist
# Append-only log of every successful "url -> local path" download.
HITS_LOG = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look\hits.txt")

# Guards the shared counters (hits/tried) and writes to HITS_LOG across worker threads.
lock = threading.Lock()
# Browser-like UA string; some servers reject requests with the default requests UA.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

# Known file IDs from research: 1-12, 56, 126-141, 278, 286, 345-348, 534
# Expand to check ranges around these
# Each tuple is an inclusive (start_id, end_id) range handed to generate_urls().
ID_RANGES = [
    (1, 20),       # Low numbered files
    (50, 70),      # Around 56
    (120, 150),    # Around 126-141
    (270, 300),    # Around 278, 286
    (340, 360),    # Around 345-348
    (520, 550),    # Around 534
    (600, 700),    # Extend search
    (700, 800),
    (800, 900),
    (900, 1000),
    (1000, 1500),
    (1500, 2000),
    (2000, 3000),
    (3000, 5000),
    (5000, 10000),
]

# (remote directory name, [filename suffixes to try]) pairs.
# An empty directory name means candidates are probed directly under BASE_URL.
DIRS = [
    ("capacitacion_servidores_publicos_inm", ["servidores_capacitados"]),
    ("estadisticas_control_verificacion_migratoria", ["visitas_verificacion", "revisiones_migratorias"]),
    ("estadisticas_regulacion_migratoria", ["documentos_migratorios", "tramites_migratorios"]),
    ("migrantes_atendidos_grupos_beta_proteccion_migrantes", ["atendidos_grupo_beta"]),
    ("personas_extranjeras_victimas_delito", ["victimas_delito", "victimas_delitos"]),
    ("programa_paisano", ["peticiones_paisano", "atenciones_programa_paisanos", "quejas_paisano"]),
    ("programa_pesca_deportiva_turismo_nautico", ["pesca_deportiva"]),
    ("salida_menores_pais", ["menores_viajan"]),
    ("", ["datos", "registros", "estadisticas", "reporte", "informe", "migrantes"]),  # base dir
]

# File extensions appended to every candidate filename.
EXTS = [".csv", ".json", ".xlsx", ".pdf", ".xml", ".zip", ".doc", ".docx", ".txt", ".xls"]

# Shared counters, updated by worker threads while holding `lock`.
hits = 0   # successful downloads
tried = 0  # attempts that did NOT produce a download (total attempts = tried + hits)


def check_and_download(args):
    """Probe one URL with a HEAD request and, on 200, download it to disk.

    Args:
        args: a ``(url, path)`` tuple — the remote URL to probe and the
            local ``Path`` to save the body to on success.

    Returns:
        ``(success, url, size_in_bytes)``; ``size_in_bytes`` is 0 on failure.

    Side effects (on success): creates parent directories, writes the file,
    appends ``url -> path`` to HITS_LOG, and increments the global ``hits``.
    On any miss or error it increments the global ``tried`` instead.  Note
    that ``tried`` deliberately counts only non-hits, so the total number of
    attempts is ``tried + hits`` — the status printout relies on this.
    """
    global hits, tried
    url, path = args

    try:
        # Cheap existence probe before committing to a full download.
        r = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
        if r.status_code == 200:
            # HEAD looked good: fetch the body for real.
            r = requests.get(url, headers=HEADERS, timeout=30)
            if r.status_code == 200 and len(r.content) > 0:
                path.parent.mkdir(parents=True, exist_ok=True)
                path.write_bytes(r.content)
                with lock:
                    hits += 1
                    with open(HITS_LOG, "a", encoding="utf-8") as f:
                        f.write(f"{url} -> {path}\n")
                return True, url, len(r.content)
    except (requests.RequestException, OSError):
        # Timeouts / connection errors / disk errors are expected while
        # scanning; treat them as misses.  (Was a bare `except:`, which also
        # swallowed KeyboardInterrupt and made Ctrl+C unreliable with 30
        # workers in flight.)
        pass

    with lock:
        tried += 1
    return False, url, 0


def generate_urls(start_id, end_id):
    """Build every (url, local_path) candidate pair for IDs start_id..end_id.

    For each ID and each (directory, suffixes) entry in DIRS, the candidates
    are "<id>_<suffix><ext>" for every suffix/extension combination, followed
    by the bare "<id><ext>" forms.  An empty directory name places the file
    directly under BASE_URL (and directly in DOWNLOAD_DIR).
    """
    candidates = []
    for file_id in range(start_id, end_id + 1):
        for dirname, suffixes in DIRS:
            prefix = f"{BASE_URL}{dirname}/" if dirname else BASE_URL
            subdir = dirname or ""

            # Suffixed names first (suffix-major, extension inner), then the
            # plain-ID names — same ordering the scanner has always used.
            names = [f"{file_id}_{sfx}{ext}" for sfx in suffixes for ext in EXTS]
            names.extend(f"{file_id}{ext}" for ext in EXTS)

            candidates.extend(
                (f"{prefix}{name}", DOWNLOAD_DIR / subdir / name) for name in names
            )
    return candidates


# ---- Entry point: sweep every configured ID range with a thread pool ----
banner = "=" * 60
print(banner)
print("SMART PARALLEL SCANNER - 30 WORKERS")
print(banner)
print(f"Focused ID ranges: {len(ID_RANGES)}")
print("Using HEAD checks before downloading")
print(banner)

for lo, hi in ID_RANGES:
    candidates = generate_urls(lo, hi)
    print(f"\n[RANGE] IDs {lo}-{hi} ({len(candidates)} URLs)")

    # Fan the candidates out to 30 workers; report hits as they complete.
    with ThreadPoolExecutor(max_workers=30) as pool:
        pending = [pool.submit(check_and_download, c) for c in candidates]
        for done in as_completed(pending):
            ok, hit_url, nbytes = done.result()
            if ok:
                print(f"[HIT] {hit_url} ({nbytes} bytes)")

    # `tried` counts only misses, so tried + hits is the total checked.
    print(f"[STATUS] Checked: {tried + hits}, Hits: {hits}")

print("\n" + banner)
print(f"COMPLETE - Total hits: {hits}")
print(f"Files saved in: {DOWNLOAD_DIR}")
print(banner)
