"""
Fast URL Checker - Uses HEAD requests and threading to quickly scan for valid URLs
Does NOT download files - just logs which URLs exist
"""
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import time

# Root of the INM open-data repository that all candidate URLs are built from.
BASE_URL = "https://repodatos.atdt.gob.mx/api_update/inm/"

# Browser-like User-Agent: some government servers reject default
# python-requests agents.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

# Output files
VALID_URLS = Path("valid_urls.txt")    # URLs that answered HTTP 200
CHECKED_URLS = Path("checked_urls.txt")  # every URL probed, valid or not

# Sub-directories observed (or suspected) under BASE_URL. The empty string
# makes the generator also probe files sitting directly in the base directory.
KNOWN_DIRECTORIES = [
    "capacitacion_servidores_publicos_inm",
    "estadisticas_control_verificacion_migratoria",
    "estadisticas_regulacion_migratoria",
    "migrantes_atendidos_grupos_beta_proteccion_migrantes",
    "personas_extranjeras_victimas_delito",
    "programa_paisano",
    "programa_pesca_deportiva_turismo_nautico",
    "salida_menores_pais",
    "",  # base directory
]

# Candidate filename stems tried after the numeric id ("<id>_<suffix><ext>").
COMMON_SUFFIXES = [
    "menores_viajan",
    "tramites_migratorios",
    "documentos_migratorios",
    "visitas_verificacion",
    "revisiones_migratorias",
    "atendidos_grupo_beta",
    "victimas_delito",
    "victimas_delitos",
    "peticiones_paisano",
    "atenciones_programa_paisanos",
    "quejas_paisano",
    "pesca_deportiva",
    "servidores_capacitados",
    "datos",
    "registros",
    "estadisticas",
    "reporte",
    "informe",
    "migrantes",
    "extranjeros",
    "deportados",
    "detenidos",
    "rechazados",
    "admitidos",
    "visas",
    "permisos",
    "residentes",
    "naturalizados",
    "refugiados",
    "asilo",
]

# File extensions tried for every directory/id/suffix combination.
EXTENSIONS = [".csv", ".json", ".xlsx"]


def check_url(url):
    """Probe *url* with a quick HEAD request to see whether it exists.

    Follows redirects so a resource served behind a redirect still counts
    as valid. Never downloads the body.

    Args:
        url: Fully-qualified URL to probe.

    Returns:
        Tuple ``(url, exists)`` where ``exists`` is True only for HTTP 200.
    """
    try:
        r = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
        return url, r.status_code == 200
    except requests.RequestException:
        # Catch only network/HTTP errors. The original bare `except:` also
        # swallowed KeyboardInterrupt/SystemExit, which made a multi-hour
        # scan impossible to cancel with Ctrl-C.
        return url, False


def generate_urls(start_id=1, end_id=10000):
    """Build the complete list of candidate URLs to probe.

    For every known directory and every id in [start_id, end_id], emits
    "<id>_<suffix><ext>" for each suffix/extension pair, followed by the
    bare "<id><ext>" variants.
    """
    candidates = []

    for folder in KNOWN_DIRECTORIES:
        # The empty-string entry means "probe the base directory itself".
        prefix = f"{BASE_URL}{folder}/" if folder else BASE_URL

        for num in range(start_id, end_id + 1):
            # id + suffix patterns, then plain-id patterns (same order as before)
            candidates.extend(
                f"{prefix}{num}_{tag}{ext}"
                for tag in COMMON_SUFFIXES
                for ext in EXTENSIONS
            )
            candidates.extend(f"{prefix}{num}{ext}" for ext in EXTENSIONS)

    return candidates


def main():
    """Generate every candidate URL and probe them concurrently.

    Logs every probed URL to CHECKED_URLS and every HTTP-200 hit to
    VALID_URLS, printing progress every 1000 checks.
    """
    print("=" * 70)
    print("Fast URL Checker - INM Repository")
    print("=" * 70)
    print(f"Using {len(KNOWN_DIRECTORIES)} directories")
    print(f"Using {len(COMMON_SUFFIXES)} suffixes")
    print("Generating URLs...")

    urls = generate_urls(1, 10000)
    total = len(urls)
    print(f"Total URLs to check: {total:,}")
    print("=" * 70)

    valid_count = 0
    checked_count = 0

    # Open each log file exactly once for the whole run. The original
    # re-opened both files in append mode for every single result —
    # tens of thousands of open/close cycles. Mode "w" also truncates,
    # replacing the separate write_text("") clearing step.
    with open(CHECKED_URLS, "w") as checked_f, open(VALID_URLS, "w") as valid_f:
        # Threads overlap the HEAD-request latency (I/O-bound workload).
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = {executor.submit(check_url, url): url for url in urls}

            for future in as_completed(futures):
                url, exists = future.result()
                checked_count += 1
                checked_f.write(f"{url}\n")

                if exists:
                    valid_count += 1
                    print(f"[VALID] {url}")
                    valid_f.write(f"{url}\n")
                    # Flush hits immediately: the scan can run for hours and
                    # valid URLs are the data we cannot afford to lose.
                    valid_f.flush()

                if checked_count % 1000 == 0:
                    print(f"[PROGRESS] Checked {checked_count:,} / {total:,} URLs - Found {valid_count} valid")

    print("\n" + "=" * 70)
    print(f"COMPLETE: Checked {checked_count:,} URLs")
    print(f"Found {valid_count} valid URLs")
    print(f"Valid URLs saved to: {VALID_URLS}")
    print("=" * 70)


# Standard script entry guard: run the scan only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
