"""
Fast Parallel Scanner - Streams URLs and downloads in real-time with 30 workers
"""
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
import threading
import sys

# Line-buffered stdout so hits and progress appear in real time, even when piped
sys.stdout.reconfigure(line_buffering=True)

# Config
BASE_URL = "https://repodatos.atdt.gob.mx/api_update/inm/"
DOWNLOAD_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look\inm_downloads")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

LOG_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look")
HITS_LOG = LOG_DIR / "hits.txt"

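# Locks guard the hit log and the shared counters across worker threads.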
log_lock = threading.Lock()
stats = {"hits": 0, "tried": 0}
stats_lock = threading.Lock()

MAX_WORKERS = 30
START_ID, END_ID = 1, 10000  # inclusive range of file IDs to probe

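# Browser-like User-Agent; some servers reject the default python-requests UA.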
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

DIRECTORIES = [
    "capacitacion_servidores_publicos_inm",
    "estadisticas_control_verificacion_migratoria",
    "estadisticas_regulacion_migratoria",
    "migrantes_atendidos_grupos_beta_proteccion_migrantes",
    "personas_extranjeras_victimas_delito",
    "programa_paisano",
    "programa_pesca_deportiva_turismo_nautico",
    "salida_menores_pais",
    "",
]

SUFFIXES = [
    "menores_viajan", "tramites_migratorios", "documentos_migratorios",
    "visitas_verificacion", "revisiones_migratorias", "atendidos_grupo_beta",
    "victimas_delito", "victimas_delitos", "peticiones_paisano",
    "atenciones_programa_paisanos", "quejas_paisano", "pesca_deportiva",
    "servidores_capacitados", "datos", "registros", "estadisticas",
    "reporte", "informe", "migrantes", "extranjeros",
]

EXTENSIONS = [".csv", ".json", ".xlsx", ".pdf", ".xml", ".zip", ".doc", ".docx", ".txt", ".xls"]


def try_download(url, local_path):
    """Try to download a URL. Returns (success, url, path)."""
    with stats_lock:
        stats["tried"] += 1

    try:
        # Stream the body in chunks so large files never sit fully in memory;
        # the context manager releases the connection back to the pool.
        with requests.get(url, headers=HEADERS, stream=True, timeout=10) as r:
            # Any 200 counts as a hit; a server that returns 200 error pages
            # would need a content check here.
            if r.status_code == 200:
                local_path.parent.mkdir(parents=True, exist_ok=True)
                with open(local_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

                with log_lock:
                    with open(HITS_LOG, "a", encoding="utf-8") as f:
                        f.write(f"{url} -> {local_path}\n")

                with stats_lock:
                    stats["hits"] += 1

                return True, url, local_path
    except requests.RequestException:
        # Timeouts and connection errors count as misses; keep scanning.
        pass

    return False, url, None

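# Optional tweak (untested sketch): reuse one requests.Session per thread so
# TCP connections stay alive across the millions of requests, e.g.:
#
#   _local = threading.local()
#   def _session():
#       if not hasattr(_local, "s"):
#           _local.s = requests.Session()
#           _local.s.headers.update(HEADERS)
#       return _local.s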

def url_generator(start_id, end_id):
    """Generate URLs one at a time."""
    for file_id in range(start_id, end_id + 1):
        for directory in DIRECTORIES:
            base = f"{BASE_URL}{directory}/" if directory else BASE_URL
            dir_path = directory  # "" keeps files in the download root

            # ID + suffix patterns
            for suffix in SUFFIXES:
                for ext in EXTENSIONS:
                    filename = f"{file_id}_{suffix}{ext}"
                    url = f"{base}{filename}"
                    local_path = DOWNLOAD_DIR / dir_path / filename
                    yield url, local_path

            # Just ID
            for ext in EXTENSIONS:
                filename = f"{file_id}{ext}"
                url = f"{base}{filename}"
                local_path = DOWNLOAD_DIR / dir_path / filename
                yield url, local_path


def main():
    print("=" * 60)
    print("FAST PARALLEL SCANNER - 30 WORKERS")
    print("=" * 60)
    print(f"Scanning IDs 1-10000")
    print(f"Directories: {len(DIRECTORIES)}")
    print(f"Suffixes: {len(SUFFIXES)}")
    print(f"Extensions: {len(EXTENSIONS)}")

    # Calculate total
    urls_per_id = len(DIRECTORIES) * (len(SUFFIXES) * len(EXTENSIONS) + len(EXTENSIONS))
    total = urls_per_id * (END_ID - START_ID + 1)
    print(f"Total URLs to try: {total:,}")
    print("=" * 60)
    print("Starting scan...\n")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {}
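        # Keep a rolling window of ~batch_size futures in flight; submitting
        # all ~19M URLs at once would exhaust memory.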
        batch_size = 1000
        url_gen = url_generator(START_ID, END_ID)

        # Submit initial batch
        for _ in range(batch_size):
            try:
                url, path = next(url_gen)
                futures[executor.submit(try_download, url, path)] = url
            except StopIteration:
                break

        processed = 0
        while futures:
            # Block until at least one in-flight request finishes.
            done, _ = wait(futures, return_when=FIRST_COMPLETED)
            for future in done:
                success, url, _path = future.result()
                del futures[future]
                processed += 1

                if success:
                    print(f"[HIT] {url}")

                # Refill the window with the next candidate URL, if any remain.
                try:
                    new_url, new_path = next(url_gen)
                    futures[executor.submit(try_download, new_url, new_path)] = new_url
                except StopIteration:
                    pass

                # Progress report every 5000 completed requests.
                if processed % 5000 == 0:
                    with stats_lock:
                        hits = stats["hits"]
                    pct = (processed / total) * 100
                    print(f"[PROGRESS] {processed:,}/{total:,} ({pct:.1f}%) - Hits: {hits}")

    print("\n" + "=" * 60)
    print(f"COMPLETE: {stats['tried']:,} tried, {stats['hits']} hits")
    print(f"Files in: {DOWNLOAD_DIR}")
    print("=" * 60)


if __name__ == "__main__":
    main()
