"""
INM Parallel Downloader - 30 concurrent connections for fast scanning
"""
import requests
from pathlib import Path
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Base configuration
BASE_URL = "https://repodatos.atdt.gob.mx/api_update/inm/"
DOWNLOAD_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look\inm_downloads")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Log files
LOG_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look")
HITS_LOG = LOG_DIR / "hits.txt"
MISSES_LOG = LOG_DIR / "misses.txt"
ALL_URLS_LOG = LOG_DIR / "all_attempted_urls.txt"

# Thread lock for file writing
log_lock = threading.Lock()

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Parallel settings
MAX_WORKERS = 30

# Known directories
KNOWN_DIRECTORIES = [
    "capacitacion_servidores_publicos_inm",
    "estadisticas_control_verificacion_migratoria",
    "estadisticas_regulacion_migratoria",
    "migrantes_atendidos_grupos_beta_proteccion_migrantes",
    "personas_extranjeras_victimas_delito",
    "programa_paisano",
    "programa_pesca_deportiva_turismo_nautico",
    "salida_menores_pais",
    "",  # base directory
]

# Common suffixes
COMMON_SUFFIXES = [
    "menores_viajan",
    "tramites_migratorios",
    "documentos_migratorios",
    "visitas_verificacion",
    "revisiones_migratorias",
    "atendidos_grupo_beta",
    "victimas_delito",
    "victimas_delitos",
    "peticiones_paisano",
    "atenciones_programa_paisanos",
    "quejas_paisano",
    "pesca_deportiva",
    "servidores_capacitados",
    "datos",
    "registros",
    "estadisticas",
    "reporte",
    "informe",
    "migrantes",
    "extranjeros",
    "deportados",
    "detenidos",
    "rechazados",
    "residentes",
    "naturalizados",
    "refugiados",
]

# File extensions to try for each candidate filename, roughly in priority order
EXTENSIONS = [".csv", ".json", ".xlsx", ".xls", ".pdf", ".xml", ".zip", ".doc", ".docx", ".txt"]

# Stats
stats = {"hits": 0, "misses": 0, "attempted": 0}
stats_lock = threading.Lock()


def log_hit(url, local_path):
    with log_lock:
        with open(HITS_LOG, "a", encoding="utf-8") as f:
            f.write(f"{url} -> {local_path}\n")


def log_miss(url):
    with log_lock:
        with open(MISSES_LOG, "a", encoding="utf-8") as f:
            f.write(f"{url}\n")


def log_attempt(url):
    with log_lock:
        with open(ALL_URLS_LOG, "a", encoding="utf-8") as f:
            f.write(f"{url}\n")


def download_file(url: str, local_path: Path):
    """Download a file - returns True if successful."""
    log_attempt(url)

    with stats_lock:
        stats["attempted"] += 1

    try:
        with requests.get(url, headers=HEADERS, stream=True, timeout=15) as r:
            if r.status_code == 200:
                local_path.parent.mkdir(parents=True, exist_ok=True)
                with open(local_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                log_hit(url, local_path)
                with stats_lock:
                    stats["hits"] += 1
                return True, url, local_path
            else:
                log_miss(url)
                with stats_lock:
                    stats["misses"] += 1
                return False, url, None
    except Exception:
        # Treat network and filesystem errors as misses
        log_miss(url)
        with stats_lock:
            stats["misses"] += 1
        return False, url, None
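
# Optional sketch (not wired into download_file above): reusing one
# requests.Session per worker thread keeps TCP/TLS connections alive across
# the thousands of probes each thread makes, which is typically faster than
# opening a fresh connection for every requests.get() call. To adopt it, swap
# requests.get(...) in download_file for _session().get(...). The helper
# names (_thread_local, _session) are illustrative, not part of the original
# script.
_thread_local = threading.local()


def _session() -> requests.Session:
    """Return a per-thread Session so each worker reuses its connections."""
    if not hasattr(_thread_local, "session"):
        s = requests.Session()
        s.headers.update(HEADERS)
        _thread_local.session = s
    return _thread_local.session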


def generate_urls_for_id(file_id: int, directory: str):
    """Generate all URL variants for a single ID in a directory."""
    urls = []
    base = f"{BASE_URL}{directory}/" if directory else BASE_URL

    # ID + suffix patterns
    for suffix in COMMON_SUFFIXES:
        for ext in EXTENSIONS:
            filename = f"{file_id}_{suffix}{ext}"
            url = f"{base}{filename}"
            local_path = DOWNLOAD_DIR / directory / filename if directory else DOWNLOAD_DIR / filename
            urls.append((url, local_path))

    # Just ID patterns
    for ext in EXTENSIONS:
        filename = f"{file_id}{ext}"
        url = f"{base}{filename}"
        local_path = DOWNLOAD_DIR / directory / filename if directory else DOWNLOAD_DIR / filename
        urls.append((url, local_path))

    return urls


def process_directory_json(current_path: str = ""):
    """Download all listed files via JSON API."""
    url = urllib.parse.urljoin(BASE_URL, current_path.rstrip("/") + "/") if current_path else BASE_URL
    print(f"[EXPLORING] {url}")

    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        print(f"[ERROR] Could not fetch {url}: {e}")
        return []

    found_files = []

    if not isinstance(data, list):
        return found_files

    for item in data:
        name = item.get("name")
        if not name:
            continue

        item_type = item.get("type", "unknown")

        if item_type == "directory":
            sub_path = f"{current_path.rstrip('/')}/{name}/" if current_path else f"{name}/"
            found_files.extend(process_directory_json(sub_path))

        elif item_type in ("file", "unknown"):
            # Avoid a leading "/" when current_path is empty, which would make
            # urljoin drop the /api_update/inm/ prefix.
            rel_name = f"{current_path.rstrip('/')}/{name}" if current_path else name
            file_url = urllib.parse.urljoin(BASE_URL, rel_name)
            local_path = DOWNLOAD_DIR / current_path / name
            success, _, _ = download_file(file_url, local_path)
            if success:
                print(f"[DOWNLOADED] {name}")
                found_files.append(file_url)

    return found_files


def parallel_scan(start_id: int, end_id: int):
    """Scan IDs in parallel across all directories."""
    print(f"\n[PARALLEL SCAN] IDs {start_id}-{end_id} with {MAX_WORKERS} workers")

    all_hits = []

    # Generate all URLs to scan
    all_urls = []
    for directory in KNOWN_DIRECTORIES:
        for file_id in range(start_id, end_id + 1):
            all_urls.extend(generate_urls_for_id(file_id, directory))

    total_urls = len(all_urls)
    print(f"[TOTAL URLS TO SCAN] {total_urls:,}")

    processed = 0
    last_report = 0

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(download_file, url, path) for url, path in all_urls]

        for future in as_completed(futures):
            success, url, local_path = future.result()
            processed += 1

            if success:
                print(f"[HIT] {url}")
                all_hits.append(url)

            # Progress report every 1000
            if processed - last_report >= 1000:
                last_report = processed
                pct = (processed / total_urls) * 100
                print(f"[PROGRESS] {processed:,}/{total_urls:,} ({pct:.1f}%) - Hits: {stats['hits']}")

    return all_hits
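
# Optional sketch, not called by main(): parallel_scan() builds every
# (url, path) tuple before submitting, which for a wide ID range
# (directories x suffixes x extensions) can mean tens of millions of futures
# held in memory at once. The variant below submits work in bounded chunks
# instead; the function name and chunk_size default are illustrative, not
# part of the original script.
def parallel_scan_chunked(start_id: int, end_id: int, chunk_size: int = 20000):
    """Like parallel_scan(), but keeps at most ~chunk_size tasks queued at once."""
    all_hits = []
    batch = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

        def flush():
            # Submit the current batch and drain it before generating more URLs
            futures = [executor.submit(download_file, url, path) for url, path in batch]
            for future in as_completed(futures):
                success, url, _ = future.result()
                if success:
                    print(f"[HIT] {url}")
                    all_hits.append(url)
            batch.clear()

        for directory in KNOWN_DIRECTORIES:
            for file_id in range(start_id, end_id + 1):
                batch.extend(generate_urls_for_id(file_id, directory))
                if len(batch) >= chunk_size:
                    flush()
        if batch:
            flush()

    return all_hits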


def scan_hidden_directories():
    """Try to find unlisted directories."""
    print("\n[SCANNING] Hidden directories...")

    potential_dirs = [
        "datos", "data", "archivos", "files", "documentos", "documents",
        "reportes", "reports", "estadisticas", "historico", "backup",
        "old", "archive", "temp", "test", "dev", "api", "v1", "v2",
        "2024", "2025", "2026", "privado", "interno", "confidencial",
        "extranjeros", "migrantes", "visas", "permisos", "tramites",
        "solicitudes", "deportaciones", "detenciones", "fronteras",
    ]

    found = []

    def check_dir(dirname):
        url = f"{BASE_URL}{dirname}/"
        try:
            r = requests.get(url, headers=HEADERS, timeout=10)
            if r.status_code == 200:
                return dirname
        except requests.RequestException:
            pass
        return None

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(check_dir, d): d for d in potential_dirs}
        for future in as_completed(futures):
            result = future.result()
            if result:
                print(f"[HIDDEN DIR] {result}")
                found.append(result)

    return found


def main():
    print("=" * 70)
    print("INM PARALLEL DOWNLOADER - 30 CONCURRENT CONNECTIONS")
    print("=" * 70)
    print(f"Base URL: {BASE_URL}")
    print(f"Download dir: {DOWNLOAD_DIR}")
    print(f"Workers: {MAX_WORKERS}")
    print("=" * 70)

    # Phase 1: Listed files
    print("\n[PHASE 1] Downloading listed files...")
    listed = process_directory_json()
    print(f"[PHASE 1 COMPLETE] {len(listed)} files")

    # Phase 2: Hidden directories
    print("\n[PHASE 2] Scanning hidden directories...")
    hidden = scan_hidden_directories()
    for hdir in hidden:
        process_directory_json(hdir)

    # Phase 3: Predictive parallel scan
    print("\n[PHASE 3] Predictive ID scan (1-10000)...")
    hits = parallel_scan(1, 10000)
    print(f"[PHASE 3 COMPLETE] {len(hits)} hits")

    # Summary
    print("\n" + "=" * 70)
    print("SCAN COMPLETE")
    print("=" * 70)
    print(f"Total hits: {stats['hits']}")
    print(f"Total attempted: {stats['attempted']:,}")
    print(f"Files saved in: {DOWNLOAD_DIR}")
    print(f"Hit log: {HITS_LOG}")


if __name__ == "__main__":
    main()
