import requests
from pathlib import Path
import time
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed
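
# This script walks the INM open-data repository in three phases (see main()):
# it mirrors every file listed by the JSON directory API, probes for unlisted
# directories, then brute-forces ID-prefixed filenames inside each directory.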

# Base configuration
BASE_URL = "https://repodatos.atdt.gob.mx/api_update/inm/"
DOWNLOAD_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look\inm_downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Log files - in main folder
LOG_DIR = Path(r"C:\Users\Squir\Desktop\Mexico- Second Look")
HITS_LOG = LOG_DIR / "hits.txt"
MISSES_LOG = LOG_DIR / "misses.txt"
ALL_URLS_LOG = LOG_DIR / "all_attempted_urls.txt"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Known directories from research
KNOWN_DIRECTORIES = [
    "capacitacion_servidores_publicos_inm",
    "estadisticas_control_verificacion_migratoria",
    "estadisticas_regulacion_migratoria",
    "migrantes_atendidos_grupos_beta_proteccion_migrantes",
    "personas_extranjeras_victimas_delito",
    "programa_paisano",
    "programa_pesca_deportiva_turismo_nautico",
    "salida_menores_pais",
]

# Known file patterns from research (for predictive guessing)
KNOWN_FILE_PREFIXES = [
    "01_servidores_capacitados",
    "02_visitas_verificacion",
    "03_revisiones_migratorias",
    "04_documentos_migratorios",
    "05_",  # gap - try to find
    "06_atendidos_grupo_beta",
    "07_victimas_delitos",
    "08_peticiones_paisano",
    "09_atenciones_programa_paisanos",
    "10_",  # gap
    "11_quejas_paisano",
    "12_pesca_deportiva",
]
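# NOTE: KNOWN_FILE_PREFIXES (and predictive_number_scan below) are not wired
# into main() yet; they are kept for manual, targeted guessing runs.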

# File extensions to try - comprehensive list
EXTENSIONS = [
    # Data files
    ".csv", ".json", ".xlsx", ".xls", ".xml", ".txt", ".dat", ".sql", ".db", ".sqlite",
    # Documents
    ".pdf", ".doc", ".docx", ".odt", ".rtf",
    # Spreadsheets
    ".ods", ".xlsm", ".xlsb",
    # Presentations
    ".ppt", ".pptx", ".odp",
    # Archives
    ".zip", ".rar", ".7z", ".tar", ".gz", ".tar.gz", ".tgz",
    # Images
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".svg",
    # Web
    ".html", ".htm", ".php", ".asp", ".aspx", ".jsp",
    # Config/Other
    ".yml", ".yaml", ".ini", ".cfg", ".conf", ".log", ".bak", ".backup",
]

# Date ranges for predictive URLs
YEARS = range(2010, 2027)
MONTHS = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]


def log_hit(url, local_path):
    with open(HITS_LOG, "a", encoding="utf-8") as f:
        f.write(f"{url} -> {local_path}\n")


def log_miss(url):
    with open(MISSES_LOG, "a", encoding="utf-8") as f:
        f.write(f"{url}\n")


def log_attempt(url):
    with open(ALL_URLS_LOG, "a", encoding="utf-8") as f:
        f.write(f"{url}\n")


def download_file(url: str, local_path: Path, silent=False):
    """Download a file, logging the attempt and the resulting hit or miss."""
    log_attempt(url)

    try:
        with requests.get(url, headers=HEADERS, stream=True, timeout=30) as r:
            if r.status_code == 200:
                local_path.parent.mkdir(parents=True, exist_ok=True)
                with open(local_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                if not silent:
                    print(f"[HIT] {url}")
                    print(f"      -> Saved: {local_path}")
                log_hit(url, local_path)
                return True
            # Any non-200 status is a miss; log it without printing to avoid
            # spamming the console during large scans.
            log_miss(url)
            return False
    except Exception:
        # Network and filesystem errors are counted as misses as well.
        log_miss(url)
        return False


def check_url_exists(url: str):
    """Quick HEAD request to check whether a URL exists."""
    try:
        r = requests.head(url, headers=HEADERS, timeout=10, allow_redirects=True)
        return r.status_code == 200
    except requests.RequestException:
        return False
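

# A minimal sketch (not called by main) of how the imported ThreadPoolExecutor
# could batch these HEAD probes; max_workers=8 is an assumption, so tune it
# with the server's tolerance in mind.
def probe_urls_concurrently(urls, max_workers=8):
    """Return the subset of `urls` that answers a HEAD request with 200."""
    live = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its URL so hits can be reported.
        futures = {pool.submit(check_url_exists, u): u for u in urls}
        for fut in as_completed(futures):
            if fut.result():
                live.append(futures[fut])
    return live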


def process_directory_json(current_path: str = ""):
    """Process directory via JSON API response."""
    url = urllib.parse.urljoin(BASE_URL, current_path.rstrip("/") + "/") if current_path else BASE_URL
    print(f"\n[EXPLORING] {url}")

    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        print(f"[ERROR] Could not fetch {url}: {e}")
        return []

    found_files = []

    if not isinstance(data, list):
        print(f"[WARN] Unexpected response format at {url}")
        return found_files

    for item in data:
        name = item.get("name")
        if not name:
            continue

        item_type = item.get("type", "unknown")

        if item_type == "directory":
            sub_path = f"{current_path.rstrip('/')}/{name}/" if current_path else f"{name}/"
            found_files.extend(process_directory_json(sub_path))

        elif item_type in ("file", "unknown"):
            file_url = urllib.parse.urljoin(BASE_URL, f"{current_path.rstrip('/')}/{name}")
            local_path = DOWNLOAD_DIR / current_path / name
            if download_file(file_url, local_path):
                found_files.append(file_url)
            time.sleep(0.5)

    return found_files


def predictive_number_scan(directory: str, prefix: str, start: int, end: int, extensions=None):
    """Scan for numbered files like 001_file.csv, 002_file.csv, etc."""
    if extensions is None:
        extensions = EXTENSIONS

    hits = []
    print(f"\n[PREDICTIVE] Scanning {directory} for {prefix}[{start}-{end}]...")

    for num in range(start, end + 1):
        for ext in extensions:
            # Try 1-, 2-, and 3-digit zero-padded forms, deduplicated so a
            # value like 120 is not requested three times over.
            for fmt in dict.fromkeys((f"{num:01d}", f"{num:02d}", f"{num:03d}")):
                filename = f"{fmt}_{prefix}{ext}" if prefix else f"{fmt}{ext}"
                url = f"{BASE_URL}{directory}/{filename}"
                local_path = DOWNLOAD_DIR / directory / filename

                if download_file(url, local_path, silent=True):
                    print(f"[PREDICTIVE HIT] {url}")
                    hits.append(url)

                time.sleep(0.3)

    return hits
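
# Hypothetical invocation (not part of main's flow); the directory and suffix
# are illustrative, drawn from the known lists above:
#   predictive_number_scan("programa_paisano", "peticiones_paisano", 1, 20)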


def predictive_date_scan(directory: str, base_name: str):
    """Scan for date-based files like file_2015-2025.csv, file_072022-062025.csv"""
    hits = []
    print(f"\n[PREDICTIVE] Scanning {directory} for date-based {base_name}...")

    # Try year ranges
    for start_year in YEARS:
        for end_year in range(start_year, YEARS.stop):  # same horizon as YEARS
            for ext in EXTENSIONS:
                # Format: name_2015-2025.csv
                filename = f"{base_name}_{start_year}-{end_year}{ext}"
                url = f"{BASE_URL}{directory}/{filename}"
                local_path = DOWNLOAD_DIR / directory / filename

                if download_file(url, local_path, silent=True):
                    print(f"[PREDICTIVE HIT] {url}")
                    hits.append(url)

                # Format: name_2015-062025.csv (with month)
                for month in MONTHS:
                    filename = f"{base_name}_{start_year}-{month}{end_year}{ext}"
                    url = f"{BASE_URL}{directory}/{filename}"
                    local_path = DOWNLOAD_DIR / directory / filename

                    if download_file(url, local_path, silent=True):
                        print(f"[PREDICTIVE HIT] {url}")
                        hits.append(url)

                time.sleep(0.2)

    return hits


def predictive_id_scan(directory: str, start_id: int = 1, end_id: int = 10000):
    """Scan for ID-prefixed files like 534_menores_viajan.csv, 141_tramites.csv"""
    hits = []
    print(f"\n[PREDICTIVE] Scanning {directory} for ID-prefixed files [{start_id}-{end_id}]...")

    # Common suffixes found in INM data
    common_suffixes = [
        "menores_viajan",
        "tramites_migratorios",
        "documentos_migratorios",
        "visitas_verificacion",
        "revisiones_migratorias",
        "atendidos_grupo_beta",
        "victimas_delito",
        "victimas_delitos",
        "peticiones_paisano",
        "atenciones_programa_paisanos",
        "quejas_paisano",
        "pesca_deportiva",
        "servidores_capacitados",
        # Generic
        "datos",
        "registros",
        "estadisticas",
        "reporte",
        "informe",
    ]

    # Priority extensions for ID scan (most common first)
    priority_ext = [".csv", ".json", ".xlsx", ".xls", ".pdf", ".xml", ".zip", ".doc", ".docx", ".txt"]

    for file_id in range(start_id, end_id + 1):
        for suffix in common_suffixes:
            for ext in priority_ext:
                filename = f"{file_id}_{suffix}{ext}"
                url = f"{BASE_URL}{directory}/{filename}"
                local_path = DOWNLOAD_DIR / directory / filename

                if download_file(url, local_path, silent=True):
                    print(f"[PREDICTIVE HIT] {url}")
                    hits.append(url)

        # Also try just the ID number
        for ext in priority_ext:
            filename = f"{file_id}{ext}"
            url = f"{BASE_URL}{directory}/{filename}"
            local_path = DOWNLOAD_DIR / directory / filename

            if download_file(url, local_path, silent=True):
                print(f"[PREDICTIVE HIT] {url}")
                hits.append(url)

        if file_id % 50 == 0:
            print(f"[PROGRESS] Scanned IDs up to {file_id}...")

        time.sleep(0.2)

    return hits


def scan_hidden_directories():
    """Try to find unlisted directories."""
    print("\n[PREDICTIVE] Scanning for hidden/unlisted directories...")

    # Common government directory naming patterns
    potential_dirs = [
        "datos",
        "data",
        "archivos",
        "files",
        "documentos",
        "documents",
        "reportes",
        "reports",
        "estadisticas",
        "statistics",
        "historico",
        "historical",
        "backup",
        "old",
        "archive",
        "temp",
        "test",
        "dev",
        "api",
        "v1",
        "v2",
        "2024",
        "2025",
        "2026",
        "enero",
        "febrero",
        "marzo",
        "privado",
        "interno",
        "confidencial",
        "extranjeros",
        "migrantes",
        "visas",
        "permisos",
        "tramites",
        "solicitudes",
        "resoluciones",
        "deportaciones",
        "detenciones",
        "estaciones_migratorias",
        "puntos_revision",
        "fronteras",
        "aeropuertos",
        "puertos",
    ]

    found_dirs = []
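    # A 200 status is taken to mean the directory exists. Some servers answer
    # 200 with an HTML error page instead, so treat hits here as candidates to
    # be confirmed by process_directory_json().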

    for dirname in potential_dirs:
        url = f"{BASE_URL}{dirname}/"
        try:
            r = requests.get(url, headers=HEADERS, timeout=10)
            if r.status_code == 200:
                print(f"[HIDDEN DIR FOUND] {url}")
                found_dirs.append(dirname)
                log_hit(url, f"DIRECTORY: {dirname}")
        except requests.RequestException:
            pass
        time.sleep(0.3)

    return found_dirs


def main():
    print("=" * 70)
    print("INM Data Repository - Comprehensive Download & Predictive Scanner")
    print("=" * 70)
    print(f"Base URL: {BASE_URL}")
    print(f"Download directory: {DOWNLOAD_DIR.resolve()}")
    print("=" * 70)

    all_hits = []

    # Phase 1: Download all listed files via JSON API
    print("\n" + "=" * 70)
    print("PHASE 1: Downloading all listed files via directory API")
    print("=" * 70)
    listed_files = process_directory_json()
    all_hits.extend(listed_files)
    print(f"\n[PHASE 1 COMPLETE] Downloaded {len(listed_files)} listed files")

    # Phase 2: Scan for hidden directories
    print("\n" + "=" * 70)
    print("PHASE 2: Scanning for hidden/unlisted directories")
    print("=" * 70)
    hidden_dirs = scan_hidden_directories()

    # Process any hidden directories found
    for hdir in hidden_dirs:
        print(f"\n[PROCESSING HIDDEN DIR] {hdir}")
        hidden_files = process_directory_json(hdir)
        all_hits.extend(hidden_files)

    # Phase 3: Predictive ID scanning on each known directory
    print("\n" + "=" * 70)
    print("PHASE 3: Predictive ID-based file scanning")
    print("=" * 70)

    for directory in KNOWN_DIRECTORIES:
        id_hits = predictive_id_scan(directory, start_id=1, end_id=10000)
        all_hits.extend(id_hits)

    # Also scan base directory
    id_hits = predictive_id_scan("", start_id=1, end_id=10000)
    all_hits.extend(id_hits)

    # Summary
    print("\n" + "=" * 70)
    print("SCAN COMPLETE - SUMMARY")
    print("=" * 70)
    print(f"Total files downloaded: {len(all_hits)}")
    print(f"Check '{HITS_LOG}' for all successful downloads")
    print(f"Check '{MISSES_LOG}' for failed attempts")
    print(f"Check '{ALL_URLS_LOG}' for all attempted URLs")
    print(f"Files saved in: {DOWNLOAD_DIR.resolve()}")


if __name__ == "__main__":
    main()
