#!/usr/bin/env python3
"""
Colombian Government ArcGIS Server Discovery & Catalog Tool
Probes domains for ArcGIS REST endpoints, catalogs services found.
Also checks crt.sh for GIS-related subdomains.
"""

import json
import os
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone

# --- Configuration ---
# Output root for all catalogs/summaries (Windows path; raw string keeps backslashes).
BASE_DIR = r"C:\Users\Squir\Desktop\COLOMBIA\DUMP 2_25_2026"
# Default per-request timeout in seconds (crt.sh calls override this to 30).
TIMEOUT = 15
# Browser-like UA so servers don't reject urllib's default Python agent string.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# SSL context that doesn't verify (many gov servers have cert issues)
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# ArcGIS path patterns to try
# NOTE: the trailing-slash entries duplicate the first two; probe_arcgis()
# normalizes paths with rstrip("/") so duplicates are reported only once.
ARCGIS_PATHS = [
    "/arcgis/rest/services",
    "/server/rest/services",
    "/gis/rest/services",
    "/rest/services",
    "/arcgis/rest/services/",
    "/server/rest/services/",
]

# Already found/dumped - skip these
# Compared against probed domains (lowercased) in main(); keep entries lowercase.
ALREADY_FOUND = {
    "ergit.presidencia.gov.co",
    "gis.dnp.gov.co",
    "sig.upra.gov.co",
    "sig.minambiente.gov.co",
    "geocontraloria.contraloria.gov.co",
}

# Domains to probe
# Candidate GIS hostnames per agency (mostly gis./sig. guesses plus known
# geoportal names).  Extended at runtime with crt.sh discoveries in main().
PROBE_DOMAINS = [
    # Parks
    "sig.parquesnacionales.gov.co",
    "gis.parquesnacionales.gov.co",
    # Fiscalia (Attorney General)
    "gis.fiscalia.gov.co",
    "sig.fiscalia.gov.co",
    # IGAC (Geographic Institute)
    "sig.igac.gov.co",
    "gis.igac.gov.co",
    "geocarto.igac.gov.co",
    "geoportal.igac.gov.co",
    # DANE (Statistics)
    "gis.dane.gov.co",
    "sig.dane.gov.co",
    "geoportal.dane.gov.co",
    # ANM (Mining)
    "gis.anm.gov.co",
    "sig.anm.gov.co",
    "anna.anm.gov.co",
    # ANI (Infrastructure)
    "sig.ani.gov.co",
    "gis.ani.gov.co",
    # IDEAM (Meteorology)
    "gis.ideam.gov.co",
    "sig.ideam.gov.co",
    # SIAC (Environmental Info)
    "gis.siac.gov.co",
    "sig.siac.gov.co",
    # MinAgricultura
    "gis.minagricultura.gov.co",
    "sig.minagricultura.gov.co",
    # CAR (Regional Environmental)
    "sig.car.gov.co",
    "gis.car.gov.co",
    # MinTransporte
    "gis.mintransporte.gov.co",
    "sig.mintransporte.gov.co",
    # SGC (Geological Survey)
    "geoservices.sgc.gov.co",
    "gis.sgc.gov.co",
    "sig.sgc.gov.co",
    "srvags.sgc.gov.co",
    # ANLA (Environmental Licensing)
    "sig.anla.gov.co",
    "gis.anla.gov.co",
    # ANH (Hydrocarbons)
    "sig.anh.gov.co",
    "gis.anh.gov.co",
    "geovisor.anh.gov.co",
    # Military
    "gis.ejercito.mil.co",
    "sig.ejercito.mil.co",
    "gis.armada.mil.co",
    "gis.fac.mil.co",
    # Additional agencies worth checking
    "gis.icde.gov.co",
    "geo.icde.gov.co",
    "geoserver.icde.gov.co",
    "sig.invias.gov.co",
    "gis.invias.gov.co",
    "sig.corpoboyaca.gov.co",
    "sig.cortolima.gov.co",
    "sig.corpoguajira.gov.co",
    "gis.cvc.gov.co",
    "sig.cvc.gov.co",
    "sig.cornare.gov.co",
    "gis.superservicios.gov.co",
    "sig.superservicios.gov.co",
    "gis.catastrobogota.gov.co",
    "sig.catastrobogota.gov.co",
    "gis.idiger.gov.co",
    "sig.idiger.gov.co",
    "mapas.igac.gov.co",
    "sigot.igac.gov.co",
    "gis.minenergia.gov.co",
    "sig.minenergia.gov.co",
    "gis.minvivienda.gov.co",
    "sig.minvivienda.gov.co",
    "gis.ica.gov.co",
    "sig.ica.gov.co",
    "gis.policia.gov.co",
    "sig.policia.gov.co",
    "gis.unidadvictimas.gov.co",
    "sig.unidadvictimas.gov.co",
    "gis.restituciondetierras.gov.co",
    "sig.restituciondetierras.gov.co",
    "gis.agenciadetierras.gov.co",
    "sig.agenciadetierras.gov.co",
]

# For crt.sh lookups
# Apex domains queried via certificate-transparency logs (check_crtsh) to
# surface GIS-named subdomains not already in PROBE_DOMAINS.
CRTSH_DOMAINS = [
    "parquesnacionales.gov.co",
    "fiscalia.gov.co",
    "igac.gov.co",
    "dane.gov.co",
    "anm.gov.co",
    "ani.gov.co",
    "ideam.gov.co",
    "siac.gov.co",
    "minagricultura.gov.co",
    "car.gov.co",
    "mintransporte.gov.co",
    "sgc.gov.co",
    "anla.gov.co",
    "anh.gov.co",
    "ejercito.mil.co",
    "armada.mil.co",
    "fac.mil.co",
    "invias.gov.co",
    "minenergia.gov.co",
    "policia.gov.co",
    "ica.gov.co",
    "unidadvictimas.gov.co",
    "restituciondetierras.gov.co",
    "agenciadetierras.gov.co",
    "catastrobogota.gov.co",
    "idiger.gov.co",
]


def fetch_url(url, timeout=TIMEOUT):
    """Fetch *url* and return ``(status_code, body_string)``.

    Success: ``(HTTP status, decoded body)``.  Failure sentinels:
      * HTTP error      -> ``(error code, "HTTPError <code>: <reason> | <body>")``
      * connection/DNS  -> ``(-1, "URLError: <reason>")``
      * anything else   -> ``(-2, "Error: <type>: <msg>")``

    Uses the module-level unverified SSL context and browser User-Agent.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # Context manager guarantees the HTTP connection is released even if
        # decoding raises (the original left the response object open).
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            data = resp.read().decode("utf-8", errors="replace")
            return resp.status, data
    except urllib.error.HTTPError as e:
        body = ""
        try:
            body = e.read().decode("utf-8", errors="replace")[:500]
        except Exception:
            # Best-effort body capture; narrowed from a bare `except:` which
            # would also have swallowed KeyboardInterrupt/SystemExit.
            pass
        return e.code, f"HTTPError {e.code}: {e.reason} | {body}"
    except urllib.error.URLError as e:
        return -1, f"URLError: {e.reason}"
    except Exception as e:
        return -2, f"Error: {type(e).__name__}: {e}"


def fetch_json(url, timeout=TIMEOUT):
    """Fetch *url* with ArcGIS's ``f=pjson`` parameter; return dict or None.

    Returns None on any non-200 status or unparseable body.
    """
    joiner = "?" if "?" not in url else "&"
    status, body = fetch_url(f"{url}{joiner}f=pjson", timeout)
    if status != 200:
        return None
    try:
        return json.loads(body)
    except json.JSONDecodeError:
        return None


def probe_arcgis(domain):
    """Try all ArcGIS path patterns on a domain, return list of (url, catalog_json) hits.

    For each path, https is tried before http.  A hit is either a JSON
    services-directory catalog or (fallback) an HTML page that mentions
    ArcGIS, recorded as ``{"_html": True, "_snippet": ...}``.  Probing stops
    after the first path that yields any hit.
    """
    hits = []
    seen_paths = set()
    for path in ARCGIS_PATHS:
        for scheme in ("https", "http"):
            url = f"{scheme}://{domain}{path}"
            print(f"  Probing: {url}", end=" ... ", flush=True)
            # Trailing-slash variants in ARCGIS_PATHS collapse to one key.
            norm_path = path.rstrip("/")

            catalog = fetch_json(url)
            if catalog and ("services" in catalog or "folders" in catalog):
                if norm_path in seen_paths:
                    print("(duplicate of previous hit)")
                else:
                    seen_paths.add(norm_path)
                    svc_n = len(catalog.get("services", []))
                    fld_n = len(catalog.get("folders", []))
                    print(f"FOUND! ({svc_n} services, {fld_n} folders)")
                    hits.append((url.rstrip("/"), catalog))
                continue

            # No JSON catalog: refetch raw and sniff for an ArcGIS HTML page.
            status, data = fetch_url(url)
            page_mentions_arcgis = bool(data) and (
                "ArcGIS" in data or "arcgis" in data.lower() or "REST Services" in data
            )
            if status == 200 and page_mentions_arcgis:
                if norm_path in seen_paths:
                    print("(duplicate HTML)")
                else:
                    seen_paths.add(norm_path)
                    print(f"FOUND (HTML)! ArcGIS page detected")
                    hits.append((url.rstrip("/"), {"_html": True, "_snippet": data[:2000]}))
            else:
                print(f"no ({status})")

        # First productive path wins; later patterns usually alias the same server.
        if hits:
            break
    return hits


def enumerate_folder(base_url, folder_name, timeout=TIMEOUT):
    """Return the parsed service catalog of one folder, or None on failure."""
    return fetch_json(f"{base_url}/{folder_name}", timeout)


def get_service_info(base_url, service_name, service_type, timeout=TIMEOUT):
    """Return the detail JSON for one ``name/type`` service, or None on failure."""
    return fetch_json(f"{base_url}/{service_name}/{service_type}", timeout)


def _summarize_service(info):
    """Condense a service's detail pjson into the fields we archive."""
    layers = info.get("layers", [])
    return {
        "description": info.get("description", ""),
        "serviceDescription": info.get("serviceDescription", ""),
        "layers": [{"id": l.get("id"), "name": l.get("name"), "type": l.get("type", "")} for l in layers],
        "tables": [{"id": t.get("id"), "name": t.get("name")} for t in info.get("tables", [])],
        "capabilities": info.get("capabilities", ""),
        "maxRecordCount": info.get("maxRecordCount", ""),
        "supportedQueryFormats": info.get("supportedQueryFormats", ""),
    }


def _catalog_services(base_url, services, details, indent, fail_msg):
    """Fetch details for each service entry into *details*, printing progress.

    *indent* and *fail_msg* preserve the distinct console output of root vs
    folder-level services.  Sleeps 0.3s between requests to be polite.
    """
    for svc in services:
        svc_name = svc.get("name", "")
        svc_type = svc.get("type", "")
        if svc_name and svc_type:
            print(f"{indent}Service: {svc_name}/{svc_type}", end=" ... ", flush=True)
            info = get_service_info(base_url, svc_name, svc_type)
            if info:
                print(f"{len(info.get('layers', []))} layers")
                details[f"{svc_name}/{svc_type}"] = _summarize_service(info)
            else:
                print(fail_msg)
            time.sleep(0.3)


def catalog_server(base_url, catalog_json):
    """Full catalog: enumerate folders, get service details.

    Returns a dict with the root catalog, per-folder service lists, and a
    flat ``service_details`` map keyed by ``name/type``.  Only enumerates
    one folder level (ArcGIS folders don't nest in the REST directory).
    """
    result = {
        "base_url": base_url,
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # .replace(tzinfo=None) keeps the original "...Z"-suffixed format.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
        "root_catalog": catalog_json,
        "folders": {},
        "service_details": {},
    }

    # Root-level services
    _catalog_services(base_url, catalog_json.get("services", []),
                      result["service_details"], "    ", "(failed to get details)")

    # Folders (one level deep)
    for folder in catalog_json.get("folders", []):
        print(f"    Folder: {folder}", flush=True)
        folder_cat = enumerate_folder(base_url, folder)
        if folder_cat:
            folder_services = folder_cat.get("services", [])
            result["folders"][folder] = {
                "service_count": len(folder_services),
                "services": folder_services,
            }
            _catalog_services(base_url, folder_services,
                              result["service_details"], "      ", "(failed)")
        else:
            result["folders"][folder] = {"error": "Failed to enumerate"}
        time.sleep(0.2)

    return result


def check_crtsh(domain):
    """Query crt.sh for subdomains, filter for GIS-related ones.

    Returns a sorted list of hostnames under *domain* whose name contains a
    GIS-ish keyword, or [] on any fetch/parse failure (best-effort recon).
    """
    url = f"https://crt.sh/?q=%25.{domain}&output=json"
    print(f"  crt.sh: {domain}", end=" ... ", flush=True)
    status, data = fetch_url(url, timeout=30)
    if status != 200:
        print(f"failed ({status})")
        return []

    try:
        entries = json.loads(data)
    except json.JSONDecodeError:
        print("bad JSON")
        return []

    # Extract unique hostnames
    hostnames = set()
    for entry in entries:
        name = entry.get("common_name", "")
        if name:
            hostnames.add(name.lower().strip())
        # Also check name_value which can have multiple names
        nv = entry.get("name_value", "")
        for n in nv.split("\n"):
            n = n.strip().lower()
            if n:
                hostnames.add(n)

    # Filter for GIS-related subdomains
    gis_keywords = ["gis", "sig", "geo", "map", "arcgis", "geoportal", "geoserver",
                     "geoservicio", "cartografia", "carto", "wms", "wfs", "spatial",
                     "geovi", "mapas", "sigot", "geovisor", "geocontraloria",
                     "geobogota", "geodata"]

    gis_hosts = set()
    all_hosts = set()
    for h in hostnames:
        # Remove wildcard prefix explicitly.  The previous lstrip("*.") was
        # the classic str.lstrip pitfall (flake8-bugbear B005): it strips any
        # leading run of '*' and '.' characters, not the "*." prefix.
        if h.startswith("*."):
            h = h[2:]
        all_hosts.add(h)
        for kw in gis_keywords:
            if kw in h:
                gis_hosts.add(h)
                break

    print(f"{len(all_hosts)} total subdomains, {len(gis_hosts)} GIS-related")
    return sorted(gis_hosts)


def save_json(filepath, data):
    """Save *data* as pretty-printed UTF-8 JSON at *filepath*.

    Parent directories are created as needed.  ``default=str`` stringifies
    any non-serializable values (e.g. datetimes) instead of raising.
    """
    parent = os.path.dirname(filepath)
    # Guard: os.makedirs("") raises FileNotFoundError when filepath is a
    # bare filename with no directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False, default=str)


def main():
    """Run the full pipeline: crt.sh recon, ArcGIS probing, cataloging, summary.

    Phase 1 discovers GIS-named subdomains via certificate transparency,
    Phase 2 probes the combined domain list for ArcGIS REST endpoints,
    Phase 3 catalogs each discovered server to disk, and Phase 4 writes a
    machine-readable summary plus a console report.
    """
    print("=" * 70)
    print("Colombian Government ArcGIS Server Discovery")
    print(f"Started: {datetime.now().isoformat()}")
    print("=" * 70)

    all_discoveries = {}
    crtsh_discoveries = {}
    extra_domains_from_crtsh = set()

    # --- Phase 1: crt.sh subdomain discovery ---
    print("\n" + "=" * 70)
    print("PHASE 1: crt.sh Subdomain Discovery")
    print("=" * 70)

    # Build the membership set once; the original rebuilt a throwaway list
    # of PROBE_DOMAINS for every discovered host.
    probe_domain_set = set(PROBE_DOMAINS)
    for domain in CRTSH_DOMAINS:
        gis_hosts = check_crtsh(domain)
        if gis_hosts:
            crtsh_discoveries[domain] = gis_hosts
            for h in gis_hosts:
                if h not in probe_domain_set and h not in ALREADY_FOUND:
                    extra_domains_from_crtsh.add(h)
        time.sleep(1)  # Be nice to crt.sh

    # Save crt.sh results
    save_json(os.path.join(BASE_DIR, "crtsh_gis_subdomains.json"), crtsh_discoveries)
    print(f"\n  crt.sh results saved. Extra domains to probe: {len(extra_domains_from_crtsh)}")
    if extra_domains_from_crtsh:
        print(f"  New domains from crt.sh: {sorted(extra_domains_from_crtsh)}")

    # --- Phase 2: Probe all domains for ArcGIS endpoints ---
    print("\n" + "=" * 70)
    print("PHASE 2: ArcGIS Endpoint Discovery")
    print("=" * 70)

    # Combine original probe list with crt.sh discoveries
    all_domains = list(PROBE_DOMAINS) + sorted(extra_domains_from_crtsh)
    # Deduplicate (case-insensitively) and drop already-dumped servers
    seen = set()
    unique_domains = []
    for d in all_domains:
        if d.lower() not in seen and d.lower() not in ALREADY_FOUND:
            seen.add(d.lower())
            unique_domains.append(d)

    for domain in unique_domains:
        print(f"\n[{domain}]")
        hits = probe_arcgis(domain)
        if hits:
            all_discoveries[domain] = hits

    # --- Phase 3: Catalog discovered servers ---
    print("\n" + "=" * 70)
    print("PHASE 3: Cataloging Discovered Servers")
    print("=" * 70)

    catalogs = {}
    for domain, hits in all_discoveries.items():
        # NOTE(review): if a domain yields multiple hits (https + http), later
        # ones overwrite catalogs[domain] — the last hit wins, as before.
        for base_url, catalog_json in hits:
            if isinstance(catalog_json, dict) and "_html" in catalog_json:
                print(f"\n[{domain}] HTML-only ArcGIS page at {base_url} - skipping deep catalog")
                catalogs[domain] = {
                    "base_url": base_url,
                    "type": "html_detected",
                    "snippet": catalog_json.get("_snippet", ""),
                }
                continue

            print(f"\n[{domain}] Cataloging {base_url}")
            catalog = catalog_server(base_url, catalog_json)
            catalogs[domain] = catalog

            # Save individual catalog, named after the agency rather than a
            # generic "gis"/"sig" subdomain label when possible.
            agency = domain.split(".")[0]
            if agency in ("gis", "sig", "geo", "geoservices"):
                agency = domain.split(".")[1] if len(domain.split(".")) > 2 else domain.split(".")[0]
            agency_dir = os.path.join(BASE_DIR, f"arcgis-{agency}")
            save_json(os.path.join(agency_dir, "catalog.json"), catalog)
            print(f"  Saved to arcgis-{agency}/catalog.json")

    # --- Phase 4: Summary ---
    print("\n" + "=" * 70)
    print("DISCOVERY SUMMARY")
    print("=" * 70)

    summary = {
        # Timezone-aware replacement for deprecated datetime.utcnow();
        # .replace(tzinfo=None) keeps the original "...Z"-suffixed format.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
        "already_known": sorted(ALREADY_FOUND),
        "newly_discovered": {},
        "no_arcgis_found": [],
        "crtsh_gis_subdomains": crtsh_discoveries,
        "domains_probed": len(unique_domains),
    }

    for domain, hits in all_discoveries.items():
        for base_url, catalog_json in hits:
            if isinstance(catalog_json, dict) and "_html" in catalog_json:
                summary["newly_discovered"][domain] = {
                    "url": base_url,
                    "type": "html_page",
                    "services": 0,
                    "folders": 0,
                }
            else:
                svc_count = len(catalog_json.get("services", []))
                folder_count = len(catalog_json.get("folders", []))
                summary["newly_discovered"][domain] = {
                    "url": base_url,
                    "services": svc_count,
                    "folders": folder_count,
                }

    # Domains with no ArcGIS
    for d in unique_domains:
        if d not in all_discoveries:
            summary["no_arcgis_found"].append(d)

    save_json(os.path.join(BASE_DIR, "discovery_summary.json"), summary)

    # Print summary
    print(f"\nDomains probed: {len(unique_domains)}")
    print(f"Already known servers: {len(ALREADY_FOUND)}")
    print(f"NEW ArcGIS servers found: {len(all_discoveries)}")
    for domain, hits in all_discoveries.items():
        for url, cat in hits:
            if isinstance(cat, dict) and "_html" in cat:
                print(f"  {domain}: {url} (HTML page)")
            else:
                svcs = len(cat.get("services", []))
                flds = len(cat.get("folders", []))
                print(f"  {domain}: {url} ({svcs} services, {flds} folders)")

    print(f"\nDomains with NO ArcGIS: {len(summary['no_arcgis_found'])}")

    if crtsh_discoveries:
        print(f"\nGIS subdomains found via crt.sh:")
        for domain, hosts in crtsh_discoveries.items():
            print(f"  {domain}: {hosts}")

    print(f"\nAll results saved to: {BASE_DIR}")
    print(f"Finished: {datetime.now().isoformat()}")


if __name__ == "__main__":
    # Suppress ALL warnings (not just SSL) -- primarily the unverified-HTTPS
    # noise from the no-verify SSL context configured above.
    import warnings
    warnings.filterwarnings("ignore")

    main()
