"""
Analyze CSV Headers - Scans downloaded CSVs to identify which contain PII fields
Helps prioritize files for manual review
"""
import csv
from pathlib import Path
import sys

# Default directory scanned when no CLI argument is given (see main()).
DOWNLOAD_DIR: Path = Path("inm_downloads")

# PII indicator keywords (Spanish and English).
# Matching is substring-based against lowercased header names, so short
# entries like "lat" or "id_" may also match longer unrelated headers.
PII_KEYWORDS: list[str] = [
    # Names
    "nombre", "name", "apellido", "surname", "apellidos",
    "nombre_completo", "full_name", "primer_nombre", "segundo_nombre",

    # IDs
    "curp", "rfc", "nss", "passport", "pasaporte", "visa",
    "numero_documento", "document_number", "id_", "identificacion",
    "cedula", "matricula", "folio", "expediente",

    # Contact
    "email", "correo", "telefono", "phone", "celular", "mobile",
    "direccion", "address", "domicilio", "calle", "colonia",
    "codigo_postal", "zip", "postal",

    # Personal details
    "fecha_nacimiento", "birth_date", "dob", "edad", "age",
    "sexo", "genero", "gender", "sex",
    "nacionalidad", "nationality", "pais_origen", "country",
    "estado_civil", "marital",

    # Biometric
    "huella", "fingerprint", "foto", "photo", "imagen", "image",
    "biometric", "facial",

    # Financial
    "cuenta", "account", "banco", "bank", "tarjeta", "card",
    "salario", "salary", "ingreso", "income",

    # Location
    "latitud", "longitud", "lat", "lon", "gps", "coordenadas",
    "ubicacion", "location",

    # Family
    "padre", "madre", "father", "mother", "hijo", "child",
    "familiar", "family", "parentesco", "relationship",
    "acompanante", "companion",
]

# Fields that indicate aggregated/safe data (counts, periods, categories).
# A header may match both lists; PII matches take precedence for risk rating.
SAFE_KEYWORDS: list[str] = [
    "total", "count", "cantidad", "suma", "promedio", "average",
    "year", "ano", "mes", "month", "periodo", "period",
    "entidad", "entity", "estado", "state", "municipio",
    "categoria", "category", "tipo", "type", "concepto",
]


def analyze_file(filepath: Path):
    """Analyze a single CSV file for PII indicators.

    Args:
        filepath: CSV file to inspect.

    Returns:
        ``None`` when the file is empty / has no header row;
        ``{"error": <message>}`` when the file cannot be read or parsed;
        otherwise a summary dict with the normalized headers, data-row
        count, matched PII/safe field names, and a "HIGH"/"LOW" risk label.
    """
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            # Crude delimiter detection on a small sample: a tab anywhere
            # wins over a semicolon, with comma as the fallback.
            sample = f.read(4096)
            f.seek(0)

            if "\t" in sample:
                delimiter = "\t"
            elif ";" in sample:
                delimiter = ";"
            else:
                delimiter = ","

            reader = csv.reader(f, delimiter=delimiter)
            headers = next(reader, None)

            if not headers:
                return None

            # Normalize once up front; the keyword matching below relies on
            # headers already being lowercase.
            headers = [h.strip().lower() for h in headers]

            # Everything after the header line counts as a data row.
            row_count = sum(1 for _ in reader)

    # Narrow catch: file-system errors and malformed-CSV errors are the
    # failures this block can produce (UnicodeDecodeError is ruled out by
    # errors="ignore" above). Anything else is a bug and should propagate.
    except (OSError, csv.Error) as e:
        return {"error": str(e)}

    # A header can match both lists (e.g. "estado_civil" also contains the
    # safe keyword "estado"); only the PII match drives the risk label.
    pii_fields = [h for h in headers if any(kw in h for kw in PII_KEYWORDS)]
    safe_fields = [h for h in headers if any(kw in h for kw in SAFE_KEYWORDS)]

    return {
        "file": str(filepath),
        "headers": headers,
        "header_count": len(headers),
        "row_count": row_count,
        "pii_fields": pii_fields,
        "safe_fields": safe_fields,
        "pii_risk": "HIGH" if pii_fields else "LOW",
    }


def main():
    """Scan a directory tree for CSVs, print a PII risk report, and save it.

    Usage: the first CLI argument (if given) overrides DOWNLOAD_DIR as the
    scan root. Exits with status 1 when the directory does not exist.
    """
    scan_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else DOWNLOAD_DIR

    if not scan_dir.exists():
        print(f"ERROR: Directory not found: {scan_dir}")
        sys.exit(1)

    # sorted() gives deterministic console/report ordering; rglob's order
    # is filesystem-dependent.
    csv_files = sorted(scan_dir.rglob("*.csv"))

    print("=" * 70)
    print("CSV Header Analyzer - PII Detection")
    print("=" * 70)
    print(f"Scanning: {scan_dir}")
    print(f"Found {len(csv_files)} CSV files")
    print("=" * 70)

    # Bucket each file by analysis outcome.
    high_risk = []
    low_risk = []
    errors = []

    for filepath in csv_files:
        result = analyze_file(filepath)

        # None means the file had no header row; skip it silently.
        if result is None:
            continue

        if "error" in result:
            errors.append((filepath, result["error"]))
            continue

        if result["pii_risk"] == "HIGH":
            high_risk.append(result)
        else:
            low_risk.append(result)

    # Report HIGH risk files, most PII fields first.
    print("\n" + "=" * 70)
    print(f"HIGH RISK FILES (Potential PII) - {len(high_risk)} files")
    print("=" * 70)

    for r in sorted(high_risk, key=lambda x: len(x["pii_fields"]), reverse=True):
        print(f"\n[HIGH RISK] {r['file']}")
        print(f"  Rows: {r['row_count']:,}")
        print(f"  PII Fields: {', '.join(r['pii_fields'])}")
        # Show at most 10 headers inline; summarize the remainder.
        print(f"  All Headers: {', '.join(r['headers'][:10])}", end="")
        if len(r['headers']) > 10:
            print(f" ... (+{len(r['headers'])-10} more)")
        else:
            print()

    # Summary of LOW risk files (one line each).
    print("\n" + "=" * 70)
    print(f"LOW RISK FILES (Aggregated Data) - {len(low_risk)} files")
    print("=" * 70)

    for r in low_risk:
        print(f"  {r['file']} ({r['row_count']:,} rows)")

    # Unreadable/unparsable files.
    if errors:
        print("\n" + "=" * 70)
        print(f"ERRORS - {len(errors)} files")
        print("=" * 70)
        for filepath, error in errors:
            print(f"  {filepath}: {error}")

    # Persist a detailed report (HIGH risk details only) for manual review.
    report_file = Path("csv_analysis_report.txt")
    with open(report_file, "w", encoding="utf-8") as f:
        f.write("CSV PII Analysis Report\n")
        f.write("=" * 70 + "\n\n")

        f.write(f"HIGH RISK: {len(high_risk)}\n")
        f.write(f"LOW RISK: {len(low_risk)}\n")
        f.write(f"ERRORS: {len(errors)}\n\n")

        f.write("HIGH RISK FILES:\n")
        for r in high_risk:
            f.write(f"\n{r['file']}\n")
            f.write(f"  PII Fields: {r['pii_fields']}\n")
            f.write(f"  All Headers: {r['headers']}\n")
            f.write(f"  Rows: {r['row_count']}\n")

    print(f"\nDetailed report saved to: {report_file}")


# Entry point: run the scan only when executed as a script, not on import.
if __name__ == "__main__":
    main()
