#!/usr/bin/env python3
"""
==========================================================================
 HAITI GOV PII MASTER EXTRACTOR
 Scans all JSON dumps across 10 government sites for personally
 identifiable information. Authorized OSINT assessment — all data
 obtained from public unauthenticated endpoints.

 Output: PII-MASTER-REPORT.txt
==========================================================================
"""

import json, re, os, sys, glob, html
from collections import defaultdict
from datetime import datetime

# Force UTF-8 output on Windows
if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')

# ========================================================================
# CONFIGURATION
# ========================================================================
BASE_DIR = r"C:\Users\Squir\Desktop\HAITI\DUMP"
OUTPUT_FILE = os.path.join(BASE_DIR, "PII-MASTER-REPORT.txt")

SITES = {
    "BRH":              {"path": os.path.join(BASE_DIR, "BRH"),              "desc": "Banque de la Republique d'Haiti (brh.ht)"},
    "DGI-GOUV":         {"path": os.path.join(BASE_DIR, "DGI-GOUV"),         "desc": "Direction Generale des Impots (dgi.gouv.ht)"},
    "PRIMATURE-GOUV":   {"path": os.path.join(BASE_DIR, "PRIMATURE-GOUV"),   "desc": "Primature / Prime Minister (primature.gouv.ht)"},
    "DINEPA-GOUV":      {"path": os.path.join(BASE_DIR, "DINEPA-GOUV"),      "desc": "Direction Nationale Eau Potable (dinepa.gouv.ht)"},
    "MD-GOUV":          {"path": os.path.join(BASE_DIR, "MD-GOUV"),          "desc": "Ministere de la Diaspora (md.gouv.ht)"},
    "CONATEL-GOUV":     {"path": os.path.join(BASE_DIR, "CONATEL-GOUV"),     "desc": "Conseil National Telecom (conatel.gouv.ht)"},
    "COMMUNICATION-GOUV":{"path": os.path.join(BASE_DIR, "COMMUNICATION-GOUV"),"desc": "Ministere de la Communication (communication.gouv.ht)"},
    "MAE-GOUV":         {"path": os.path.join(BASE_DIR, "MAE-GOUV"),         "desc": "Ministere Affaires Etrangeres (mae.gouv.ht)"},
    "IGF-GOUV":         {"path": os.path.join(BASE_DIR, "IGF-GOUV"),         "desc": "Inspection Generale des Finances (igf.gouv.ht)"},
    "MSPP-GOUV":        {"path": os.path.join(BASE_DIR, "MSPP-GOUV"),        "desc": "Ministere Sante Publique (mspp.gouv.ht)"},
}

# ========================================================================
# GLOBAL STORAGE: value -> set of (site, filename, field_path) tuples
# ========================================================================
emails_found      = defaultdict(set)   # email -> {(site, file, field), ...}
phones_found      = defaultdict(set)
addresses_found   = defaultdict(set)
names_found       = defaultdict(set)
nif_cin_found     = defaultdict(set)
wp_users_found    = defaultdict(set)   # formatted user string -> {(site, file, field), ...}
gravatar_found    = defaultdict(set)   # hash -> {(site, file, field), ...}
donor_records     = []                 # list of dicts with full donor info + provenance

# ========================================================================
# CLEANING HELPERS
# ========================================================================
def strip_html(text):
    """Remove HTML tags and decode HTML entities."""
    if not text:
        return ""
    # Decode unicode escapes (\uXXXX) — only the \uXXXX patterns, not raw bytes
    pass
    text = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), text)
    text = re.sub(r'\\/', '/', text)
    # Strip HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode HTML entities
    text = html.unescape(text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    return text

# ========================================================================
# EXCLUSION LISTS
# ========================================================================
EMAIL_EXCLUDE = [
    'wordpress@', 'noreply@', 'no-reply@', 'admin@example',
    'wapuu@wordpress', 'changeme@', 'your@email', 'email@example',
    'youremail@', 'test@test', 'test@example', 'user@example',
    'someone@example', 'name@example', 'info@developer',
    '1@developer', 'developer@developer', '@developer.developer',
    '@developer.', 'developer@developer', 'developer@developer.developer',
    '@developer.developer', 'developer.developer', 'developer@',
    'developer.developer@', '@developer', 'developer@developer.com',
]

FILE_EXT_NOISE = [
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.woff', '.woff2',
    '.ttf', '.eot', '.css', '.js', '.php', '.html', '.xml',
    '.mp4', '.webp', '.ico', '.map', '.min',
]

# ========================================================================
# EXTRACTION FUNCTIONS
# ========================================================================
def extract_emails(text, site, filename, field):
    """Extract email addresses from text, track provenance."""
    pattern = r'[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}'
    for match in re.finditer(pattern, text):
        email = match.group()
        e_lower = email.lower()
        # Skip system/boilerplate
        skip = False
        for ex in EMAIL_EXCLUDE:
            if ex in e_lower:
                skip = True
                break
        # Skip file extension false positives
        for ext in FILE_EXT_NOISE:
            if e_lower.endswith(ext):
                skip = True
                break
        if 'developer.developer' in e_lower:
            skip = True
        # Skip if it looks like a CSS/JS path segment
        if re.search(r'\d+x\d+', email):
            skip = True
        if not skip:
            emails_found[email].add((site, filename, field))


def extract_phones(text, site, filename, field):
    """Extract Haitian and international phone numbers."""
    clean = strip_html(text)
    patterns = [
        # +509 XXXX XXXX / +509-XXXX-XXXX etc.
        (r'\+509[\s\-\.]*\d{4}[\s\-\.]*\d{4}', 'haiti_intl'),
        # 509 XXXX XXXX
        (r'(?<!\d)509[\s\-\.]+\d{4}[\s\-\.]*\d{4}', 'haiti_509'),
        # (509) XXXX-XXXX
        (r'\(509\)[\s\-\.]*\d{4}[\s\-\.]*\d{4}', 'haiti_parens'),
        # US numbers
        (r'\+1[\s\-\.]*\d{3}[\s\-\.]*\d{3}[\s\-\.]*\d{4}', 'us_intl'),
        # Local 8-digit in tel/phone context
        (r'(?:T[eé]l|Phone|t[eé]l[eé]phone|Fax|Cell|GSM|Mobile|Appel)\s*[:\.\s]+\s*(\d{4}[\s\-\.]\d{4})', 'local_context'),
        # 8-digit starting with 2,3,4 near tel/phone context
        (r'(?:T[eé]l|Phone|Fax|Cell|Contact)\s*[:\.\s]*\s*([2-4]\d{3}[\s\-\.]?\d{4})', 'local_234'),
    ]
    for pat, ptype in patterns:
        for m in re.finditer(pat, clean, re.IGNORECASE):
            match_str = m.group(1) if m.lastindex else m.group()
            # Context check: skip if near image/dimension keywords
            start = max(0, m.start() - 40)
            ctx = clean[start:m.start()].lower()
            if any(x in ctx for x in ['srcset', 'width', 'height', 'sizes', 'wp-content',
                                        '.png', '.jpg', 'uploads/', 'content/', 'bp ',
                                        'image-', 'media/', 'thumb']):
                continue
            digits = re.sub(r'\D', '', match_str)
            if len(digits) >= 7:
                phones_found[match_str.strip()].add((site, filename, field))


def extract_addresses(text, site, filename, field):
    """Extract physical addresses (Haitian format)."""
    clean = strip_html(text)
    patterns = [
        # Street: Rue/Avenue/etc followed by proper name
        r'(?:Rue|Avenue|Boulevard|Blvd|Impasse|Ruelle|Route)\s+(?:de\s+la\s+|de\s+|du\s+|des\s+)?[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff][^,\n\"]{3,80}',
        # Angle (intersection)
        r'Angle\s+(?:Rue|Avenue|Boulevard|des\s+rues|rues)[^,\n\"]{5,80}',
        # Numbered Delmas
        r'Delmas\s+\d+[^,\n\"]{0,60}',
        # City + Haiti
        r'(?:Port-au-Prince|P[e\u00e9]tion[\-\s]?Ville|Tabarre|Jacmel|Cap[\-\s]?Ha[i\u00ef]tien|Gona[i\u00ef]ves|Les\s+Cayes|Hinche|J[e\u00e9]r[e\u00e9]mie|Kenscoff|Carrefour|Fort[\-\s]?Libert[e\u00e9]|Mirago[a\u00e2]ne|Port[\-\s]de[\-\s]Paix),?\s+Ha[i\u00ef]ti',
        # PO Box
        r'(?:B\.?P\.?\s*:?\s*\d+|Bo[i\u00ee]te\s+Postale\s*:?\s*\d+)[^"\n]{0,50}',
        # Full address with # number
        r'#?\s*\d+,?\s*(?:Rue|Avenue|Boulevard|Impasse|Route)\s+[A-Z\u00c0-\u0178][^,\n\"]{5,80}',
    ]
    for p in patterns:
        for m in re.finditer(p, clean, re.IGNORECASE):
            addr = m.group().strip()
            addr = re.sub(r'\s+', ' ', addr)
            # Reject if clearly JSON/code noise
            if any(x in addr for x in ['og_', 'isPartOf', '@id', '@type', 'robots', 'href',
                                         'https://', 'http://', 'wp-content', 'wp-json',
                                         'srcset', 'sizes=', 'aria-', '.png', '.jpg',
                                         '.json', 'datePublished', 'dateModified', '{', '}',
                                         'elementor', 'schema.org', 'ReadAction', 'potentialAction',
                                         'BreadcrumbList', 'ListItem', 'WebSite']):
                continue
            if 12 < len(addr) < 160:
                addresses_found[addr].add((site, filename, field))


def extract_named_persons(text, site, filename, field):
    """Extract named individuals by title prefix."""
    clean = strip_html(text)
    patterns = [
        # Title + full name (government officials)
        r'(?:Directeur|Directrice|Ministre|Secr[e\u00e9]taire|Gouverneur|Pr[e\u00e9]sident|Vice[\-\s]?Pr[e\u00e9]sident|Commissaire|Inspecteur|Ambassadeur|Consul|S[e\u00e9]nateur|D[e\u00e9]put[e\u00e9]|Maire|Juge|Procureur|Greffier|Notaire|Avocat|Comptable|Tr[e\u00e9]sorier|Chef|Coordonnateur|Coordonnatrice|Responsable|Recteur|Doyen|Conseiller|Conseill[e\u00e8]re|Administrateur|Administratrice|D[e\u00e9]l[e\u00e9]gu[e\u00e9]|Commandant|Colonel|G[e\u00e9]n[e\u00e9]ral|Commissaire)[\s:]+(?:G[e\u00e9]n[e\u00e9]ral(?:e)?[\s:]+)?(?:de\s+(?:la\s+)?|du\s+|des\s+|Adjoint(?:e)?[\s:]+)?[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff]+(?:\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ffA-Z\u00c0-\u0178\-]+){0,5}',
        # Honorifics: Dr., Ing., Me (Maitre), M., Mme, Prof.
        r'(?:Dr\.|Ing\.|Me\s|M\.\s|Mme\.?\s|Prof\.|Mgr\.|Sr\.|Pasteur|P[e\u00e8]re)\s*[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff]+(?:\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ffA-Z\u00c0-\u0178\-]+){0,5}',
        # Monsieur/Madame + name
        r'(?:Monsieur|Madame|Mademoiselle)\s+(?:le\s+|la\s+)?(?:Ministre|Directeur|Pr[e\u00e9]sident|Gouverneur|S[e\u00e9]nateur|Premier\s+Ministre)?\s*[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff]+(?:\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ffA-Z\u00c0-\u0178\-]+){0,5}',
        # Premier Ministre + name
        r'Premier\s+Ministre\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff]+(?:\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ffA-Z\u00c0-\u0178\-]+){0,4}',
        # "Son Excellence" pattern common in Haitian government docs
        r'Son\s+Excellence\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff]+(?:\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ffA-Z\u00c0-\u0178\-]+){0,4}',
        # "l'honorable" pattern
        r"[Ll]'[Hh]onorable\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ff]+(?:\s+[A-Z\u00c0-\u0178][a-z\u00e0-\u00ffA-Z\u00c0-\u0178\-]+){0,4}",
    ]
    for p in patterns:
        for m in re.finditer(p, clean):
            name = m.group().strip()
            name = re.sub(r'\s+', ' ', name)
            # Filter out noise
            if any(x in name for x in ['og_', 'href', 'http', 'wp-', '{', '}', 'srcset',
                                         'elementor', 'datePublished', 'robots', 'schema',
                                         'ReadAction', 'BreadcrumbList', 'WebSite', 'ImageObject',
                                         'potentialAction', 'SearchAction', '.com', '.org',
                                         '.ht', 'EntryPoint']):
                continue
            if 5 < len(name) < 130:
                names_found[name].add((site, filename, field))


def extract_nif_cin(text, site, filename, field):
    """Extract NIF (tax ID) and CIN (national ID card) references."""
    clean = strip_html(text)
    patterns = [
        # NIF with actual digits: NIF: 000-000-000 or NIF 123456789
        r'(?:NIF|N\.I\.F\.?)[\s:]*[\d][\d\s\-]{4,20}',
        # CIN with digits
        r'(?:CIN|C\.I\.N\.?|NIN|N\.I\.N\.?)[\s:]*[\d][\d\s\-]{4,20}',
        # Combined NIF/CIN with digits
        r'(?:NIF\s*/\s*CIN|CIN\s*/\s*NIF)[\s:]*[\d][\d\s\-]{4,20}',
        # NIF format XXX-XXX-XXX (3 groups of 3 digits)
        r'\b\d{3}-\d{3}-\d{3}\b',
        # NIF/CIN in procedural context (mentions needing the document)
        r'(?:NIF|CIN|NIF/CIN|NIF\s*/\s*CIN|Num[e\u00e9]ro\s+(?:d\'identification\s+fiscale|d\'identit[e\u00e9]\s+nationale))(?:\s+(?:du|de\s+la|de|des|ou|est|du\s+contribuable))?\s+[^.\n\"]{5,60}',
    ]
    for p in patterns:
        for m in re.finditer(p, clean, re.IGNORECASE):
            val = m.group().strip()
            val = re.sub(r'\s+', ' ', val)
            if any(x in val for x in ['og_', 'href', 'http', 'wp-', '{', '}', 'srcset', 'elementor']):
                continue
            if 3 < len(val) < 100:
                nif_cin_found[val].add((site, filename, field))


def extract_gravatar_hashes(text, site, filename, field):
    """Extract Gravatar MD5/SHA256 hashes (can be reversed to emails)."""
    for m in re.finditer(r'gravatar\.com/avatar/([a-f0-9]{32,64})', text):
        gravatar_found[m.group(1)].add((site, filename, field))


# ========================================================================
# RECURSIVE JSON TRAVERSAL
# ========================================================================
def traverse_json(obj, site, filename, path="$", depth=0):
    """Recursively walk a JSON structure, extracting PII from all string values."""
    if depth > 30:
        return

    if isinstance(obj, dict):
        for key, value in obj.items():
            new_path = f"{path}.{key}"
            traverse_json(value, site, filename, new_path, depth + 1)
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            new_path = f"{path}[{i}]"
            traverse_json(item, site, filename, new_path, depth + 1)
    elif isinstance(obj, str):
        if len(obj) < 3:
            return
        # For rendered HTML fields, strip tags first
        if 'rendered' in path.lower() or 'content' in path.lower() or 'yoast_head' in path.lower():
            text = strip_html(obj)
        else:
            text = obj

        # Run all extractors
        extract_emails(text, site, filename, path)
        extract_phones(text, site, filename, path)
        extract_addresses(text, site, filename, path)
        extract_named_persons(text, site, filename, path)
        extract_nif_cin(text, site, filename, path)
        extract_gravatar_hashes(obj, site, filename, path)  # Use raw for URL patterns


# ========================================================================
# STRUCTURED DATA EXTRACTION (WP Users, GiveWP, Comments)
# ========================================================================
def extract_wp_users(data, site, filename):
    """Extract WP user objects with full details."""
    if not isinstance(data, list):
        return
    for item in data:
        if not isinstance(item, dict):
            continue
        # WP user objects have name + slug + avatar_urls
        if 'slug' in item and 'avatar_urls' in item:
            uid = item.get('id', '?')
            name = item.get('name', '').strip()
            slug = item.get('slug', '').strip()
            desc = item.get('description', '').strip()
            link = item.get('link', '').strip()
            url = item.get('url', '').strip()

            if not name or not slug:
                continue

            entry = f"ID:{uid} | {name} (slug: {slug})"
            if desc:
                entry += f" | Bio: {desc[:120]}"
            if link:
                entry += f" | Profile: {link}"
            if url:
                entry += f" | URL: {url}"

            # Extract gravatar hash
            avatars = item.get('avatar_urls', {})
            for avatar_url in avatars.values():
                ghash = re.search(r'avatar/([a-f0-9]{32,64})', str(avatar_url))
                if ghash:
                    entry += f" | Gravatar: {ghash.group(1)}"
                    gravatar_found[ghash.group(1)].add((site, filename, "avatar_urls"))
                    break

            wp_users_found[entry].add((site, filename, "user_object"))


def _safe_str(val):
    """Safely convert a value to stripped string (handles None)."""
    if val is None:
        return ''
    return str(val).strip()


def extract_givewp_donors(data, site, filename):
    """Extract GiveWP donor records with full PII."""
    if not isinstance(data, list):
        return
    junk_re = re.compile(r'^[a-z]{1,6}\s+[a-z]$', re.IGNORECASE)
    for d in data:
        if not isinstance(d, dict):
            continue
        name = _safe_str(d.get('name'))
        first = _safe_str(d.get('firstName'))
        last = _safe_str(d.get('lastName'))
        email = _safe_str(d.get('email'))
        phone = _safe_str(d.get('phone'))
        company = _safe_str(d.get('company'))
        prefix = _safe_str(d.get('prefix'))
        addr = d.get('addresses', '')
        addr = addr.strip() if isinstance(addr, str) else ''
        extra_emails = d.get('additionalEmails', []) or []
        amount_obj = d.get('totalAmountDonated', {}) or {}
        amount = amount_obj.get('value', '') if isinstance(amount_obj, dict) else ''
        currency = amount_obj.get('currency', 'USD') if isinstance(amount_obj, dict) else 'USD'
        created = _safe_str(d.get('createdAt'))
        donor_id = d.get('id', '?')

        if not name:
            name = f"{first} {last}".strip()

        # Filter junk test entries
        if not name or len(name) <= 3:
            continue
        if junk_re.match(name):
            continue
        if name.lower() in ('test t', 'first name l', 'test test', 'asdf asdf'):
            continue

        record = {
            'type': 'donor',
            'donor_id': donor_id,
            'name': name,
            'prefix': prefix,
            'email': email,
            'phone': phone,
            'company': company,
            'address': addr,
            'additional_emails': extra_emails if extra_emails else [],
            'amount': f"{amount} {currency}" if amount else '',
            'date': created,
            'site': site,
            'file': filename,
        }
        donor_records.append(record)

        # Also feed emails/phones into global trackers
        if email:
            emails_found[email].add((site, filename, "donor.email"))
        if phone:
            phones_found[phone].add((site, filename, "donor.phone"))
        for extra_e in (extra_emails or []):
            if extra_e:
                emails_found[extra_e].add((site, filename, "donor.additionalEmails"))


def extract_givewp_donations(data, site, filename):
    """Extract GiveWP donation records with PII."""
    if not isinstance(data, list):
        return
    junk_re = re.compile(r'^[a-z]{1,6}\s+[a-z]$', re.IGNORECASE)
    for d in data:
        if not isinstance(d, dict):
            continue
        fn = _safe_str(d.get('firstName'))
        ln = _safe_str(d.get('lastName'))
        email = _safe_str(d.get('email'))
        phone = _safe_str(d.get('phone'))
        honorific = _safe_str(d.get('honorific'))
        comment = _safe_str(d.get('comment'))
        billing = d.get('billingAddress', '') or ''
        company = _safe_str(d.get('company'))
        amount_obj = d.get('amount', {}) or {}
        amount = amount_obj.get('value', '') if isinstance(amount_obj, dict) else ''
        currency = amount_obj.get('currency', 'USD') if isinstance(amount_obj, dict) else 'USD'
        created = _safe_str(d.get('createdAt'))
        donation_id = d.get('id', '?')
        donor_ip = _safe_str(d.get('donorIp'))
        gateway = _safe_str(d.get('gatewayId'))
        form_title = _safe_str(d.get('formTitle'))

        name = f"{fn} {ln}".strip()
        if not name or len(name) <= 3:
            continue
        if junk_re.match(name):
            continue

        record = {
            'type': 'donation',
            'donation_id': donation_id,
            'name': name,
            'honorific': honorific,
            'email': email,
            'phone': phone,
            'company': company,
            'billing_address': billing if isinstance(billing, str) else json.dumps(billing),
            'comment': comment,
            'amount': f"{amount} {currency}" if amount else '',
            'gateway': gateway,
            'form': form_title,
            'donor_ip': donor_ip,
            'date': created,
            'site': site,
            'file': filename,
        }
        donor_records.append(record)

        if email:
            emails_found[email].add((site, filename, "donation.email"))
        if phone:
            phones_found[phone].add((site, filename, "donation.phone"))


def extract_comments(data, site, filename):
    """Extract commenter information from WP comments."""
    if not isinstance(data, list):
        return
    for c in data:
        if not isinstance(c, dict):
            continue
        author = c.get('author_name', '').strip()
        author_url = c.get('author_url', '').strip()
        comment_id = c.get('id', '?')

        # Gravatar from avatar URLs
        avatar = c.get('author_avatar_urls', {})
        ghash = ''
        for url_val in avatar.values():
            m = re.search(r'avatar/([a-f0-9]{32,64})', str(url_val))
            if m:
                ghash = m.group(1)
                gravatar_found[ghash].add((site, filename, "comment.avatar"))
                break

        if author and author.lower() not in ('admin', 'anonymous', 'test', '', 'a commenter'):
            entry = f"Commenter: {author} (comment #{comment_id})"
            if author_url:
                entry += f" | URL: {author_url}"
            if ghash:
                entry += f" | Gravatar: {ghash}"
            wp_users_found[entry].add((site, filename, "comment"))


# ========================================================================
# FILE PROCESSING
# ========================================================================
def process_json_file(filepath, site):
    """Process a single JSON file: structured extraction + recursive text scan."""
    filename = os.path.basename(filepath)

    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            raw = f.read()
    except Exception as e:
        print(f"  [ERROR] Cannot read {filepath}: {e}", file=sys.stderr)
        return False

    # Skip files that are clearly error responses or HTTP headers
    if raw.strip().startswith('HTTP/') or '"code":"rest_' in raw[:200]:
        return False

    # Try to parse as JSON
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Not valid JSON — still try regex on raw text
        extract_emails(raw, site, filename, "raw_text")
        extract_phones(raw, site, filename, "raw_text")
        extract_addresses(raw, site, filename, "raw_text")
        extract_named_persons(raw, site, filename, "raw_text")
        extract_nif_cin(raw, site, filename, "raw_text")
        extract_gravatar_hashes(raw, site, filename, "raw_text")
        return True

    # ---- Structured extraction ----
    fname_lower = filename.lower()

    # WP Users
    if 'user' in fname_lower:
        extract_wp_users(data, site, filename)

    # GiveWP donors
    if 'donor' in fname_lower:
        extract_givewp_donors(data, site, filename)

    # GiveWP donations
    if 'donation' in fname_lower:
        extract_givewp_donations(data, site, filename)

    # Comments
    if 'comment' in fname_lower:
        extract_comments(data, site, filename)

    # ---- Recursive JSON traversal for all text content ----
    traverse_json(data, site, filename)

    return True


# ========================================================================
# MAIN
# ========================================================================
def main():
    print("=" * 70)
    print("  HAITI GOV PII MASTER EXTRACTOR")
    print(f"  Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 70)

    total_files = 0
    site_stats = {}

    for site_key, site_info in SITES.items():
        site_path = site_info['path']
        site_desc = site_info['desc']

        if not os.path.isdir(site_path):
            print(f"\n  [{site_key}] Directory not found: {site_path} — SKIPPING")
            continue

        # Find ALL .json files (also scan subdirectories)
        json_files = glob.glob(os.path.join(site_path, "**", "*.json"), recursive=True)
        if not json_files:
            print(f"\n  [{site_key}] No JSON files found — SKIPPING")
            site_stats[site_key] = 0
            continue

        print(f"\n  [{site_key}] Scanning {len(json_files)} JSON files...")
        processed = 0
        for jf in sorted(json_files):
            ok = process_json_file(jf, site_key)
            if ok:
                processed += 1

        site_stats[site_key] = processed
        total_files += processed
        print(f"    Processed: {processed} files")

    # ========================================================================
    # GENERATE MASTER REPORT
    # ========================================================================
    print(f"\n{'='*70}")
    print(f"  Generating master report: {OUTPUT_FILE}")
    print(f"{'='*70}")

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        out.write("=" * 90 + "\n")
        out.write("  HAITI GOVERNMENT WEBSITES — PII MASTER EXTRACTION REPORT\n")
        out.write("=" * 90 + "\n")
        out.write(f"  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"  Total JSON files processed: {total_files}\n")
        out.write(f"  Sites scanned: {len([s for s in site_stats if site_stats[s] > 0])}\n")
        out.write(f"  Source: Public unauthenticated API endpoints (authorized OSINT assessment)\n")
        out.write("=" * 90 + "\n\n")

        # Per-site summary
        out.write("-" * 90 + "\n")
        out.write("  SITE SUMMARY\n")
        out.write("-" * 90 + "\n")
        for sk, si in SITES.items():
            count = site_stats.get(sk, 0)
            out.write(f"  {sk:25s}  {count:4d} files  |  {si['desc']}\n")
        out.write("\n")

        # ================================================================
        # SECTION 1: WORDPRESS USERS & AUTHORS
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [1] WORDPRESS USERS / AUTHORS / COMMENTERS  ({len(wp_users_found)} unique entries)\n")
        out.write("=" * 90 + "\n\n")

        # Group by site
        users_by_site = defaultdict(list)
        for entry, sources in sorted(wp_users_found.items()):
            for (s, f, fld) in sources:
                users_by_site[s].append((entry, f))

        for site_key in SITES:
            entries = users_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ({SITES[site_key]['desc']}) ---\n")
            seen = set()
            for entry, fname in sorted(set(entries)):
                if entry not in seen:
                    out.write(f"    {entry}\n")
                    out.write(f"      Source: {fname}\n")
                    seen.add(entry)
            out.write("\n")

        # ================================================================
        # SECTION 2: DONOR / DONATION RECORDS (GiveWP)
        # ================================================================
        if donor_records:
            out.write("=" * 90 + "\n")
            out.write(f"  [2] GIVEWP DONOR / DONATION RECORDS  ({len(donor_records)} entries)\n")
            out.write("=" * 90 + "\n\n")

            # De-duplicate by name+email+date
            seen_donors = set()
            unique_donors = []
            for rec in donor_records:
                key = (rec['name'], rec.get('email',''), rec.get('date',''))
                if key not in seen_donors:
                    seen_donors.add(key)
                    unique_donors.append(rec)

            out.write(f"  (De-duplicated: {len(unique_donors)} unique records)\n\n")

            # Group by site
            donors_by_site = defaultdict(list)
            for rec in unique_donors:
                donors_by_site[rec['site']].append(rec)

            for site_key in SITES:
                recs = donors_by_site.get(site_key, [])
                if not recs:
                    continue
                out.write(f"  --- {site_key} ({SITES[site_key]['desc']}) ---\n")
                for rec in recs:
                    rtype = rec['type'].upper()
                    line = f"    [{rtype}] {rec['name']}"
                    if rec.get('honorific'):
                        line += f" ({rec['honorific']})"
                    if rec.get('email'):
                        line += f" | Email: {rec['email']}"
                    if rec.get('phone'):
                        line += f" | Phone: {rec['phone']}"
                    if rec.get('company'):
                        line += f" | Company: {rec['company']}"
                    if rec.get('address'):
                        line += f" | Addr: {rec['address']}"
                    if rec.get('billing_address') and rec['billing_address'] not in ('', 'null', 'None'):
                        line += f" | Billing: {rec['billing_address']}"
                    if rec.get('amount'):
                        line += f" | Amount: {rec['amount']}"
                    if rec.get('donor_ip'):
                        line += f" | IP: {rec['donor_ip']}"
                    if rec.get('date'):
                        line += f" | Date: {rec['date']}"
                    out.write(f"{line}\n")
                    out.write(f"      Source: {rec['file']}\n")
                out.write("\n")

        # ================================================================
        # SECTION 3: EMAIL ADDRESSES
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [3] EMAIL ADDRESSES  ({len(emails_found)} unique)\n")
        out.write("=" * 90 + "\n\n")

        emails_by_site = defaultdict(list)
        for email, sources in sorted(emails_found.items(), key=lambda x: x[0].lower()):
            for (s, f, fld) in sources:
                emails_by_site[s].append((email, f, fld))

        for site_key in SITES:
            entries = emails_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ---\n")
            seen = set()
            for email, fname, fld in sorted(set(entries)):
                if email not in seen:
                    # Collect all files this email appears in for this site
                    all_files = set()
                    for e2, f2, fld2 in entries:
                        if e2 == email:
                            all_files.add(f2)
                    out.write(f"    {email}\n")
                    out.write(f"      Found in: {', '.join(sorted(all_files))}\n")
                    seen.add(email)
            out.write("\n")

        # ================================================================
        # SECTION 4: PHONE NUMBERS
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [4] PHONE NUMBERS  ({len(phones_found)} unique)\n")
        out.write("=" * 90 + "\n\n")

        phones_by_site = defaultdict(list)
        for phone, sources in sorted(phones_found.items()):
            for (s, f, fld) in sources:
                phones_by_site[s].append((phone, f, fld))

        for site_key in SITES:
            entries = phones_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ---\n")
            seen = set()
            for phone, fname, fld in sorted(set(entries)):
                if phone not in seen:
                    all_files = set()
                    for p2, f2, fld2 in entries:
                        if p2 == phone:
                            all_files.add(f2)
                    out.write(f"    {phone}\n")
                    out.write(f"      Found in: {', '.join(sorted(all_files))}\n")
                    seen.add(phone)
            out.write("\n")

        # ================================================================
        # SECTION 5: PHYSICAL ADDRESSES
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [5] PHYSICAL ADDRESSES  ({len(addresses_found)} unique)\n")
        out.write("=" * 90 + "\n\n")

        addrs_by_site = defaultdict(list)
        for addr, sources in sorted(addresses_found.items()):
            for (s, f, fld) in sources:
                addrs_by_site[s].append((addr, f))

        for site_key in SITES:
            entries = addrs_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ---\n")
            seen = set()
            for addr, fname in sorted(set(entries)):
                if addr not in seen:
                    all_files = set()
                    for a2, f2 in entries:
                        if a2 == addr:
                            all_files.add(f2)
                    out.write(f"    {addr}\n")
                    out.write(f"      Found in: {', '.join(sorted(all_files))}\n")
                    seen.add(addr)
            out.write("\n")

        # ================================================================
        # SECTION 6: NAMED INDIVIDUALS (BY TITLE)
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [6] NAMED INDIVIDUALS / TITLED PERSONS  ({len(names_found)} unique)\n")
        out.write("=" * 90 + "\n\n")

        names_by_site = defaultdict(list)
        for name, sources in sorted(names_found.items()):
            for (s, f, fld) in sources:
                names_by_site[s].append((name, f))

        for site_key in SITES:
            entries = names_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ---\n")
            seen = set()
            for name, fname in sorted(set(entries)):
                if name not in seen:
                    all_files = set()
                    for n2, f2 in entries:
                        if n2 == name:
                            all_files.add(f2)
                    out.write(f"    {name}\n")
                    out.write(f"      Found in: {', '.join(sorted(all_files))}\n")
                    seen.add(name)
            out.write("\n")

        # ================================================================
        # SECTION 7: NIF / CIN REFERENCES
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [7] NATIONAL ID REFERENCES (NIF/CIN)  ({len(nif_cin_found)} unique)\n")
        out.write("=" * 90 + "\n\n")

        ids_by_site = defaultdict(list)
        for nif, sources in sorted(nif_cin_found.items()):
            for (s, f, fld) in sources:
                ids_by_site[s].append((nif, f))

        for site_key in SITES:
            entries = ids_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ---\n")
            seen = set()
            for nif, fname in sorted(set(entries)):
                if nif not in seen:
                    all_files = set()
                    for n2, f2 in entries:
                        if n2 == nif:
                            all_files.add(f2)
                    out.write(f"    {nif}\n")
                    out.write(f"      Found in: {', '.join(sorted(all_files))}\n")
                    seen.add(nif)
            out.write("\n")

        # ================================================================
        # SECTION 8: GRAVATAR HASHES
        # ================================================================
        out.write("=" * 90 + "\n")
        out.write(f"  [8] GRAVATAR HASHES (reversible to email)  ({len(gravatar_found)} unique)\n")
        out.write("=" * 90 + "\n\n")

        grav_by_site = defaultdict(list)
        for ghash, sources in sorted(gravatar_found.items()):
            for (s, f, fld) in sources:
                grav_by_site[s].append((ghash, f))

        for site_key in SITES:
            entries = grav_by_site.get(site_key, [])
            if not entries:
                continue
            out.write(f"  --- {site_key} ---\n")
            seen = set()
            for ghash, fname in sorted(set(entries)):
                if ghash not in seen:
                    all_files = set()
                    for g2, f2 in entries:
                        if g2 == ghash:
                            all_files.add(f2)
                    out.write(f"    {ghash}\n")
                    out.write(f"      Found in: {', '.join(sorted(all_files))}\n")
                    seen.add(ghash)
            out.write("\n")

        # ================================================================
        # GRAND TOTALS
        # ================================================================
        # De-dup donors
        seen_donor_keys = set()
        unique_donor_count = 0
        for rec in donor_records:
            key = (rec['name'], rec.get('email',''), rec.get('date',''))
            if key not in seen_donor_keys:
                seen_donor_keys.add(key)
                unique_donor_count += 1

        grand_total = (len(wp_users_found) + unique_donor_count + len(emails_found) +
                       len(phones_found) + len(addresses_found) + len(names_found) +
                       len(nif_cin_found) + len(gravatar_found))

        out.write("=" * 90 + "\n")
        out.write("  GRAND TOTALS\n")
        out.write("=" * 90 + "\n")
        out.write(f"  WP Users / Authors / Commenters:  {len(wp_users_found):>6}\n")
        out.write(f"  GiveWP Donor/Donation Records:    {unique_donor_count:>6}\n")
        out.write(f"  Email Addresses:                  {len(emails_found):>6}\n")
        out.write(f"  Phone Numbers:                    {len(phones_found):>6}\n")
        out.write(f"  Physical Addresses:               {len(addresses_found):>6}\n")
        out.write(f"  Named Individuals (titled):       {len(names_found):>6}\n")
        out.write(f"  NIF/CIN References:               {len(nif_cin_found):>6}\n")
        out.write(f"  Gravatar Hashes:                  {len(gravatar_found):>6}\n")
        out.write(f"  {'─'*42}{'─'*8}\n")
        out.write(f"  TOTAL UNIQUE PII ITEMS:           {grand_total:>6}\n")
        out.write("=" * 90 + "\n")
        out.write(f"\n  Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"  Source: Authorized OSINT assessment of public API endpoints\n")
        out.write("=" * 90 + "\n")

    # Console summary
    print(f"\n{'='*70}")
    print(f"  EXTRACTION COMPLETE")
    print(f"{'='*70}")
    print(f"  Total files processed:        {total_files}")
    print(f"  WP Users/Authors/Commenters:  {len(wp_users_found)}")
    print(f"  Donor/Donation Records:        {unique_donor_count}")
    print(f"  Emails:                        {len(emails_found)}")
    print(f"  Phones:                        {len(phones_found)}")
    print(f"  Addresses:                     {len(addresses_found)}")
    print(f"  Named Individuals:             {len(names_found)}")
    print(f"  NIF/CIN:                       {len(nif_cin_found)}")
    print(f"  Gravatar Hashes:               {len(gravatar_found)}")

    seen_dk = set()
    udc = 0
    for rec in donor_records:
        key = (rec['name'], rec.get('email',''), rec.get('date',''))
        if key not in seen_dk:
            seen_dk.add(key)
            udc += 1

    gt = (len(wp_users_found) + udc + len(emails_found) +
          len(phones_found) + len(addresses_found) + len(names_found) +
          len(nif_cin_found) + len(gravatar_found))
    print(f"  ───────────────────────────────")
    print(f"  GRAND TOTAL:                   {gt}")
    print(f"\n  Report written to: {OUTPUT_FILE}")


# Script entry point: call main() only when this file is executed directly,
# so that importing the module does not trigger a run.
if __name__ == '__main__':
    main()
