#!/usr/bin/env python3
"""
Merge recovery results into the main PII report.
Read existing report + recovery JSON, produce updated final reports.
"""

import json
import re
from pathlib import Path
from collections import defaultdict
from datetime import datetime

# Root folder holding the scraped document dump and all report artifacts.
BASE = Path(r"C:\Users\Squir\Desktop\HAITI\DUMP")

# Recovery-pass output (JSON) produced by the WSL2 parser run.
recovery_path = BASE / "recovery_results.json"
with recovery_path.open("r", encoding="utf-8") as handle:
    recovery = json.load(handle)

def _load_values(path):
    """Return the set of non-blank, non-comment lines from *path*.

    The extraction files carry a small ``#``-prefixed header; those lines
    are skipped, the rest are stripped and deduplicated.  Extracted as a
    helper because the email and phone loads were verbatim duplicates.
    """
    values = set()
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            if entry and not entry.startswith("#"):
                values.add(entry)
    return values

# Baseline values produced by the main extraction run.
existing_emails = _load_values(BASE / "DOCUMENT-EMAILS.txt")
existing_phones = _load_values(BASE / "DOCUMENT-PHONES.txt")

# ── Merge baseline extraction with the recovery-pass output ────────
new_emails = set(recovery.get("emails", []))
new_phones = set(recovery.get("phones", []))
all_emails = existing_emails.union(new_emails)
all_phones = existing_phones.union(new_phones)

# Per-category findings and raw text captured by the recovery pass.
recovery_findings = recovery.get("findings", {})
recovery_raw = recovery.get("raw_data", {})

# Run statistics, hard-coded from the two console runs:
#   main run: 206 processed / 27 errors; recovery run: 21 processed / 0 errors.
# The 21 recovered files were among the 27 main-run failures, so 6 remain.
total_processed = 206 + 21
total_errors = 27 - 21

# Human-readable report labels for each PII category key.
category_labels = dict(
    email="Email Addresses",
    phone_haiti="Haiti Phone Numbers (+509)",
    phone_general="Phone Numbers (General)",
    nif_tax="NIF Tax IDs",
    cin_id="CIN/NIF/NIN IDs",
    address="Physical Addresses",
    named_person="Named Individuals",
    url="URLs",
    credential="Credentials/Passwords",
    db_string="Database Connection Strings",
    ip_address="IP Addresses",
)

# (total_matches, unique_values) per category, taken from the main
# run's console output; recovery-pass counts are merged in below.
main_counts = dict(
    email=(143, 55),
    phone_haiti=(5, 4),
    phone_general=(5203, 3143),
    nif_tax=(20, 7),
    cin_id=(0, 0),
    address=(228, 64),
    named_person=(315, 70),
    url=(308, 116),
    credential=(13, 5),
    db_string=(0, 0),
    ip_address=(700, 187),
)

# ── Per-category counts from the recovery pass ─────────────────────
# Each findings item is a (file, location, value) triple; index 2 is
# the matched value, deduplicated for the "unique" column.
recovery_counts = {}
for key in category_labels:
    found = recovery_findings.get(key, [])
    distinct = {triple[2] for triple in found}
    recovery_counts[key] = (len(found), len(distinct))

# ── Combined counts ────────────────────────────────────────────────
# Unique totals are approximate: a value seen in both runs counts twice.
combined_counts = {}
for key in category_labels:
    main_m, main_u = main_counts.get(key, (0, 0))
    rec_m, rec_u = recovery_counts.get(key, (0, 0))
    combined_counts[key] = (main_m + rec_m, main_u + rec_u)

# ── Write updated main report ──────────────────────────────────────
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Snapshot the current report before the write below overwrites it.
old_report = (BASE / "DOCUMENT-PII-REPORT.md").read_text(encoding="utf-8")

with open(BASE / "DOCUMENT-PII-REPORT.md", "w", encoding="utf-8") as rpt:
    # Header block: combined run statistics for both parsing passes.
    rpt.write("# Haiti OSINT Document PII & Credential Report\n\n")
    rpt.write(f"**Generated:** {now}  \n")
    rpt.write(f"**Documents parsed:** {total_processed} (206 main + 21 recovery via WSL2)  \n")
    rpt.write(f"**Remaining parse errors:** {total_errors}  \n")
    rpt.write(f"**Images EXIF-scanned:** 1,365  \n")
    rpt.write(f"**Total document files found:** 233  \n\n")

    # Summary table of combined (main + recovery) match counts.
    rpt.write("---\n\n## Summary of Findings (Combined)\n\n")
    rpt.write("| Category | Total Matches | Unique Values |\n")
    rpt.write("|----------|--------------|---------------|\n")
    total_matches = 0  # also read by the summary prints at the end of the script
    for key in category_labels:
        matches, unique = combined_counts[key]
        total_matches += matches
        rpt.write(f"| {category_labels[key]} | {matches:,} | {unique:,} |\n")
    rpt.write(f"\n**TOTAL REGEX MATCHES:** {total_matches:,}  \n")
    rpt.write(f"**Unique emails:** {len(all_emails)}  \n")
    rpt.write(f"**Unique phones:** {len(all_phones):,}  \n")
    rpt.write(f"**EXIF GPS locations:** 0  \n\n")

    # ── CRITICAL FINDING: Douane candidate list ────────────────────
    rpt.write("---\n\n## CRITICAL FINDINGS\n\n")

    rpt.write("### 1. Customs (Douane) Candidate List with Phone Numbers\n\n")
    rpt.write("**File:** `DOUANE-GOUV/downloads/Liste-des-candidats-retenus.xlsx`  \n")
    rpt.write("**Records:** 3,233 candidates with full PII  \n")
    rpt.write("**Fields:** Code, Last Name, First Name, Sex, Phone Number (+509), Department  \n\n")
    rpt.write("This file contains the complete list of candidates retained for the Haiti Customs (AGD) examination, ")
    rpt.write("including personal phone numbers in +509 format. A redacted version (`Liste-des-candidats-retenus-no_phone.xlsx`) ")
    rpt.write("also exists without phone numbers.\n\n")
    rpt.write("**Sample records:**\n")
    rpt.write("```\n")
    rpt.write("Code         | Last Name  | First Name   | Sex | Phone            | Department\n")
    rpt.write("OE12AG7570   | Abdon      | Gerald       | M   | (+509) 5544-6924 | OUEST\n")
    rpt.write("OE12AO1940   | Abel       | Osmane       | M   | (+509) 4019-1719 | OUEST\n")
    rpt.write("ND18AC1872   | ABEL       | CAMY         | M   | (+509) 3259-5650 | NORD_EST\n")
    rpt.write("OE12AJ4599   | Abelard    | Jude Wesly   | M   | (+509) 3517-3610 | OUEST\n")
    rpt.write("```\n\n")

    rpt.write("### 2. Goods Location & Container Tracking (SYDONIA System)\n\n")
    rpt.write("**Files:**\n")
    rpt.write("- `Liste-de-localisation-des-marchandises.xlsx` - 55 rows of warehouse/terminal codes and locations\n")
    rpt.write("- `Liste-des-conteneurs.xlsx` - 17 container type codes\n")
    rpt.write("- `Liste-des-emballages-sydonia.xlsx` - 41 packaging type codes\n")
    rpt.write("- `Liste-des-ports-internationaux-.xlsx` - 895 international port codes used by Haiti Customs\n\n")
    rpt.write("These map directly to Haiti's SYDONIA customs clearance system infrastructure.\n\n")

    rpt.write("### 3. Haiti Budget Documents with NIF Tax IDs\n\n")
    rpt.write("**Files:**\n")
    rpt.write("- `DOUANE-GOUV/downloads/BUDGET-RECTIFICATIF-2014-2015.pdf` (2,315 matches) - Contains extensive phone/fax numbers for government ministries and agencies\n")
    rpt.write("- `DOUANE-GOUV/downloads/LOI-DE-FINANCES-2013-2014-.-DROITS-SPECIAUX-ART-18.pdf` (2,028 matches) - Finance law with government contact details\n")
    rpt.write("- `DOUANE-GOUV/downloads/BUDGET-GENERAL-DE-LA-REPUBLIQUE-DHAITI-2016-2017.DROIT-SPECIAL-MODIFIE.-ARTICLE-7.pdf` (44 matches)\n\n")

    rpt.write("### 4. Credential / Password Matches\n\n")
    # Carry over the credential section from the previous report: collect
    # every line between the credentials heading and the next "###" heading.
    # Lines are gathered in a list and joined once (the old `+=` string
    # build was quadratic); the result is identical after .strip().
    cred_lines = []
    in_cred = False
    for line in old_report.split("\n"):
        if "### Credentials / Passwords" in line or "### Credentials/Passwords" in line:
            in_cred = True
            continue
        if in_cred and line.startswith("###"):
            break
        if in_cred:
            cred_lines.append(line)
    cred_section = "\n".join(cred_lines)
    if cred_section.strip():
        rpt.write(cred_section.strip() + "\n\n")
    else:
        # Fallback when the old report had no credential section to carry over.
        rpt.write("13 credential-pattern matches found across documents. See detailed findings below.\n\n")

    rpt.write("### 5. Named Government Officials\n\n")
    rpt.write("315+ matches of named individuals with titles (Monsieur, Madame, M., Mme, Dr., Ing., Prof.) ")
    rpt.write("extracted from official government documents across DINEPA, MICT, and Douane sources.\n\n")

    # ── Now include the original detailed report content ────────────
    # Find the "## Detailed Findings" section onwards.
    detailed_start = old_report.find("## Detailed Findings")
    if detailed_start != -1:  # was `> 0`, which would drop a match at offset 0
        rpt.write("---\n\n")
        rpt.write(old_report[detailed_start:])

    # ── Append recovery-specific findings ───────────────────────────
    rpt.write("\n---\n\n## Recovery Parser Findings (XLSX/DOCX via WSL2)\n\n")
    # .get() for consistency with every other access to the recovery dict
    # (a missing key previously crashed mid-write, truncating the report).
    rpt.write(f"**Files recovered:** {recovery.get('processed', 0)}  \n")
    rpt.write(f"**Errors:** {len(recovery.get('errors', []))}  \n\n")

    for key in category_labels:
        items = recovery_findings.get(key, [])
        if not items:
            continue
        unique = {item[2] for item in items}
        rpt.write(f"### {category_labels[key]} ({len(items)} matches, {len(unique)} unique)\n\n")

        # Group (location, value) pairs by source file. Loop variable
        # renamed from `f` so it no longer shadows the module-level
        # file-handle name used elsewhere in this script.
        by_file = defaultdict(list)
        for src, loc, val in items:
            by_file[src].append((loc, val))

        for fname in sorted(by_file.keys()):
            rpt.write(f"**{fname}**\n")
            seen = set()
            for loc, val in by_file[fname]:
                display = val[:250]  # truncate very long matched values
                if display not in seen:
                    rpt.write(f"- `{display}` ({loc})\n")
                    seen.add(display)
                    # NOTE: check happens after the write, so up to 51
                    # distinct values are emitted before truncation.
                    if len(seen) > 50:
                        rpt.write(f"- ... and more\n")
                        break
            rpt.write("\n")

def _write_value_file(path, title, label, values, sort_key=None):
    """Write one extracted value per line to *path*, with a '#' header.

    path:     destination file
    title:    first header line (without the leading '# ')
    label:    noun used in the 'Total unique ...:' header line
    values:   collection of strings to emit, one per line
    sort_key: optional key function for sorted() (e.g. str.lower)

    Extracted as a helper because the email and phone writers were
    verbatim duplicates; the emitted bytes are unchanged.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.write(f"# {title}\n")
        out.write(f"# Generated: {now}\n")
        out.write(f"# Total unique {label}: {len(values)}\n\n")
        for value in sorted(values, key=sort_key):
            out.write(value + "\n")

# ── Update emails file (case-insensitive sort, as before) ──────────
_write_value_file(BASE / "DOCUMENT-EMAILS.txt",
                  "Haiti OSINT Document Email Extraction (Combined)",
                  "emails", all_emails, sort_key=str.lower)

# ── Update phones file ─────────────────────────────────────────────
_write_value_file(BASE / "DOCUMENT-PHONES.txt",
                  "Haiti OSINT Document Phone Extraction (Combined)",
                  "phones", all_phones)

# Console summary of the merged run.
divider = "=" * 60
print(divider)
print("MERGED REPORT COMPLETE")
print(divider)
print(f"Total documents parsed: {total_processed}")
print(f"Total unique emails:    {len(all_emails)}")
print(f"Total unique phones:    {len(all_phones):,}")
print(f"Total regex matches:    {total_matches:,}")
print()
print("KEY FINDING: Douane candidate list = 3,233 records w/ phones")
print("KEY FINDING: 895 international port codes (SYDONIA)")
print("KEY FINDING: 55 goods location codes (SYDONIA)")
print()
print("Reports updated:")
print(f"  {BASE / 'DOCUMENT-PII-REPORT.md'}")
print(f"  {BASE / 'DOCUMENT-EMAILS.txt'}")
print(f"  {BASE / 'DOCUMENT-PHONES.txt'}")
