#!/usr/bin/env python3
"""
Recovery script to parse XLSX and DOCX files that segfaulted on Windows.
Runs via WSL2 Ubuntu where libraries work correctly.
Outputs JSON results that the main script can merge.
"""

import os
import re
import sys
import json
from pathlib import Path
from collections import defaultdict

import openpyxl
from docx import Document as DocxDocument

# Root of the scraped-site dump as seen from inside WSL2 (the Windows C: drive
# mounted at /mnt/c).
BASE = "/mnt/c/Users/Squir/Desktop/HAITI/DUMP"

# Files that failed on Windows (exit code 3221225477)
# (0xC0000005 — access violation, i.e. a segfault in a native parser library.)
FAILED_FILES = [
    # XLSX files (Douane)
    "DOUANE-GOUV/downloads/Liste-de-localisation-des-marchandises.xlsx",
    "DOUANE-GOUV/downloads/Liste-des-candidats-retenus-no_phone.xlsx",
    "DOUANE-GOUV/downloads/Liste-des-candidats-retenus.xlsx",
    "DOUANE-GOUV/downloads/Liste-des-conteneurs.xlsx",
    "DOUANE-GOUV/downloads/Liste-des-emballages-sydonia.xlsx",
    "DOUANE-GOUV/downloads/Liste-des-ports-internationaux-.xlsx",
    # DOCX files (DINEPA)
    "DINEPA-GOUV/downloads/RPF-EPARD-II-Fevrier-2023.docx",
    # DOCX files (MICT root)
    "MICT-GOUV/DG-Avis-de-recrutement-internes.docx",
    "MICT-GOUV/DOSSIER-PREQUALIFICATION-INTERIEUR.docx",
    "MICT-GOUV/Publication-Recrutement-avis.docx",
    # DOCX files (MICT documents)
    "MICT-GOUV/documents/Ouverture-Saison-Cyclonique-2020-Ministre-Interieur.docx",
    "MICT-GOUV/documents/PGES_Ravines-Belle-hotesse-et-Zetrier-MDUR-Revis-Dec-2018.docx",
    "MICT-GOUV/documents/Rapport_final_du_CPR_du_PGRAC_-_10____janvier_2019.docx",
    "MICT-GOUV/documents/REVISED-PAR-Ravines-Belle-Hotesse-et-Zetriye-Dec-13-FINAL-Approved.docx",
    # DOCX files (MICT uploads - duplicates of documents/)
    "MICT-GOUV/uploads/2019/01/Rapport_final_du_CPR_du_PGRAC_-_10____janvier_2019.docx",
    "MICT-GOUV/uploads/2019/02/PGES_Ravines-Belle-hotesse-et-Zetrier-MDUR-Revis-Dec-2018.docx",
    "MICT-GOUV/uploads/2019/02/REVISED-PAR-Ravines-Belle-Hotesse-et-Zetriye-Dec-13-FINAL-Approved.docx",
    "MICT-GOUV/uploads/2020/06/Ouverture-Saison-Cyclonique-2020-Ministre-Interieur.docx",
    "MICT-GOUV/uploads/2025/01/DG-Avis-de-recrutement-internes.docx",
    "MICT-GOUV/uploads/2025/01/Publication-Recrutement-avis.docx",
    "MICT-GOUV/uploads/2025/09/DOSSIER-PREQUALIFICATION-INTERIEUR.docx",
]

# Regex patterns
# Maps a finding name to (pattern string, re flags); consumed by scan_text().
# NOTE(review): "phone_general" is intentionally loose and will match many
# number-like strings — expect false positives; PHONE_JUNK filters some.
PATTERNS = {
    "email": (r'[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}', 0),
    "phone_haiti": (r'(?:\+509|509)[\s.\-]?\d{4}[\s.\-]?\d{4}', 0),
    "phone_general": (r'(?:\+\d{1,3}[\s.\-])?\(?\d{2,4}\)?[\s.\-]?\d{3,4}[\s.\-]?\d{3,4}', 0),
    "nif_tax": (r'\d{3}-\d{3}-\d{3}-\d{1}', 0),
    "cin_id": (r'(?:CIN|NIF|NIN)[:\s]*[\d\-]+', 0),
    "address": (r'(?:Rue|Avenue|Route|Boulevard|Blvd|Impasse|Ruelle)\s+[A-Z\u00C0-\u00DC][a-z\u00E0-\u00FC\s,.\-]+(?:Port-au-Prince|P[eé]tion-Ville|Delmas|Carrefour|Cap-Ha[iï]tien)?', 0),
    "named_person": (r'(?:Monsieur|Madame|M\.|Mme|Dr\.?|Ing\.?|Prof\.?)\s+[A-Z\u00C0-\u00DC][a-z\u00E0-\u00FC]+\s+[A-Z\u00C0-\u00DC][A-Z\u00C0-\u00DCa-z\u00E0-\u00FC]+', 0),
    "url": (r'https?://[^\s<>"\']+', 0),
    "credential": (r'(?:password|passwd|pwd|mot de passe|secret|token|api.?key)[:\s=]+\S+', re.IGNORECASE),
    "db_string": (r'(?:mysql|postgres|mongodb|redis)://[^\s<>"\']+', re.IGNORECASE),
    "ip_address": (r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 0),
}

# False-positive filters:
# EMAIL_JUNK — "emails" that are really asset filenames (e.g. logo@2x.png).
EMAIL_JUNK = re.compile(r'(?:\.png|\.jpg|\.gif|\.svg|\.css|\.js|\.woff|\.ttf|\.pdf|\.docx|\.xlsx|\.php|\.html)$', re.IGNORECASE)
# PHONE_JUNK — dotted quads (IP addresses) that also match the phone patterns.
PHONE_JUNK = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')

# Module-level accumulators shared by scan_text() and the process_* functions.
all_findings = defaultdict(list)  # pattern name -> [(file, location, value), ...]
all_emails = set()  # unique email strings across all files
all_phones = set()  # unique phone strings across all files
errors = []  # (relname, error message) for files that failed to parse
raw_data = {}  # Store raw extracted text for interesting files

# Compile each pattern once at import time. scan_text() runs once per
# spreadsheet row, so recompiling every pattern on every call (the previous
# behavior) did redundant work in the hot loop.
_COMPILED_PATTERNS = {
    name: re.compile(pattern_str, flags)
    for name, (pattern_str, flags) in PATTERNS.items()
}


def scan_text(text, source_file, location):
    """Run every pattern over *text* and record matches in module state.

    Each match is appended to ``all_findings[name]`` as a
    ``(source_file, location, value)`` tuple; email and phone matches are
    additionally collected into the ``all_emails`` / ``all_phones`` sets.

    Args:
        text: The text to scan; ``None``/empty/whitespace-only is ignored.
        source_file: Relative file name recorded with each finding.
        location: Human-readable position (e.g. ``"sheet 'X' row 3"``).
    """
    if not text or not text.strip():
        return
    for name, pat in _COMPILED_PATTERNS.items():
        for m in pat.finditer(text):
            val = m.group(0).strip()
            # Skip asset-filename "emails" and implausibly short matches.
            if name == "email" and (EMAIL_JUNK.search(val) or len(val) < 6):
                continue
            # Skip IP-shaped or too-short "phone numbers".
            if name in ("phone_haiti", "phone_general") and (PHONE_JUNK.match(val) or len(val) < 7):
                continue
            all_findings[name].append((source_file, location, val))
            if name == "email":
                all_emails.add(val)
            if name in ("phone_haiti", "phone_general"):
                all_phones.add(val)


def process_xlsx(filepath, relname):
    """Parse one .xlsx workbook and scan every row for sensitive patterns.

    Each row is flattened to a " | "-joined string and fed to scan_text();
    the first 5 rows of each sheet are kept in ``raw_data`` as a sample.

    Args:
        filepath: Absolute path to the workbook.
        relname: Dump-relative name used in findings and logs.

    Returns:
        True on success; False on failure (the error is appended to the
        module-level ``errors`` list — this script is best-effort recovery,
        so one bad file must not abort the run).
    """
    print(f"  [XLSX] {relname}", flush=True)
    wb = None
    try:
        wb = openpyxl.load_workbook(filepath, read_only=True, data_only=True)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            row_count = 0
            sample_rows = []
            for row_idx, row in enumerate(ws.iter_rows(values_only=True), 1):
                row_count += 1
                if row:
                    row_text = " | ".join(str(c) if c is not None else "" for c in row)
                    scan_text(row_text, relname, f"sheet '{sheet_name}' row {row_idx}")
                    if row_idx <= 5:
                        sample_rows.append(row_text)
            print(f"    Sheet '{sheet_name}': {row_count} rows")
            if sample_rows:
                raw_data[f"{relname} - {sheet_name}"] = sample_rows
        return True
    except Exception as e:
        errors.append((relname, str(e)))
        print(f"    ERROR: {e}")
        return False
    finally:
        # Read-only workbooks keep the underlying file handle open; the
        # original code leaked it when an exception fired mid-parse.
        if wb is not None:
            wb.close()


def process_docx(filepath, relname):
    """Extract all paragraph and table text from one .docx and scan it.

    Paragraph text comes first, then each table row flattened to a
    " | "-joined string; the combined text is passed to scan_text() in one
    shot and the first 10 text blocks are stored in ``raw_data`` as a sample.

    Args:
        filepath: Absolute path to the document.
        relname: Dump-relative name used in findings and logs.

    Returns:
        True on success; False on failure (recorded in ``errors``).
    """
    print(f"  [DOCX] {relname}", flush=True)
    try:
        document = DocxDocument(filepath)
        # Non-blank paragraphs, in document order.
        blocks = [para.text for para in document.paragraphs if para.text.strip()]
        # Then every table row, flattened cell-by-cell.
        for table in document.tables:
            blocks.extend(
                " | ".join(cell.text for cell in row.cells)
                for row in table.rows
            )
        if blocks:
            scan_text("\n".join(blocks), relname, "full document")
            # Store first 10 lines as sample
            raw_data[relname] = blocks[:10]
        print(f"    Extracted {len(blocks)} text blocks")
        return True
    except Exception as exc:
        errors.append((relname, str(exc)))
        print(f"    ERROR: {exc}")
        return False


def main():
    """Process every file in FAILED_FILES, then write and summarize results.

    Dispatches each file to the matching processor by extension, dumps all
    accumulated findings to ``recovery_results.json`` under BASE (so the
    Windows-side script can merge them), and prints a human-readable summary
    plus sample rows from each parsed file.
    """
    banner = "=" * 60
    print(banner)
    print("RECOVERY PARSER — Failed XLSX/DOCX via WSL2")
    print(banner)

    # Extension -> processor; anything else is silently ignored.
    handlers = {".xlsx": process_xlsx, ".docx": process_docx}
    ok_count = 0
    for relname in FAILED_FILES:
        full_path = os.path.join(BASE, relname)
        if not os.path.exists(full_path):
            print(f"  [SKIP] {relname} — file not found")
            continue
        handler = handlers.get(os.path.splitext(full_path)[1].lower())
        if handler is not None and handler(full_path, relname):
            ok_count += 1

    # Output JSON results for merging
    output = {
        "processed": ok_count,
        "errors": errors,
        "findings": dict(all_findings),
        "emails": sorted(all_emails),
        "phones": sorted(all_phones),
        "raw_data": raw_data,
    }
    output_path = os.path.join(BASE, "recovery_results.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    # Console summary: per-pattern match counts with unique-value tallies.
    print(f"\n{'='*60}")
    print("RECOVERY RESULTS")
    print(f"{'='*60}")
    print(f"Files processed: {ok_count}")
    print(f"Errors: {len(errors)}")
    for name in sorted(all_findings):
        matches = all_findings[name]
        distinct = {entry[2] for entry in matches}
        print(f"  {name:35s} {len(matches):5d} matches ({len(distinct)} unique)")
    print(f"\n  Emails: {len(all_emails)}")
    print(f"  Phones: {len(all_phones)}")

    # Dump the stored samples so key data is visible straight from the log.
    print(f"\n{'='*60}")
    print("SAMPLE DATA FROM KEY FILES")
    print(f"{'='*60}")
    for fname, rows in raw_data.items():
        print(f"\n--- {fname} ---")
        for row in rows:
            print(f"  {row[:200]}")

    print(f"\nResults saved to: {output_path}")


if __name__ == "__main__":
    main()
