#!/usr/bin/env python3
"""
Download documents from mict.gouv.ht WordPress uploads directory listings.
Parses locally-saved index HTML files, extracts document URLs, downloads them.
"""

import os
import re
import sys
import time
import urllib.parse
import urllib.request
import ssl
from html.parser import HTMLParser

# Remote uploads root; document URLs are BASE_URL + "<year>/<month>/<file>".
BASE_URL = "https://mict.gouv.ht/wp-content/uploads/"
# Local destination tree; files land under DUMP_DIR\<year>\<month>\.
DUMP_DIR = r"C:\Users\Squir\Desktop\HAITI\DUMP\MICT-GOUV\uploads"
# Directory holding the locally-saved directory-listing HTML files below.
INDEX_DIR = r"C:\Users\Squir\Desktop\HAITI\DUMP\MICT-GOUV"
# Plain-text run report written at the end of main().
MANIFEST_FILE = os.path.join(INDEX_DIR, "older-uploads-manifest.txt")

# Browser-like User-Agent; sent with every request (see download_file).
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Document extensions to download (lower-case; matched case-insensitively)
DOC_EXTENSIONS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.csv', '.txt', '.rtf'}

# WordPress thumbnail pattern: -NNNxNNN or -NNNxNNN_c before extension
THUMB_PATTERN = re.compile(r'-\d+x\d+(_c)?\.')

# Index files to process: (year, month, locally-saved index filename)
INDEX_FILES = [
    (2016, 3, "uploads-2016-03-index.html"),
    (2016, 4, "uploads-2016-04-index.html"),
    (2019, 1, "uploads-2019-01-index.html"),
    (2019, 2, "uploads-2019-02-index.html"),
    (2020, 5, "uploads-2020-05-index.html"),
    (2020, 6, "uploads-2020-06-index.html"),
    (2024, 9, "uploads-2024-09-index.html"),
    (2025, 1, "uploads-2025-01-index.html"),
    (2025, 9, "uploads-2025-09-index.html"),
]


class LinkExtractor(HTMLParser):
    """Collect the href attribute of every <a> tag fed to the parser."""

    def __init__(self):
        super().__init__()
        # Hrefs in document order, as they appear in the HTML.
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        # attrs is a list of (name, value) pairs; keep non-empty hrefs only.
        self.links.extend(
            value for name, value in attrs if name == 'href' and value
        )


def extract_links_from_index(index_path):
    """Parse a saved Apache directory-listing HTML file and return its hrefs.

    Returns the list collected by LinkExtractor, in document order.
    """
    extractor = LinkExtractor()
    # errors='replace' keeps parsing alive on mis-encoded bytes.
    with open(index_path, 'r', encoding='utf-8', errors='replace') as fh:
        extractor.feed(fh.read())
    return extractor.links


def is_document(filename):
    """Return True for downloadable document filenames.

    A link qualifies when its extension (case-insensitive) is in
    DOC_EXTENSIONS and the name does not carry a WordPress thumbnail
    suffix such as "-300x200." or "-150x150_c.".
    """
    extension = os.path.splitext(filename.lower())[1]
    return extension in DOC_EXTENSIONS and THUMB_PATTERN.search(filename) is None


def extract_size_from_index(index_path, filename):
    """Return the size shown in the directory listing for *filename*.

    Looks for the right-aligned size cell that follows the file's link in
    the saved listing HTML; returns the size string (e.g. "1.2M") or
    "unknown" when no match is found.
    """
    with open(index_path, 'r', encoding='utf-8', errors='replace') as fh:
        listing = fh.read()
    # Apache listings keep each entry on one line: link, then a size cell.
    size_re = rf'href="{re.escape(filename)}".*?align="right">\s*([\d.]+[KMG]?)\s*</td>'
    hit = re.search(size_re, listing)
    return hit.group(1).strip() if hit else "unknown"


def download_file(url, dest_path, retries=3):
    """Download *url* to *dest_path* with retries and browser-like headers.

    Returns a tuple (success, size_in_bytes); size is 0 on failure.
    Retries with linear backoff (2s, 4s, ...) between attempts.
    """
    # SECURITY NOTE: certificate verification is deliberately disabled
    # because some gov sites serve broken cert chains. Do not reuse this
    # context anywhere authenticity matters.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    headers = {
        'User-Agent': USER_AGENT,
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://mict.gouv.ht/',
    }

    req = urllib.request.Request(url, headers=headers)

    for attempt in range(retries):
        try:
            # Context manager closes the response even when read() raises
            # (the original leaked the connection on every attempt).
            with urllib.request.urlopen(req, context=ctx, timeout=60) as response:
                data = response.read()
            # Guard against a bare filename: makedirs('') would raise.
            dest_dir = os.path.dirname(dest_path)
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)
            with open(dest_path, 'wb') as f:
                f.write(data)
            return True, len(data)
        except Exception as e:
            print(f"  Attempt {attempt+1}/{retries} failed: {e}")
            if attempt < retries - 1:
                time.sleep(2 * (attempt + 1))
    return False, 0


def main():
    """Walk INDEX_FILES, download every listed document, write a manifest.

    For each (year, month) index: parse the saved listing, keep document
    links, download any file not already present under
    DUMP_DIR/<year>/<month>/, then write a summary to MANIFEST_FILE.
    """
    manifest_entries = []
    total_downloaded = 0
    total_failed = 0
    total_skipped = 0

    print("=" * 70)
    print("MICT.GOUV.HT - Older Uploads Document Downloader")
    print("=" * 70)

    for year, month, index_filename in INDEX_FILES:
        index_path = os.path.join(INDEX_DIR, index_filename)
        if not os.path.exists(index_path):
            print(f"\n[SKIP] Index file not found: {index_filename}")
            continue

        print(f"\n{'='*60}")
        print(f"Processing: {year}/{month:02d} ({index_filename})")
        print(f"{'='*60}")

        # Extract all links, then keep downloadable documents only.
        links = extract_links_from_index(index_path)
        doc_links = [link for link in links if is_document(link)]

        if not doc_links:
            print(f"  No documents found in {index_filename}")
            continue

        print(f"  Found {len(doc_links)} document(s)")

        # Local destination directory for this year/month.
        dest_dir = os.path.join(DUMP_DIR, str(year), f"{month:02d}")
        os.makedirs(dest_dir, exist_ok=True)

        for filename in doc_links:
            # Decode URL-encoded filename for local storage...
            decoded_filename = urllib.parse.unquote(filename)
            # ...but keep the original (still percent-encoded) href for the
            # URL. BUG FIX: this previously appended the literal string
            # "(unknown)" instead of the filename, so every request
            # fetched a nonexistent path.
            url = BASE_URL + f"{year}/{month:02d}/{filename}"
            dest_path = os.path.join(dest_dir, decoded_filename)

            # Size string as shown in the listing (may be "unknown").
            listed_size = extract_size_from_index(index_path, filename)

            # Skip files already downloaded with non-zero size.
            if os.path.exists(dest_path) and os.path.getsize(dest_path) > 0:
                existing_size = os.path.getsize(dest_path)
                print(f"  [SKIP] Already exists: {decoded_filename} ({existing_size} bytes)")
                manifest_entries.append(f"[EXISTS] {year}/{month:02d}/{decoded_filename} (listed: {listed_size})")
                total_skipped += 1
                continue

            print(f"  [DOWNLOAD] {decoded_filename} (listed: {listed_size}) ...")
            success, size = download_file(url, dest_path)

            if success:
                print(f"    -> OK ({size:,} bytes)")
                manifest_entries.append(f"[OK] {year}/{month:02d}/{decoded_filename} ({size:,} bytes, listed: {listed_size})")
                total_downloaded += 1
            else:
                print(f"    -> FAILED")
                manifest_entries.append(f"[FAILED] {year}/{month:02d}/{decoded_filename} (listed: {listed_size})")
                total_failed += 1

            # Be polite - small delay between downloads
            time.sleep(1)

    # Write the manifest summarizing the whole run.
    print(f"\n{'='*70}")
    print("Writing manifest...")
    with open(MANIFEST_FILE, 'w', encoding='utf-8') as f:
        f.write("MICT.GOUV.HT - Older Uploads Document Manifest\n")
        f.write(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Base URL: {BASE_URL}\n")
        f.write(f"{'='*70}\n\n")
        f.write(f"SUMMARY:\n")
        f.write(f"  Downloaded: {total_downloaded}\n")
        f.write(f"  Skipped (already existed): {total_skipped}\n")
        f.write(f"  Failed: {total_failed}\n")
        f.write(f"  Total documents found: {total_downloaded + total_skipped + total_failed}\n\n")
        f.write(f"{'='*70}\n")
        f.write("DOCUMENTS:\n\n")
        for entry in manifest_entries:
            f.write(f"  {entry}\n")
        f.write(f"\n{'='*70}\n")

    print(f"\nDONE!")
    print(f"  Downloaded: {total_downloaded}")
    print(f"  Skipped: {total_skipped}")
    print(f"  Failed: {total_failed}")
    print(f"  Manifest: {MANIFEST_FILE}")


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
