#!/usr/bin/env python3
"""
Download documents from mict.gouv.ht WordPress uploads directory listings.
Parses locally-saved index HTML files, extracts document URLs, downloads them.
"""

import os
import re
import sys
import time
import urllib.parse
import urllib.request
import ssl
from html.parser import HTMLParser

# Remote uploads root; document URLs are BASE_URL + "<year>/<month>/<file>".
BASE_URL = "https://mict.gouv.ht/wp-content/uploads/"
# Local destination tree; files land under DUMP_DIR\<year>\<month>\.
DUMP_DIR = r"C:\Users\Squir\Desktop\HAITI\DUMP\MICT-GOUV\uploads"
# Directory holding the locally-saved directory-listing HTML files below.
INDEX_DIR = r"C:\Users\Squir\Desktop\HAITI\DUMP\MICT-GOUV"
# Plain-text run report written at the end of main().
MANIFEST_FILE = os.path.join(INDEX_DIR, "older-uploads-manifest.txt")

# Browser-like User-Agent; sent with every request (see download_file).
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Document extensions to download (lower-case; matched case-insensitively)
DOC_EXTENSIONS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.csv', '.txt', '.rtf'}

# WordPress thumbnail pattern: -NNNxNNN or -NNNxNNN_c before extension
THUMB_PATTERN = re.compile(r'-\d+x\d+(_c)?\.')

# Index files to process: (year, month, locally-saved index filename)
INDEX_FILES = [
    (2016, 3, "uploads-2016-03-index.html"),
    (2016, 4, "uploads-2016-04-index.html"),
    (2019, 1, "uploads-2019-01-index.html"),
    (2019, 2, "uploads-2019-02-index.html"),
    (2020, 5, "uploads-2020-05-index.html"),
    (2020, 6, "uploads-2020-06-index.html"),
    (2024, 9, "uploads-2024-09-index.html"),
    (2025, 1, "uploads-2025-01-index.html"),
    (2025, 9, "uploads-2025-09-index.html"),
]


class LinkExtractor(HTMLParser):
    """Collect the href attribute of every <a> tag fed to the parser."""

    def __init__(self):
        super().__init__()
        # Hrefs in document order, as they appear in the HTML.
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        # attrs is a list of (name, value) pairs; keep non-empty hrefs only.
        self.links.extend(
            value for name, value in attrs if name == 'href' and value
        )


def extract_links_from_index(index_path):
    """Parse a saved Apache directory-listing HTML file and return its hrefs.

    Returns the list collected by LinkExtractor, in document order.
    """
    extractor = LinkExtractor()
    # errors='replace' keeps parsing alive on mis-encoded bytes.
    with open(index_path, 'r', encoding='utf-8', errors='replace') as fh:
        extractor.feed(fh.read())
    return extractor.links


def is_document(filename):
    """Return True for downloadable document filenames.

    A link qualifies when its extension (case-insensitive) is in
    DOC_EXTENSIONS and the name does not carry a WordPress thumbnail
    suffix such as "-300x200." or "-150x150_c.".
    """
    extension = os.path.splitext(filename.lower())[1]
    return extension in DOC_EXTENSIONS and THUMB_PATTERN.search(filename) is None


def extract_size_from_index(index_path, filename):
    """Return the size shown in the directory listing for *filename*.

    Looks for the right-aligned size cell that follows the file's link in
    the saved listing HTML; returns the size string (e.g. "1.2M") or
    "unknown" when no match is found.
    """
    with open(index_path, 'r', encoding='utf-8', errors='replace') as fh:
        listing = fh.read()
    # Apache listings keep each entry on one line: link, then a size cell.
    size_re = rf'href="{re.escape(filename)}".*?align="right">\s*([\d.]+[KMG]?)\s*</td>'
    hit = re.search(size_re, listing)
    return hit.group(1).strip() if hit else "unknown"


def download_file(url, dest_path, retries=3):
    """Download *url* to *dest_path* with retries and browser-like headers.

    Returns a tuple (success, size_in_bytes); size is 0 on failure.
    Retries with linear backoff (2s, 4s, ...) between attempts.
    """
    # SECURITY NOTE: certificate verification is deliberately disabled
    # because some gov sites serve broken cert chains. Do not reuse this
    # context anywhere authenticity matters.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    headers = {
        'User-Agent': USER_AGENT,
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://mict.gouv.ht/',
    }

    req = urllib.request.Request(url, headers=headers)

    for attempt in range(retries):
        try:
            # Context manager closes the response even when read() raises
            # (the original leaked the connection on every attempt).
            with urllib.request.urlopen(req, context=ctx, timeout=60) as response:
                data = response.read()
            # Guard against a bare filename: makedirs('') would raise.
            dest_dir = os.path.dirname(dest_path)
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)
            with open(dest_path, 'wb') as f:
                f.write(data)
            return True, len(data)
        except Exception as e:
            print(f"  Attempt {attempt+1}/{retries} failed: {e}")
            if attempt < retries - 1:
                time.sleep(2 * (attempt + 1))
    return False, 0


def main():
    """Walk INDEX_FILES, download every listed document, write a manifest.

    For each (year, month) index: parse the saved listing, keep document
    links, download any file not already present under
    DUMP_DIR/<year>/<month>/, then write a summary to MANIFEST_FILE.
    """
    manifest_entries = []
    total_downloaded = 0
    total_failed = 0
    total_skipped = 0

    print("=" * 70)
    print("MICT.GOUV.HT - Older Uploads Document Downloader")
    print("=" * 70)

    for year, month, index_filename in INDEX_FILES:
        index_path = os.path.join(INDEX_DIR, index_filename)
        if not os.path.exists(index_path):
            print(f"\n[SKIP] Index file not found: {index_filename}")
            continue

        print(f"\n{'='*60}")
        print(f"Processing: {year}/{month:02d} ({index_filename})")
        print(f"{'='*60}")

        # Extract all links, then keep downloadable documents only.
        links = extract_links_from_index(index_path)
        doc_links = [link for link in links if is_document(link)]

        if not doc_links:
            print(f"  No documents found in {index_filename}")
            continue

        print(f"  Found {len(doc_links)} document(s)")

        # Local destination directory for this year/month.
        dest_dir = os.path.join(DUMP_DIR, str(year), f"{month:02d}")
        os.makedirs(dest_dir, exist_ok=True)

        for filename in doc_links:
            # Decode URL-encoded filename for local storage...
            decoded_filename = urllib.parse.unquote(filename)
            # ...but keep the original (still percent-encoded) href for the
            # URL. BUG FIX: this previously appended the literal string
            # "(unknown)" instead of the filename, so every request
            # fetched a nonexistent path.
            url = BASE_URL + f"{year}/{month:02d}/{filename}"
            dest_path = os.path.join(dest_dir, decoded_filename)

            # Size string as shown in the listing (may be "unknown").
            listed_size = extract_size_from_index(index_path, filename)

            # Skip files already downloaded with non-zero size.
            if os.path.exists(dest_path) and os.path.getsize(dest_path) > 0:
                existing_size = os.path.getsize(dest_path)
                print(f"  [SKIP] Already exists: {decoded_filename} ({existing_size} bytes)")
                manifest_entries.append(f"[EXISTS] {year}/{month:02d}/{decoded_filename} (listed: {listed_size})")
                total_skipped += 1
                continue

            print(f"  [DOWNLOAD] {decoded_filename} (listed: {listed_size}) ...")
            success, size = download_file(url, dest_path)

            if success:
                print(f"    -> OK ({size:,} bytes)")
                manifest_entries.append(f"[OK] {year}/{month:02d}/{decoded_filename} ({size:,} bytes, listed: {listed_size})")
                total_downloaded += 1
            else:
                print(f"    -> FAILED")
                manifest_entries.append(f"[FAILED] {year}/{month:02d}/{decoded_filename} (listed: {listed_size})")
                total_failed += 1

            # Be polite - small delay between downloads
            time.sleep(1)

    # Write the manifest summarizing the whole run.
    print(f"\n{'='*70}")
    print("Writing manifest...")
    with open(MANIFEST_FILE, 'w', encoding='utf-8') as f:
        f.write("MICT.GOUV.HT - Older Uploads Document Manifest\n")
        f.write(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Base URL: {BASE_URL}\n")
        f.write(f"{'='*70}\n\n")
        f.write(f"SUMMARY:\n")
        f.write(f"  Downloaded: {total_downloaded}\n")
        f.write(f"  Skipped (already existed): {total_skipped}\n")
        f.write(f"  Failed: {total_failed}\n")
        f.write(f"  Total documents found: {total_downloaded + total_skipped + total_failed}\n\n")
        f.write(f"{'='*70}\n")
        f.write("DOCUMENTS:\n\n")
        for entry in manifest_entries:
            f.write(f"  {entry}\n")
        f.write(f"\n{'='*70}\n")

    print(f"\nDONE!")
    print(f"  Downloaded: {total_downloaded}")
    print(f"  Skipped: {total_skipped}")
    print(f"  Failed: {total_failed}")
    print(f"  Manifest: {MANIFEST_FILE}")


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
