#!/bin/bash
# Deep recursive enumeration of repodatos.atdt.gob.mx
# Crawls JSON directory listings, catalogs all files with sizes
# Usage: bash repodatos-enumerate.sh [output_dir]

# Abort on references to unset variables. Deliberately NOT using -e:
# per-directory fetch/parse failures are expected and handled inline.
set -u

OUT_DIR="${1:-/opt/repodatos}"
# If the output root cannot be created, nothing downstream can work.
mkdir -p "$OUT_DIR" || { echo "ERROR: cannot create $OUT_DIR" >&2; exit 1; }

readonly BASE_URL="https://repodatos.atdt.gob.mx"
readonly TREE_FILE="$OUT_DIR/full-tree.txt"
readonly SUMMARY_FILE="$OUT_DIR/summary.txt"
readonly RAW_DIR="$OUT_DIR/raw-listings"
mkdir -p "$RAW_DIR" || { echo "ERROR: cannot create $RAW_DIR" >&2; exit 1; }

# Start each run with a fresh tree file.
: > "$TREE_FILE"

# Running totals, accumulated by crawl_dir (intentionally global; the
# read loop in crawl_dir runs in the current shell, so updates persist).
total_files=0
total_size=0
total_dirs=0

#######################################
# Recursively walk one directory listing, appending records to TREE_FILE.
# Globals:   TREE_FILE, RAW_DIR (read); total_files, total_size,
#            total_dirs (written)
# Arguments: $1 - listing URL (must end with '/')
#            $2 - display path prefix (must end with '/')
#            $3 - current recursion depth
# Outputs:   progress lines on stderr; records appended to TREE_FILE
#######################################
crawl_dir() {
  local url="$1"
  local prefix="$2"
  local depth="$3"

  # Hard depth cap guards against listing cycles / pathological nesting.
  if [ "$depth" -gt 8 ]; then
    echo "MAX_DEPTH|$url" >> "$TREE_FILE"
    return
  fi

  local json
  json=$(curl -skL --max-time 15 "$url" 2>/dev/null)
  if [ -z "$json" ]; then
    echo "FETCH_FAIL|$url" >> "$TREE_FILE"
    return
  fi

  # Save raw listing. printf instead of echo: a payload beginning with
  # '-n'/'-e' would otherwise be eaten as echo flags. Parameter expansion
  # replaces the tr fork.
  local safe_name="${prefix//\//_}"
  printf '%s\n' "$json" > "$RAW_DIR/${safe_name}.json"

  # Parse JSON array of objects with name, type, mtime, size.
  # Using python3 for reliable JSON parsing; output is pipe-delimited.
  local entries
  entries=$(printf '%s\n' "$json" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
        for item in data:
            name = item.get('name','')
            ftype = item.get('type','')
            size = item.get('size', 0)
            mtime = item.get('mtime','')
            print(f'{ftype}|{name}|{size}|{mtime}')
except:
    pass
" 2>/dev/null)

  if [ -z "$entries" ]; then
    echo "PARSE_FAIL|$url" >> "$TREE_FILE"
    return
  fi

  # Note: read runs in the current shell (here-string, not a pipe), so the
  # global counters updated below survive the loop.
  while IFS='|' read -r ftype fname fsize fmtime; do
    [ -z "$fname" ] && continue
    local full_path="${prefix}${fname}"

    if [ "$ftype" = "directory" ]; then
      total_dirs=$((total_dirs + 1))
      echo "DIR|${full_path}/|$fmtime" >> "$TREE_FILE"
      echo "[DIR ] ${full_path}/" >&2
      # NOTE(review): fname is interpolated into the URL verbatim; names
      # containing spaces or '%' would need URL-encoding — confirm against
      # actual listings.
      crawl_dir "${url}${fname}/" "${full_path}/" $((depth + 1))
    else
      # Guard: a missing/non-numeric size field would otherwise abort the
      # whole script with an arithmetic syntax error.
      case "$fsize" in
        ''|*[!0-9]*) fsize=0 ;;
      esac
      total_files=$((total_files + 1))
      total_size=$((total_size + fsize))
      echo "FILE|${full_path}|${fsize}|$fmtime" >> "$TREE_FILE"
    fi
  done <<< "$entries"
}

echo "=== Starting recursive enumeration of $BASE_URL ==="
echo "=== $(date) ==="
crawl_dir "$BASE_URL/" "/" 0

# Generate summary (grouped redirect: one open of the file for the header).
{
  echo "=== Enumeration Complete ==="
  echo "Date: $(date)"
  echo "Total directories: $total_dirs"
  echo "Total files: $total_files"
  echo "Total size (bytes): $total_size"
} > "$SUMMARY_FILE"

# Human-readable size. awk is POSIX-mandated everywhere, unlike bc, which
# is frequently missing from minimal installs. Also covers the KB range the
# original skipped.
if [ "$total_size" -gt 1073741824 ]; then
  hr_size=$(awk -v b="$total_size" 'BEGIN { printf "%.2f", b / 1073741824 }')
  echo "Total size: ${hr_size} GB" >> "$SUMMARY_FILE"
elif [ "$total_size" -gt 1048576 ]; then
  hr_size=$(awk -v b="$total_size" 'BEGIN { printf "%.2f", b / 1048576 }')
  echo "Total size: ${hr_size} MB" >> "$SUMMARY_FILE"
elif [ "$total_size" -gt 1024 ]; then
  hr_size=$(awk -v b="$total_size" 'BEGIN { printf "%.2f", b / 1024 }')
  echo "Total size: ${hr_size} KB" >> "$SUMMARY_FILE"
fi

echo "" >> "$SUMMARY_FILE"
echo "=== Top 20 largest files ===" >> "$SUMMARY_FILE"
# -k3,3 restricts the sort key to the size field only; bare -k3 would key
# on "field 3 through end of line".
grep "^FILE|" "$TREE_FILE" | sort -t'|' -k3,3 -rn | head -20 >> "$SUMMARY_FILE"

echo "" >> "$SUMMARY_FILE"
echo "=== Directory count by top-level ===" >> "$SUMMARY_FILE"
grep "^DIR|" "$TREE_FILE" | cut -d'|' -f2 | cut -d'/' -f2 | sort | uniq -c | sort -rn >> "$SUMMARY_FILE"

echo "" >> "$SUMMARY_FILE"
echo "=== File count by top-level ===" >> "$SUMMARY_FILE"
grep "^FILE|" "$TREE_FILE" | cut -d'|' -f2 | cut -d'/' -f2 | sort | uniq -c | sort -rn >> "$SUMMARY_FILE"

echo ""
echo "=== DONE ==="
echo "Tree: $TREE_FILE"
echo "Summary: $SUMMARY_FILE"
echo "Raw listings: $RAW_DIR/"
cat "$SUMMARY_FILE"