package engine

import (
	"strings"
	"sync"
	"time"
)

// acceptedContentTypes is the allowlist of media types that isValidContentType
// accepts for credential exposure checks. Entries are compared by exact string
// equality against a lowercased header value, so every entry must be lowercase
// and must not carry parameters such as "; charset=utf-8".
//
// Defined at package level so the slice is built once rather than on every
// call.
var acceptedContentTypes = []string{
	"text/plain",
	"application/json",
	"application/octet-stream",
	"application/zip",
	"application/sql",
	"application/xml",
	"text/xml",
	"application/x-sql",
}

// htmlIndicators lists the substrings that containsHTMLIndicators searches for
// to decide whether a response body is HTML. The body is lowercased before the
// search, so every entry must be lowercase.
//
// Matching is by plain substring. Most entries omit the closing ">" so that
// tags with attributes (e.g. `<html lang="en">`) still match; as a consequence
// "<meta" also matches longer tag names such as "<metadata". "<head>" keeps
// the ">", presumably to avoid matching "<header" (NOTE(review): confirm
// before changing).
//
// Defined at package level so the slice is built once rather than on every
// call.
var htmlIndicators = []string{
	"<!doctype",
	"<html",
	"<head>",
	"<body",
	"<meta",
	"<script",
}

// isValidContentType reports whether the HTTP response Content-Type header is
// acceptable for credential exposure checks.
//
// The header value is reduced to its bare media type: parameters such as
// "; charset=utf-8" are dropped, surrounding whitespace is trimmed, and the
// result is lowercased. The outcome is then decided as follows:
//
//   - an empty header ("") returns true, since some servers omit Content-Type;
//   - "text/html" returns false (typically an error page);
//   - a media type listed in acceptedContentTypes returns true;
//   - everything else (images, video, audio, ...) returns false.
//
// path is currently unused; it stays in the signature so existing call sites
// keep compiling.
func isValidContentType(path, contentType string) bool {
	// Empty content-type is valid (some servers omit it)
	if contentType == "" {
		return true
	}

	// Keep only the media type. Trimming happens unconditionally so that a
	// padded value without parameters (e.g. "application/json ") is treated
	// the same as one with parameters (e.g. "application/json ; charset=x").
	mediaType, _, _ := strings.Cut(contentType, ";")
	mediaType = strings.ToLower(strings.TrimSpace(mediaType))

	// Reject HTML explicitly so it stays rejected even if the allowlist grows.
	if mediaType == "text/html" {
		return false
	}

	// Accept only allowlisted text and binary formats
	for _, accepted := range acceptedContentTypes {
		if mediaType == accepted {
			return true
		}
	}

	// Reject everything else (images, videos, audio, etc.)
	return false
}

// containsHTMLIndicators reports whether body looks like an HTML document,
// which usually means an error page rather than exposed credentials.
//
// The body is lowercased once and then searched for each entry of
// htmlIndicators, which makes the check case-insensitive. Matching is by
// plain substring, and the scan stops at the first indicator found.
func containsHTMLIndicators(body string) bool {
	lowered := strings.ToLower(body)

	found := false
	for i := 0; i < len(htmlIndicators) && !found; i++ {
		found = strings.Contains(lowered, htmlIndicators[i])
	}
	return found
}

// matchPatternsContextAware performs context-aware pattern matching to reduce
// false positives and returns the exposure types detected in body.
//
// ".env-exposed" is reported when envKeyValuePattern matches at least three
// times in body, so a single stray KEY=VALUE-looking fragment (for example in
// an HTML form) is not enough. ".git-exposed" is reported when
// gitConfigPattern matches at least once. Both patterns are declared elsewhere
// in the package; they are expected to match KEY=VALUE lines and git-specific
// INI section headers respectively (NOTE(review): confirm against their
// definitions).
//
// The result is nil when neither check fires. When both fire, ".env-exposed"
// precedes ".git-exposed".
func matchPatternsContextAware(body string) []string {
	// minEnvMatches is how many KEY=VALUE matches are needed before the body
	// is treated as an exposed .env file.
	const minEnvMatches = 3

	var matches []string

	// Only the threshold matters, so cap the search at minEnvMatches instead
	// of collecting every match in a potentially large response body.
	if len(envKeyValuePattern.FindAllString(body, minEnvMatches)) >= minEnvMatches {
		matches = append(matches, ".env-exposed")
	}

	// Check for git config exposure: require git-specific INI sections
	if gitConfigPattern.MatchString(body) {
		matches = append(matches, ".git-exposed")
	}

	return matches
}

// baseline holds soft 404 detection baseline data for a domain.
type baseline struct {
	contentLength int       // length of the baseline response body; 0 means "no baseline"
	fetchedAt     time.Time // presumably when the baseline was captured; not read in this block
}

// baselineCache stores soft 404 baselines (homepage content) keyed by domain.
//
// All access goes through get and set, which take the RWMutex, so a single
// cache may be shared between goroutines. Because it embeds a mutex, a
// baselineCache must not be copied after first use; pass *baselineCache.
//
// The zero value is usable: get reports a miss and set allocates the map on
// first write.
type baselineCache struct {
	mu    sync.RWMutex        // guards cache
	cache map[string]baseline // domain -> baseline
}

// newBaselineCache creates an empty baseline cache.
func newBaselineCache() *baselineCache {
	return &baselineCache{
		cache: make(map[string]baseline),
	}
}

// get retrieves the baseline stored for the given domain.
// Returns (baseline, true) if found, (zero-value, false) if not.
func (bc *baselineCache) get(domain string) (baseline, bool) {
	bc.mu.RLock()
	defer bc.mu.RUnlock()
	// Reading from a nil map is legal in Go and simply reports a miss.
	b, ok := bc.cache[domain]
	return b, ok
}

// set stores a baseline for the given domain, replacing any previous entry.
func (bc *baselineCache) set(domain string, b baseline) {
	bc.mu.Lock()
	defer bc.mu.Unlock()
	// Writing to a nil map panics, so allocate lazily for caches that were
	// not built with newBaselineCache.
	if bc.cache == nil {
		bc.cache = make(map[string]baseline)
	}
	bc.cache[domain] = b
}

// isSoft404 reports whether a response is likely a soft 404, judged by how
// close its body length is to the baseline length recorded for the domain.
//
// Returns true when the absolute difference between bodyLen and
// base.contentLength is at most 5% of base.contentLength (integer division,
// rounded down). For baselines shorter than 20 the threshold is therefore 0
// and only an exact length match counts.
//
// Fails open (returns false) when no usable baseline is available, i.e. when
// base.contentLength is zero or negative.
func isSoft404(bodyLen int, base baseline) bool {
	// No usable baseline - fail open (assume valid response). A negative
	// length carries no size information, so it is treated like a missing one.
	if base.contentLength <= 0 {
		return false
	}

	// Calculate absolute difference
	diff := bodyLen - base.contentLength
	if diff < 0 {
		diff = -diff
	}

	// 5% of the baseline. Dividing by 20 gives exactly the same value as
	// contentLength*5/100 but cannot overflow int on 32-bit platforms.
	threshold := base.contentLength / 20
	return diff <= threshold
}

// matchSpecificPatterns returns the entries of CredentialPatterns that occur
// in body immediately followed by "=", compared case-insensitively (both the
// body and each pattern are uppercased first). Matched entries are returned
// with their original spelling, in iteration order; the result is nil when
// nothing matches.
//
// Generic words (PASSWORD, PASSWD, SECRET, CREDENTIAL, TOKEN) and the git
// indicators ([CORE], REPOSITORYFORMATVERSION) are never reported here; they
// are left to matchPatternsContextAware.
func matchSpecificPatterns(body string) []string {
	upperBody := strings.ToUpper(body)

	var hits []string
	for _, candidate := range CredentialPatterns {
		key := strings.ToUpper(candidate)

		// Too generic to be meaningful on their own, or git markers.
		switch key {
		case "PASSWORD", "PASSWD", "SECRET", "CREDENTIAL", "TOKEN",
			"[CORE]", "REPOSITORYFORMATVERSION":
			continue
		}

		// Require the KEY= form so prose mentions do not count.
		if strings.Contains(upperBody, key+"=") {
			hits = append(hits, candidate)
		}
	}

	return hits
}
