majordomo/health/health.go

// Package health tracks per-target model health for failover decisions.
//
// Why: a failover chain must skip targets that are repeatedly failing
// ("backed off") and re-admit them after a cooldown, without any persistent
// state or background goroutines. The tracker is in-memory, process-local,
// thread-safe, and clock-injected so backoff is unit-testable.
//
// Semantics (see ADR-0006):
//   - One transient failure increments a consecutive-failure count.
//   - Reaching the failure threshold (default 2) backs the target off until
//     now + cooldown. Cooldown grows exponentially per consecutive backoff
//     (default base 5s, x2 each time, capped at 5m).
//   - Any success fully resets the target: failure count and backoff
//     history both clear.
package health

import (
	"slices"
	"strings"
	"sync"
	"time"
)

// Default configuration values. Config.withDefaults substitutes them for
// fields that are left unset or out of range.
const (
	DefaultFailureThreshold = 2               // consecutive transient failures before a backoff
	DefaultBaseCooldown     = 5 * time.Second // duration of the first backoff
	DefaultMaxCooldown      = 5 * time.Minute // ceiling for the growing cooldown
	DefaultMultiplier       = 2.0             // growth factor per consecutive backoff
)

// Clock supplies the current time. It is a plain function type so that tests
// can inject a deterministic time source; Config falls back to time.Now when
// none is given.
type Clock func() time.Time

// Config tunes the tracker. Zero values select the package defaults
// (DefaultFailureThreshold, DefaultBaseCooldown, DefaultMaxCooldown,
// DefaultMultiplier and time.Now); each field notes its exact cut-off.
type Config struct {
	// FailureThreshold is the number of consecutive transient failures that
	// triggers a backoff. Values <= 0 select DefaultFailureThreshold.
	FailureThreshold int
	// BaseCooldown is the first backoff duration. Values <= 0 select
	// DefaultBaseCooldown.
	BaseCooldown time.Duration
	// MaxCooldown caps the exponential growth. Values <= 0 select
	// DefaultMaxCooldown.
	MaxCooldown time.Duration
	// Multiplier scales the cooldown per consecutive backoff. Values <= 1
	// select DefaultMultiplier; a multiplier of exactly 1 is therefore
	// replaced as well.
	Multiplier float64
	// Clock supplies the current time (defaults to time.Now).
	Clock Clock
}

// withDefaults returns a copy of c in which every unset or out-of-range
// field has been replaced by its package default. The receiver is a value,
// so the caller's Config is left untouched.
//
// Cut-offs: FailureThreshold, BaseCooldown and MaxCooldown are replaced when
// they are <= 0; Multiplier is replaced unless it is strictly greater than 1
// (this also rejects NaN); a nil Clock becomes time.Now.
func (c Config) withDefaults() Config {
	if c.FailureThreshold <= 0 {
		c.FailureThreshold = DefaultFailureThreshold
	}
	if c.BaseCooldown <= 0 {
		c.BaseCooldown = DefaultBaseCooldown
	}
	if c.MaxCooldown <= 0 {
		c.MaxCooldown = DefaultMaxCooldown
	}
	// Deliberately !(x > 1) rather than x <= 1: every ordered comparison with
	// NaN is false, so the "<=" form would let a NaN multiplier through and
	// every cooldown derived from it would be garbage.
	if !(c.Multiplier > 1) {
		c.Multiplier = DefaultMultiplier
	}
	if c.Clock == nil {
		c.Clock = time.Now
	}
	return c
}

// Tracker records per-key health. Keys are opaque; majordomo uses
// "provider/model-id".
//
// Tracker is an interface-free concrete type on purpose: consumers that want
// persistence can wrap it behind their own interface; majordomo itself stays
// in-memory (ADR-0006).
type Tracker struct {
	mu      sync.Mutex        // guards entries; methods that read or write the map lock it first
	cfg     Config            // configuration after withDefaults, as stored by NewTracker
	entries map[string]*entry // per-key health records; a missing key has no recorded state
}

// entry is the mutable health record the Tracker keeps for one key.
type entry struct {
	// consecutiveFailures counts transient failures since the last success
	// or backoff trigger.
	consecutiveFailures int
	// backoffs counts consecutive backoff rounds since the last success;
	// it drives the exponential cooldown.
	backoffs int
	// until is the moment the current backoff expires (zero = not backed off).
	// Bench writes this field too, so it may hold a manual bench deadline.
	until time.Time
}

// NewTracker builds an empty Tracker. Unset or out-of-range fields in cfg
// are replaced by the package defaults before the configuration is stored.
func NewTracker(cfg Config) *Tracker {
	tracker := &Tracker{
		entries: make(map[string]*entry),
	}
	tracker.cfg = cfg.withDefaults()
	return tracker
}

// Available reports whether key may be used right now. Keys the tracker has
// never seen are available; a tracked key is available once the clock has
// reached the end of its backoff window.
func (t *Tracker) Available(key string) bool {
	t.mu.Lock()
	defer t.mu.Unlock()

	if rec, tracked := t.entries[key]; tracked {
		// The window is half-open: at the exact expiry instant the key is
		// usable again.
		return !t.cfg.Clock().Before(rec.until)
	}
	return true
}

// ReportSuccess records a successful call for key. The key's entry is
// dropped outright, which clears its failure count and backoff history
// together with any bench window that was still pending.
func (t *Tracker) ReportSuccess(key string) {
	t.mu.Lock()
	// delete cannot panic (it is a no-op for a missing key), so the unlock
	// does not need to be deferred.
	delete(t.entries, key)
	t.mu.Unlock()
}

// ReportFailure notes one transient failure for key and reports whether that
// failure pushed the key into a backoff.
//
// A backoff starts when the run of consecutive failures reaches the
// configured threshold. At that point the window is set to now plus the
// cooldown for the current backoff round, the round counter advances, and
// the failure run restarts from zero, so the next (longer) backoff needs a
// fresh run of failures.
func (t *Tracker) ReportFailure(key string) (backedOff bool) {
	t.mu.Lock()
	defer t.mu.Unlock()

	rec, tracked := t.entries[key]
	if !tracked {
		rec = new(entry)
		t.entries[key] = rec
	}

	rec.consecutiveFailures++
	if rec.consecutiveFailures >= t.cfg.FailureThreshold {
		// The cooldown is derived from the rounds completed so far, i.e.
		// before this round is counted.
		pause := t.cooldownFor(rec.backoffs)
		rec.until = t.cfg.Clock().Add(pause)
		rec.backoffs++
		rec.consecutiveFailures = 0
		return true
	}
	return false
}

// Bench puts key on the bench until the given time, whatever its failure
// history (ops surfaces: ".failover bench"). Passing the zero time lifts the
// bench window. A key that was not tracked yet gets a fresh entry.
func (t *Tracker) Bench(key string, until time.Time) {
	t.mu.Lock()
	defer t.mu.Unlock()

	if rec, tracked := t.entries[key]; tracked {
		// Only the window moves; failure count and backoff history stay as
		// they were.
		rec.until = until
		return
	}
	t.entries[key] = &entry{until: until}
}

// Unbench forgets key completely: its bench window, failure count and
// backoff history are all discarded with the entry. Unknown keys are ignored.
func (t *Tracker) Unbench(key string) {
	t.mu.Lock()
	// delete cannot panic, so a plain unlock after it is sufficient.
	delete(t.entries, key)
	t.mu.Unlock()
}

// Status describes one tracked key (diagnostics). Snapshot returns one
// Status per key that is benched at the time of the call.
type Status struct {
	// Key is the tracker key this record belongs to.
	Key string
	// Until is the end of the current bench window (zero = not benched).
	Until time.Time
	// ConsecutiveFailures since the last success or bench trigger.
	ConsecutiveFailures int
	// Benches is the consecutive backoff count driving the exponent.
	Benches int
}

// Snapshot lists every key whose bench window is still open according to the
// tracker's clock, ordered by key. The clock is read once, so all keys are
// judged against the same instant. The result is never nil; it is empty when
// nothing is benched.
func (t *Tracker) Snapshot() []Status {
	t.mu.Lock()
	defer t.mu.Unlock()

	now := t.cfg.Clock()
	benched := make([]Status, 0, len(t.entries))
	for key, rec := range t.entries {
		if !now.Before(rec.until) {
			// Window already over (or never set): not benched.
			continue
		}
		benched = append(benched, Status{
			Key:                 key,
			Until:               rec.until,
			ConsecutiveFailures: rec.consecutiveFailures,
			Benches:             rec.backoffs,
		})
	}

	// Map iteration order is random; sort for a stable report.
	byKey := func(x, y Status) int { return strings.Compare(x.Key, y.Key) }
	slices.SortFunc(benched, byKey)
	return benched
}

// BackedOffUntil tells when key's current backoff window ends. It returns
// the zero time when the key is unknown or its window has already elapsed,
// which makes it convenient for diagnostics and error messages.
func (t *Tracker) BackedOffUntil(key string) time.Time {
	t.mu.Lock()
	defer t.mu.Unlock()

	rec, tracked := t.entries[key]
	if tracked && t.cfg.Clock().Before(rec.until) {
		return rec.until
	}
	return time.Time{}
}

// cooldownFor computes the cooldown for the n-th consecutive backoff
// (0-based): base * multiplier^n, capped at MaxCooldown. A non-positive n
// yields the base cooldown, still subject to the cap.
//
// The cap is checked on the float64 value before it is converted to a
// time.Duration. Converting a float that does not fit in int64, or NaN, gives
// an implementation-dependent result in Go and may come out negative; a
// negative cooldown would place the backoff deadline in the past and leave a
// failing target admitted. Comparing first keeps the result within
// MaxCooldown even for a very large Multiplier or MaxCooldown.
func (t *Tracker) cooldownFor(n int) time.Duration {
	limit := float64(t.cfg.MaxCooldown)
	d := float64(t.cfg.BaseCooldown)
	for range n {
		d *= t.cfg.Multiplier
		// !(d < limit) instead of d >= limit so that NaN is capped as well.
		if !(d < limit) {
			return t.cfg.MaxCooldown
		}
	}
	// Reached when n <= 0, or when the cap was never hit; the check covers a
	// base cooldown that is itself at or above the cap.
	if !(d < limit) {
		return t.cfg.MaxCooldown
	}
	return time.Duration(d)
}