// Package health tracks per-target model health for failover decisions. // // Why: a failover chain must skip targets that are repeatedly failing // ("backed off") and re-admit them after a cooldown, without any persistent // state or background goroutines. The tracker is in-memory, process-local, // thread-safe, and clock-injected so backoff is unit-testable. // // Semantics (see ADR-0006): // - One transient failure increments a consecutive-failure count. // - Reaching the failure threshold (default 2) backs the target off until // now + cooldown. Cooldown grows exponentially per consecutive backoff // (default base 5s, x2 each time, capped at 5m). // - Any success fully resets the target: failure count and backoff // history both clear. package health import ( "sync" "time" ) // Default configuration values. const ( DefaultFailureThreshold = 2 DefaultBaseCooldown = 5 * time.Second DefaultMaxCooldown = 5 * time.Minute DefaultMultiplier = 2.0 ) // Clock supplies the current time; injected for tests. type Clock func() time.Time // Config tunes the tracker. Zero values select the defaults above. type Config struct { // FailureThreshold is the number of consecutive transient failures that // triggers a backoff. FailureThreshold int // BaseCooldown is the first backoff duration. BaseCooldown time.Duration // MaxCooldown caps the exponential growth. MaxCooldown time.Duration // Multiplier scales the cooldown per consecutive backoff. Multiplier float64 // Clock supplies the current time (defaults to time.Now). Clock Clock } func (c Config) withDefaults() Config { if c.FailureThreshold <= 0 { c.FailureThreshold = DefaultFailureThreshold } if c.BaseCooldown <= 0 { c.BaseCooldown = DefaultBaseCooldown } if c.MaxCooldown <= 0 { c.MaxCooldown = DefaultMaxCooldown } if c.Multiplier <= 1 { c.Multiplier = DefaultMultiplier } if c.Clock == nil { c.Clock = time.Now } return c } // Tracker records per-key health. Keys are opaque; majordomo uses // "provider/model-id". // // Tracker is an interface-free concrete type on purpose: consumers that want // persistence can wrap it behind their own interface; majordomo itself stays // in-memory (ADR-0006). type Tracker struct { mu sync.Mutex cfg Config entries map[string]*entry } type entry struct { // consecutiveFailures counts transient failures since the last success // or backoff trigger. consecutiveFailures int // backoffs counts consecutive backoff rounds since the last success; // it drives the exponential cooldown. backoffs int // until is the moment the current backoff expires (zero = not backed off). until time.Time } // NewTracker creates a tracker with the given configuration. func NewTracker(cfg Config) *Tracker { return &Tracker{cfg: cfg.withDefaults(), entries: make(map[string]*entry)} } // Available reports whether the key is currently usable (not backed off). func (t *Tracker) Available(key string) bool { t.mu.Lock() defer t.mu.Unlock() e, ok := t.entries[key] if !ok { return true } return !t.cfg.Clock().Before(e.until) } // ReportSuccess resets the key's failure count and backoff history. func (t *Tracker) ReportSuccess(key string) { t.mu.Lock() defer t.mu.Unlock() delete(t.entries, key) } // ReportFailure records a transient failure. When the consecutive-failure // count reaches the threshold the key is backed off and the method reports // true; the count then resets so re-admission requires a fresh run of // failures to trigger the next (longer) backoff. func (t *Tracker) ReportFailure(key string) (backedOff bool) { t.mu.Lock() defer t.mu.Unlock() e, ok := t.entries[key] if !ok { e = &entry{} t.entries[key] = e } e.consecutiveFailures++ if e.consecutiveFailures < t.cfg.FailureThreshold { return false } cooldown := t.cooldownFor(e.backoffs) e.until = t.cfg.Clock().Add(cooldown) e.backoffs++ e.consecutiveFailures = 0 return true } // BackedOffUntil returns the end of the key's current backoff window, or the // zero time when the key is not backed off. Useful for diagnostics and error // messages. func (t *Tracker) BackedOffUntil(key string) time.Time { t.mu.Lock() defer t.mu.Unlock() e, ok := t.entries[key] if !ok || !t.cfg.Clock().Before(e.until) { return time.Time{} } return e.until } // cooldownFor computes the cooldown for the n-th consecutive backoff // (0-based): base * multiplier^n, capped at MaxCooldown. func (t *Tracker) cooldownFor(n int) time.Duration { d := float64(t.cfg.BaseCooldown) for range n { d *= t.cfg.Multiplier if time.Duration(d) >= t.cfg.MaxCooldown { return t.cfg.MaxCooldown } } if time.Duration(d) > t.cfg.MaxCooldown { return t.cfg.MaxCooldown } return time.Duration(d) }