Files
majordomo/health/health.go
T
steve dcd004289f feat: foundations — canonical types, Parse grammar, env DSNs, health, chains
Phase 1 of the majordomo build:
- llm/ canonical contract (messages, parts, tools, capabilities, streaming,
  Model/Provider, error classification)
- health/ clock-injected tracker (threshold bench, exponential capped
  cooldown, reset-on-success)
- root Registry + Parse (verbatim model ids, inline recursive alias
  expansion with cycle detection, chain dedup), LLM_* env-DSN providers
  (go-llm parity: lazy fallback + eager LoadEnv), health-aware chain
  executor behind the Model interface
- provider/fake scriptable test provider; hermetic test suite incl. the
  trailing-thinking chain and foreman:// env loading
- ADRs 0001-0008, CLAUDE.md, README (honest matrix), CI workflow,
  docs/phase-1-design.md

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 12:35:34 +02:00

164 lines
4.7 KiB
Go

// Package health tracks per-target model health for failover decisions.
//
// Why: a failover chain must skip targets that are repeatedly failing
// ("backed off") and re-admit them after a cooldown, without any persistent
// state or background goroutines. The tracker is in-memory, process-local,
// thread-safe, and clock-injected so backoff is unit-testable.
//
// Semantics (see ADR-0006):
//   - Each transient failure increments a consecutive-failure count.
//   - Reaching the failure threshold (default 2) backs the target off until
//     now + cooldown. Cooldown grows exponentially per consecutive backoff
//     (default base 5s, x2 each time, capped at 5m).
//   - Any success fully resets the target: failure count and backoff
//     history both clear.
package health
import (
"sync"
"time"
)
// Default configuration values. Config.withDefaults substitutes each one when
// the corresponding Config field is unset or out of range.
const (
// DefaultFailureThreshold is the default number of consecutive transient
// failures that triggers a backoff.
DefaultFailureThreshold = 2
// DefaultBaseCooldown is the default duration of the first backoff.
DefaultBaseCooldown = 5 * time.Second
// DefaultMaxCooldown is the default cap on the growing cooldown.
DefaultMaxCooldown = 5 * time.Minute
// DefaultMultiplier is the default factor applied to the cooldown for each
// further consecutive backoff.
DefaultMultiplier = 2.0
)
// Clock supplies the current time. The tracker calls it whenever it needs
// "now", so tests can inject a deterministic clock; Config.withDefaults
// substitutes time.Now when no Clock is set.
type Clock func() time.Time
// Config tunes the tracker. Zero values select the defaults above; negative
// values, and a Multiplier of 1 or less, are likewise replaced by the default.
type Config struct {
// FailureThreshold is the number of consecutive transient failures that
// triggers a backoff. Values <= 0 select DefaultFailureThreshold.
FailureThreshold int
// BaseCooldown is the first backoff duration. Values <= 0 select
// DefaultBaseCooldown.
BaseCooldown time.Duration
// MaxCooldown caps the exponential growth. Values <= 0 select
// DefaultMaxCooldown.
MaxCooldown time.Duration
// Multiplier scales the cooldown per consecutive backoff. Values <= 1
// select DefaultMultiplier.
Multiplier float64
// Clock supplies the current time (defaults to time.Now).
Clock Clock
}
// withDefaults returns a copy of c in which every unset or out-of-range field
// is replaced by its package default: a non-positive FailureThreshold,
// BaseCooldown or MaxCooldown, a Multiplier of 1 or less, and a nil Clock.
// The receiver is a value, so the caller's Config is not modified.
func (c Config) withDefaults() Config {
	resolved := c

	if resolved.Clock == nil {
		resolved.Clock = time.Now
	}
	if resolved.Multiplier <= 1 {
		resolved.Multiplier = DefaultMultiplier
	}
	if resolved.MaxCooldown <= 0 {
		resolved.MaxCooldown = DefaultMaxCooldown
	}
	if resolved.BaseCooldown <= 0 {
		resolved.BaseCooldown = DefaultBaseCooldown
	}
	if resolved.FailureThreshold <= 0 {
		resolved.FailureThreshold = DefaultFailureThreshold
	}

	return resolved
}
// Tracker records per-key health. Keys are opaque; majordomo uses
// "provider/model-id".
//
// Tracker is an interface-free concrete type on purpose: consumers that want
// persistence can wrap it behind their own interface; majordomo itself stays
// in-memory (ADR-0006).
//
// Construct a Tracker with NewTracker, which applies the configuration
// defaults and allocates the entries map.
type Tracker struct {
// mu is held by every method while it reads or writes entries.
mu sync.Mutex
// cfg is the configuration with defaults applied by NewTracker.
cfg Config
// entries holds the state of each key that has reported a failure since
// its last success; a key with no entry is treated as available.
entries map[string]*entry
}
// entry is the health state the Tracker keeps for a single key.
type entry struct {
// consecutiveFailures counts transient failures since the last success
// or backoff trigger.
consecutiveFailures int
// backoffs counts consecutive backoff rounds since the last success;
// it drives the exponential cooldown.
backoffs int
// until is the moment the current backoff expires (zero = not backed off).
until time.Time
}
// NewTracker returns an empty Tracker configured by cfg. Fields of cfg that
// are unset or out of range are replaced by the package defaults first.
func NewTracker(cfg Config) *Tracker {
	t := &Tracker{entries: map[string]*entry{}}
	t.cfg = cfg.withDefaults()
	return t
}
// Available reports whether key may be used right now. A key with no recorded
// state is always available; otherwise it is available once the clock has
// reached the end of its backoff window (a key that has failed but was never
// benched has a zero window and is therefore available).
func (t *Tracker) Available(key string) bool {
	t.mu.Lock()
	defer t.mu.Unlock()

	if rec, tracked := t.entries[key]; tracked {
		now := t.cfg.Clock()
		return !now.Before(rec.until)
	}
	return true
}
// ReportSuccess clears everything recorded for key: its failure count, its
// backoff history and any active backoff window. It does so by dropping the
// key's entry, so the key is subsequently treated as never having failed.
func (t *Tracker) ReportSuccess(key string) {
	t.mu.Lock()
	delete(t.entries, key)
	t.mu.Unlock()
}
// ReportFailure notes one transient failure for key and reports whether that
// failure tipped the key into a backoff.
//
// Below the configured threshold the failure is only counted and the result is
// false. On reaching the threshold the key is benched until now plus the
// cooldown for its current backoff round, the round counter advances so the
// next bench lasts longer, and the failure count restarts from zero so that a
// fresh run of failures is needed to trigger that next bench.
func (t *Tracker) ReportFailure(key string) (backedOff bool) {
	t.mu.Lock()
	defer t.mu.Unlock()

	rec := t.entries[key]
	if rec == nil {
		// First failure seen for this key since its last success.
		rec = new(entry)
		t.entries[key] = rec
	}

	rec.consecutiveFailures++
	if rec.consecutiveFailures >= t.cfg.FailureThreshold {
		pause := t.cooldownFor(rec.backoffs)
		rec.until = t.cfg.Clock().Add(pause)
		rec.backoffs++
		rec.consecutiveFailures = 0
		return true
	}
	return false
}
// BackedOffUntil reports when key's current backoff window ends. It returns
// the zero time.Time when the key has no recorded state or its window has
// already expired. Intended for diagnostics and error messages.
func (t *Tracker) BackedOffUntil(key string) time.Time {
	t.mu.Lock()
	defer t.mu.Unlock()

	rec, tracked := t.entries[key]
	if tracked && t.cfg.Clock().Before(rec.until) {
		return rec.until
	}
	return time.Time{}
}
// cooldownFor computes the cooldown for the n-th consecutive backoff
// (0-based): BaseCooldown * Multiplier^n, capped at MaxCooldown.
//
// The cap is checked in float64 before converting to time.Duration. Converting
// a float64 that does not fit in int64 is implementation-dependent in Go, so
// checking after the conversion can miss the cap on some platforms and yield a
// negative cooldown, which would leave the key un-benched.
func (t *Tracker) cooldownFor(n int) time.Duration {
	limit := float64(t.cfg.MaxCooldown)
	d := float64(t.cfg.BaseCooldown)
	for range n {
		// Written as !(d < limit) rather than d >= limit so that a NaN
		// product is also treated as "at the cap". Stopping here keeps the
		// loop short no matter how large n has grown.
		if !(d < limit) {
			break
		}
		d *= t.cfg.Multiplier
	}
	if !(d < limit) {
		return t.cfg.MaxCooldown
	}
	// d is below float64(MaxCooldown), so the conversion is in range.
	return time.Duration(d)
}