feat: foundations — canonical types, Parse grammar, env DSNs, health, chains
Phase 1 of the majordomo build:
- llm/ canonical contract (messages, parts, tools, capabilities, streaming, Model/Provider, error classification)
- health/ clock-injected tracker (threshold bench, exponential capped cooldown, reset-on-success)
- root Registry + Parse (verbatim model ids, inline recursive alias expansion with cycle detection, chain dedup), LLM_* env-DSN providers (go-llm parity: lazy fallback + eager LoadEnv), health-aware chain executor behind the Model interface
- provider/fake scriptable test provider; hermetic test suite incl. the trailing-thinking chain and foreman:// env loading
- ADRs 0001-0008, CLAUDE.md, README (honest matrix), CI workflow, docs/phase-1-design.md

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,163 @@
|
||||
// Package health tracks per-target model health for failover decisions.
|
||||
//
|
||||
// Why: a failover chain must skip targets that are repeatedly failing
|
||||
// ("backed off") and re-admit them after a cooldown, without any persistent
|
||||
// state or background goroutines. The tracker is in-memory, process-local,
|
||||
// thread-safe, and clock-injected so backoff is unit-testable.
|
||||
//
|
||||
// Semantics (see ADR-0006):
|
||||
// - One transient failure increments a consecutive-failure count.
|
||||
// - Reaching the failure threshold (default 2) backs the target off until
|
||||
// now + cooldown. Cooldown grows exponentially per consecutive backoff
|
||||
// (default base 5s, x2 each time, capped at 5m).
|
||||
// - Any success fully resets the target: failure count and backoff
|
||||
// history both clear.
|
||||
package health
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Package defaults, substituted for Config fields that are left unset.
const (
	// DefaultFailureThreshold is the number of consecutive transient
	// failures that triggers a backoff.
	DefaultFailureThreshold = 2
	// DefaultBaseCooldown is the duration of the first backoff.
	DefaultBaseCooldown = 5 * time.Second
	// DefaultMaxCooldown is the ceiling for the growing cooldown.
	DefaultMaxCooldown = 5 * time.Minute
	// DefaultMultiplier is the factor the cooldown grows by per consecutive
	// backoff.
	DefaultMultiplier = 2.0
)
|
||||
|
||||
// Clock is a time source: each call returns the current instant. It exists so
// tests can inject a clock they control.
type Clock func() time.Time
|
||||
|
||||
// Config tunes the tracker. Zero values select the defaults above.
|
||||
type Config struct {
|
||||
// FailureThreshold is the number of consecutive transient failures that
|
||||
// triggers a backoff.
|
||||
FailureThreshold int
|
||||
// BaseCooldown is the first backoff duration.
|
||||
BaseCooldown time.Duration
|
||||
// MaxCooldown caps the exponential growth.
|
||||
MaxCooldown time.Duration
|
||||
// Multiplier scales the cooldown per consecutive backoff.
|
||||
Multiplier float64
|
||||
// Clock supplies the current time (defaults to time.Now).
|
||||
Clock Clock
|
||||
}
|
||||
|
||||
func (c Config) withDefaults() Config {
|
||||
if c.FailureThreshold <= 0 {
|
||||
c.FailureThreshold = DefaultFailureThreshold
|
||||
}
|
||||
if c.BaseCooldown <= 0 {
|
||||
c.BaseCooldown = DefaultBaseCooldown
|
||||
}
|
||||
if c.MaxCooldown <= 0 {
|
||||
c.MaxCooldown = DefaultMaxCooldown
|
||||
}
|
||||
if c.Multiplier <= 1 {
|
||||
c.Multiplier = DefaultMultiplier
|
||||
}
|
||||
if c.Clock == nil {
|
||||
c.Clock = time.Now
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Tracker records per-key health. Keys are opaque; majordomo uses
|
||||
// "provider/model-id".
|
||||
//
|
||||
// Tracker is an interface-free concrete type on purpose: consumers that want
|
||||
// persistence can wrap it behind their own interface; majordomo itself stays
|
||||
// in-memory (ADR-0006).
|
||||
type Tracker struct {
|
||||
mu sync.Mutex
|
||||
cfg Config
|
||||
entries map[string]*entry
|
||||
}
|
||||
|
||||
// entry is the health state recorded for a single key.
type entry struct {
	// consecutiveFailures is the number of transient failures seen since the
	// most recent success or backoff trigger.
	consecutiveFailures int
	// backoffs is the number of consecutive backoff rounds since the most
	// recent success; it is the exponent of the cooldown growth.
	backoffs int
	// until is when the current backoff ends; the zero time means the key
	// is not backed off.
	until time.Time
}
|
||||
|
||||
// NewTracker creates a tracker with the given configuration.
|
||||
func NewTracker(cfg Config) *Tracker {
|
||||
return &Tracker{cfg: cfg.withDefaults(), entries: make(map[string]*entry)}
|
||||
}
|
||||
|
||||
// Available reports whether the key is currently usable (not backed off).
|
||||
func (t *Tracker) Available(key string) bool {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
e, ok := t.entries[key]
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
return !t.cfg.Clock().Before(e.until)
|
||||
}
|
||||
|
||||
// ReportSuccess resets the key's failure count and backoff history.
|
||||
func (t *Tracker) ReportSuccess(key string) {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
delete(t.entries, key)
|
||||
}
|
||||
|
||||
// ReportFailure records a transient failure. When the consecutive-failure
|
||||
// count reaches the threshold the key is backed off and the method reports
|
||||
// true; the count then resets so re-admission requires a fresh run of
|
||||
// failures to trigger the next (longer) backoff.
|
||||
func (t *Tracker) ReportFailure(key string) (backedOff bool) {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
e, ok := t.entries[key]
|
||||
if !ok {
|
||||
e = &entry{}
|
||||
t.entries[key] = e
|
||||
}
|
||||
e.consecutiveFailures++
|
||||
if e.consecutiveFailures < t.cfg.FailureThreshold {
|
||||
return false
|
||||
}
|
||||
cooldown := t.cooldownFor(e.backoffs)
|
||||
e.until = t.cfg.Clock().Add(cooldown)
|
||||
e.backoffs++
|
||||
e.consecutiveFailures = 0
|
||||
return true
|
||||
}
|
||||
|
||||
// BackedOffUntil returns the end of the key's current backoff window, or the
|
||||
// zero time when the key is not backed off. Useful for diagnostics and error
|
||||
// messages.
|
||||
func (t *Tracker) BackedOffUntil(key string) time.Time {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
e, ok := t.entries[key]
|
||||
if !ok || !t.cfg.Clock().Before(e.until) {
|
||||
return time.Time{}
|
||||
}
|
||||
return e.until
|
||||
}
|
||||
|
||||
// cooldownFor computes the cooldown for the n-th consecutive backoff
|
||||
// (0-based): base * multiplier^n, capped at MaxCooldown.
|
||||
func (t *Tracker) cooldownFor(n int) time.Duration {
|
||||
d := float64(t.cfg.BaseCooldown)
|
||||
for range n {
|
||||
d *= t.cfg.Multiplier
|
||||
if time.Duration(d) >= t.cfg.MaxCooldown {
|
||||
return t.cfg.MaxCooldown
|
||||
}
|
||||
}
|
||||
if time.Duration(d) > t.cfg.MaxCooldown {
|
||||
return t.cfg.MaxCooldown
|
||||
}
|
||||
return time.Duration(d)
|
||||
}
|
||||
@@ -0,0 +1,165 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// fakeClock is a test clock whose time changes only when Advance is called,
// which makes backoff windows deterministic. Its state is mutex-guarded.
type fakeClock struct {
	mu  sync.Mutex
	now time.Time
}

// newFakeClock returns a clock frozen at 2026-06-10 12:00:00 UTC.
func newFakeClock() *fakeClock {
	start := time.Date(2026, time.June, 10, 12, 0, 0, 0, time.UTC)
	return &fakeClock{now: start}
}

// Now returns the instant the clock currently shows.
func (c *fakeClock) Now() time.Time {
	c.mu.Lock()
	current := c.now
	c.mu.Unlock()
	return current
}

// Advance shifts the clock by d.
func (c *fakeClock) Advance(d time.Duration) {
	c.mu.Lock()
	c.now = c.now.Add(d)
	c.mu.Unlock()
}
|
||||
|
||||
func newTestTracker(clock *fakeClock) *Tracker {
|
||||
return NewTracker(Config{
|
||||
FailureThreshold: 2,
|
||||
BaseCooldown: 5 * time.Second,
|
||||
MaxCooldown: 5 * time.Minute,
|
||||
Multiplier: 2,
|
||||
Clock: clock.Now,
|
||||
})
|
||||
}
|
||||
|
||||
func TestSingleFailureStaysAvailable(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
if backedOff := tr.ReportFailure("k"); backedOff {
|
||||
t.Error("first failure must not back off")
|
||||
}
|
||||
if !tr.Available("k") {
|
||||
t.Error("key should remain available after one failure")
|
||||
}
|
||||
}
|
||||
|
||||
func TestThresholdTriggersBackoff(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
tr.ReportFailure("k")
|
||||
if backedOff := tr.ReportFailure("k"); !backedOff {
|
||||
t.Error("second consecutive failure should back off")
|
||||
}
|
||||
if tr.Available("k") {
|
||||
t.Error("key should be unavailable during backoff")
|
||||
}
|
||||
if until := tr.BackedOffUntil("k"); !until.Equal(clock.Now().Add(5 * time.Second)) {
|
||||
t.Errorf("BackedOffUntil = %v, want now+5s", until)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCooldownExpiryReadmits(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
tr.ReportFailure("k")
|
||||
tr.ReportFailure("k")
|
||||
clock.Advance(5*time.Second - time.Millisecond)
|
||||
if tr.Available("k") {
|
||||
t.Error("still inside cooldown")
|
||||
}
|
||||
clock.Advance(time.Millisecond)
|
||||
if !tr.Available("k") {
|
||||
t.Error("cooldown expiry should re-admit the key")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExponentialCooldownWithCap(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
|
||||
// Consecutive backoffs: 5s, 10s, 20s, ... capped at 5m.
|
||||
wantCooldowns := []time.Duration{
|
||||
5 * time.Second, 10 * time.Second, 20 * time.Second, 40 * time.Second,
|
||||
80 * time.Second, 160 * time.Second, 5 * time.Minute, 5 * time.Minute,
|
||||
}
|
||||
for i, want := range wantCooldowns {
|
||||
tr.ReportFailure("k")
|
||||
tr.ReportFailure("k")
|
||||
until := tr.BackedOffUntil("k")
|
||||
if got := until.Sub(clock.Now()); got != want {
|
||||
t.Fatalf("backoff #%d cooldown = %v, want %v", i+1, got, want)
|
||||
}
|
||||
clock.Advance(want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSuccessResetsEverything(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
|
||||
// Build up to a long cooldown...
|
||||
for range 3 {
|
||||
tr.ReportFailure("k")
|
||||
tr.ReportFailure("k")
|
||||
clock.Advance(tr.BackedOffUntil("k").Sub(clock.Now()))
|
||||
}
|
||||
// ...then a success resets both the count and the exponent.
|
||||
tr.ReportSuccess("k")
|
||||
tr.ReportFailure("k")
|
||||
if !tr.Available("k") {
|
||||
t.Error("one failure after success must not back off")
|
||||
}
|
||||
tr.ReportFailure("k")
|
||||
if got := tr.BackedOffUntil("k").Sub(clock.Now()); got != 5*time.Second {
|
||||
t.Errorf("post-reset cooldown = %v, want base 5s", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestKeysAreIndependent(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
tr.ReportFailure("a")
|
||||
tr.ReportFailure("a")
|
||||
if tr.Available("a") {
|
||||
t.Error("a should be backed off")
|
||||
}
|
||||
if !tr.Available("b") {
|
||||
t.Error("b must be unaffected")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultsApplied(t *testing.T) {
|
||||
tr := NewTracker(Config{})
|
||||
if tr.cfg.FailureThreshold != DefaultFailureThreshold ||
|
||||
tr.cfg.BaseCooldown != DefaultBaseCooldown ||
|
||||
tr.cfg.MaxCooldown != DefaultMaxCooldown ||
|
||||
tr.cfg.Multiplier != DefaultMultiplier ||
|
||||
tr.cfg.Clock == nil {
|
||||
t.Errorf("defaults not applied: %+v", tr.cfg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTrackerConcurrency(t *testing.T) {
|
||||
clock := newFakeClock()
|
||||
tr := newTestTracker(clock)
|
||||
var wg sync.WaitGroup
|
||||
for i := range 8 {
|
||||
wg.Add(1)
|
||||
go func(n int) {
|
||||
defer wg.Done()
|
||||
key := []string{"a", "b"}[n%2]
|
||||
for range 200 {
|
||||
tr.ReportFailure(key)
|
||||
tr.Available(key)
|
||||
tr.ReportSuccess(key)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
Reference in New Issue
Block a user