feat: foundations — canonical types, Parse grammar, env DSNs, health, chains

Phase 1 of the majordomo build: - llm/ canonical contract (messages, parts, tools, capabilities, streaming, Model/Provider, error classification) - health/ clock-injected tracker (threshold bench, exponential capped cooldown, reset-on-success) - root Registry + Parse (verbatim model ids, inline recursive alias expansion with cycle detection, chain dedup), LLM_* env-DSN providers (go-llm parity: lazy fallback + eager LoadEnv), health-aware chain executor behind the Model interface - provider/fake scriptable test provider; hermetic test suite incl. the trailing-thinking chain and foreman:// env loading - ADRs 0001-0008, CLAUDE.md, README (honest matrix), CI workflow, docs/phase-1-design.md Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 12:35:23 +02:00
parent 3025044817
commit dcd004289f
42 changed files with 3863 additions and 0 deletions
@@ -0,0 +1,163 @@
+// Package health tracks per-target model health for failover decisions.
+//
+// Why: a failover chain must skip targets that are repeatedly failing
+// ("backed off") and re-admit them after a cooldown, without any persistent
+// state or background goroutines. The tracker is in-memory, process-local,
+// thread-safe, and clock-injected so backoff is unit-testable.
+//
+// Semantics (see ADR-0006):
+//   - Each transient failure increments the key's consecutive-failure count.
+//   - Reaching the failure threshold (default 2) backs the target off until
+//     now + cooldown. Cooldown grows exponentially per consecutive backoff
+//     (default base 5s, x2 each time, capped at 5m).
+//   - Any success fully resets the target: failure count and backoff
+//     history both clear.
+package health
+
+import (
+	"sync"
+	"time"
+)
+
+// Default configuration values. Config.withDefaults substitutes these for
+// fields that are unset or out of range: a threshold of 2 consecutive
+// failures, a first cooldown of 5s, a cooldown ceiling of 5m, and a growth
+// factor of 2 per consecutive backoff.
+const (
+	DefaultFailureThreshold = 2
+	DefaultBaseCooldown     = 5 * time.Second
+	DefaultMaxCooldown      = 5 * time.Minute
+	DefaultMultiplier       = 2.0
+)
+
+// Clock supplies the current time. It is injected through Config.Clock so
+// tests can drive backoff expiry deterministically; when left nil the
+// tracker falls back to time.Now.
+type Clock func() time.Time
+
+// Config tunes the tracker. Unset or out-of-range fields are replaced with
+// the Default* constants when the Config is handed to NewTracker.
+type Config struct {
+	// FailureThreshold is the number of consecutive transient failures that
+	// triggers a backoff. Values <= 0 select DefaultFailureThreshold.
+	FailureThreshold int
+	// BaseCooldown is the first backoff duration. Values <= 0 select
+	// DefaultBaseCooldown.
+	BaseCooldown time.Duration
+	// MaxCooldown caps the exponential growth. Values <= 0 select
+	// DefaultMaxCooldown.
+	MaxCooldown time.Duration
+	// Multiplier scales the cooldown per consecutive backoff. Values <= 1
+	// (including exactly 1) select DefaultMultiplier.
+	Multiplier float64
+	// Clock supplies the current time (defaults to time.Now).
+	Clock Clock
+}
+
+// withDefaults returns a copy of c in which every unset or out-of-range
+// field has been replaced by its package default. The receiver is a value,
+// so the caller's Config is never modified.
+func (c Config) withDefaults() Config {
+	out := c
+	if out.FailureThreshold < 1 {
+		out.FailureThreshold = DefaultFailureThreshold
+	}
+	if out.BaseCooldown < 1 {
+		out.BaseCooldown = DefaultBaseCooldown
+	}
+	if out.MaxCooldown < 1 {
+		out.MaxCooldown = DefaultMaxCooldown
+	}
+	// A multiplier of 1 or less would never grow the cooldown, so it is
+	// treated as "unset".
+	if out.Multiplier <= 1 {
+		out.Multiplier = DefaultMultiplier
+	}
+	if out.Clock == nil {
+		out.Clock = time.Now
+	}
+	return out
+}
+
+// Tracker records per-key health. Keys are opaque; majordomo uses
+// "provider/model-id".
+//
+// Tracker is an interface-free concrete type on purpose: consumers that want
+// persistence can wrap it behind their own interface; majordomo itself stays
+// in-memory (ADR-0006).
+//
+// Fields: mu is held by every method while it reads or writes entries; cfg
+// is the configuration with defaults already applied by NewTracker; entries
+// maps each key that has reported a failure since its last success to its
+// state. Because Tracker embeds a mutex it must be used through a pointer.
+type Tracker struct {
+	mu      sync.Mutex
+	cfg     Config
+	entries map[string]*entry
+}
+
+// entry is the per-key state kept by Tracker. An entry is created on a
+// key's first reported failure and removed entirely on its next success.
+type entry struct {
+	// consecutiveFailures counts transient failures since the last success
+	// or backoff trigger.
+	consecutiveFailures int
+	// backoffs counts consecutive backoff rounds since the last success;
+	// it drives the exponential cooldown.
+	backoffs int
+	// until is the moment the current backoff expires (zero = not backed off).
+	until time.Time
+}
+
+// NewTracker builds an empty tracker. Defaults are filled into cfg once
+// here, so the tracker's methods can rely on every field being set.
+func NewTracker(cfg Config) *Tracker {
+	tr := new(Tracker)
+	tr.cfg = cfg.withDefaults()
+	tr.entries = map[string]*entry{}
+	return tr
+}
+
+// Available reports whether key may be used right now. A key with no
+// recorded state is always available; otherwise it is available once the
+// clock has reached the end of its backoff window (the boundary instant
+// itself counts as available).
+func (t *Tracker) Available(key string) bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if rec, tracked := t.entries[key]; tracked {
+		now := t.cfg.Clock()
+		return !rec.until.After(now)
+	}
+	return true
+}
+
+// ReportSuccess forgets everything recorded for key: the failure count, the
+// backoff history, and any backoff window still in progress. Calling it for
+// an unknown key is a no-op.
+func (t *Tracker) ReportSuccess(key string) {
+	t.mu.Lock()
+	delete(t.entries, key)
+	t.mu.Unlock()
+}
+
+// ReportFailure records one transient failure for key and reports whether
+// that failure pushed the key into a backoff.
+//
+// Below the threshold the failure is only counted. On reaching the
+// threshold the key is backed off for the cooldown that matches its number
+// of previous backoffs, the backoff counter advances, and the failure count
+// restarts from zero, so the next (longer) backoff needs a fresh run of
+// failures.
+func (t *Tracker) ReportFailure(key string) (backedOff bool) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	rec, tracked := t.entries[key]
+	if !tracked {
+		rec = new(entry)
+		t.entries[key] = rec
+	}
+
+	rec.consecutiveFailures++
+	if rec.consecutiveFailures >= t.cfg.FailureThreshold {
+		wait := t.cooldownFor(rec.backoffs)
+		rec.until = t.cfg.Clock().Add(wait)
+		rec.backoffs++
+		rec.consecutiveFailures = 0
+		return true
+	}
+	return false
+}
+
+// BackedOffUntil returns the instant at which key's current backoff ends.
+// It returns the zero time when the key has no recorded state or its
+// backoff window has already elapsed. Intended for diagnostics and error
+// messages.
+func (t *Tracker) BackedOffUntil(key string) time.Time {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if rec, tracked := t.entries[key]; tracked && t.cfg.Clock().Before(rec.until) {
+		return rec.until
+	}
+	return time.Time{}
+}
+
+// cooldownFor computes the cooldown for the n-th consecutive backoff
+// (0-based): base * multiplier^n, capped at MaxCooldown. A non-positive n
+// yields the base cooldown (still capped).
+//
+// The cap is checked in floating point, before any conversion to
+// time.Duration. Converting a float64 that is outside the int64 range (or
+// NaN) to an integer is implementation-dependent in Go and typically yields
+// a negative value, which would slip past a Duration-based cap and produce
+// a negative cooldown for very large, infinite or NaN multipliers.
+func (t *Tracker) cooldownFor(n int) time.Duration {
+	limit := float64(t.cfg.MaxCooldown)
+	d := float64(t.cfg.BaseCooldown)
+	for range n {
+		// Written as !(d < limit) rather than d >= limit so that NaN,
+		// which fails every comparison, is also treated as "at the cap".
+		if !(d < limit) {
+			return t.cfg.MaxCooldown
+		}
+		d *= t.cfg.Multiplier
+	}
+	if !(d < limit) {
+		return t.cfg.MaxCooldown
+	}
+	// d < limit guarantees the conversion is in range; min absorbs the
+	// rounding of float64(MaxCooldown) for very large caps.
+	return min(time.Duration(d), t.cfg.MaxCooldown)
+}
@@ -0,0 +1,165 @@
+package health
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// fakeClock is a test clock that only moves when Advance is called, which
+// makes backoff expiry deterministic.
+type fakeClock struct {
+	// mu guards now so the clock can be read from concurrent goroutines.
+	mu sync.Mutex
+	// now is the instant the clock currently reports.
+	now time.Time
+}
+
+// newFakeClock returns a clock frozen at a fixed UTC instant.
+func newFakeClock() *fakeClock {
+	start := time.Date(2026, 6, 10, 12, 0, 0, 0, time.UTC)
+	return &fakeClock{now: start}
+}
+
+// Now returns the clock's current instant; its signature matches Clock.
+func (c *fakeClock) Now() time.Time {
+	c.mu.Lock()
+	current := c.now
+	c.mu.Unlock()
+	return current
+}
+
+// Advance moves the clock forward by d.
+func (c *fakeClock) Advance(d time.Duration) {
+	c.mu.Lock()
+	c.now = c.now.Add(d)
+	c.mu.Unlock()
+}
+
+// newTestTracker builds a tracker driven by clock, with every setting
+// spelled out (threshold 2, 5s base, 5m cap, x2 growth) so the tests do not
+// depend on the package defaults.
+func newTestTracker(clock *fakeClock) *Tracker {
+	cfg := Config{Clock: clock.Now}
+	cfg.FailureThreshold = 2
+	cfg.BaseCooldown = 5 * time.Second
+	cfg.MaxCooldown = 5 * time.Minute
+	cfg.Multiplier = 2
+	return NewTracker(cfg)
+}
+
+// TestSingleFailureStaysAvailable checks that one failure, below the
+// threshold of two, neither reports a backoff nor hides the key.
+func TestSingleFailureStaysAvailable(t *testing.T) {
+	tr := newTestTracker(newFakeClock())
+	if tr.ReportFailure("k") {
+		t.Error("first failure must not back off")
+	}
+	if available := tr.Available("k"); !available {
+		t.Error("key should remain available after one failure")
+	}
+}
+
+// TestThresholdTriggersBackoff checks that the second consecutive failure
+// backs the key off for the base cooldown.
+func TestThresholdTriggersBackoff(t *testing.T) {
+	clock := newFakeClock()
+	tr := newTestTracker(clock)
+
+	tr.ReportFailure("k")
+	if !tr.ReportFailure("k") {
+		t.Error("second consecutive failure should back off")
+	}
+	if tr.Available("k") {
+		t.Error("key should be unavailable during backoff")
+	}
+
+	until := tr.BackedOffUntil("k")
+	want := clock.Now().Add(5 * time.Second)
+	if !until.Equal(want) {
+		t.Errorf("BackedOffUntil = %v, want now+5s", until)
+	}
+}
+
+func TestCooldownExpiryReadmits(t *testing.T) {
+	clock := newFakeClock()
+	tr := newTestTracker(clock)
+	tr.ReportFailure("k")
+	tr.ReportFailure("k")
+	clock.Advance(5*time.Second - time.Millisecond)
+	if tr.Available("k") {
+		t.Error("still inside cooldown")
+	}
+	clock.Advance(time.Millisecond)
+	if !tr.Available("k") {
+		t.Error("cooldown expiry should re-admit the key")
+	}
+}
+
+// TestExponentialCooldownWithCap triggers backoff after backoff without an
+// intervening success and checks that the cooldown doubles each round until
+// it reaches the five-minute cap.
+func TestExponentialCooldownWithCap(t *testing.T) {
+	clock := newFakeClock()
+	tr := newTestTracker(clock)
+
+	// Consecutive backoffs: 5s, 10s, 20s, ... capped at 5m.
+	wantCooldowns := []time.Duration{
+		5 * time.Second,
+		10 * time.Second,
+		20 * time.Second,
+		40 * time.Second,
+		80 * time.Second,
+		160 * time.Second,
+		5 * time.Minute,
+		5 * time.Minute,
+	}
+	for round := 0; round < len(wantCooldowns); round++ {
+		want := wantCooldowns[round]
+		tr.ReportFailure("k")
+		tr.ReportFailure("k")
+		got := tr.BackedOffUntil("k").Sub(clock.Now())
+		if got != want {
+			t.Fatalf("backoff #%d cooldown = %v, want %v", round+1, got, want)
+		}
+		// Let the window elapse so the next pair of failures starts a new round.
+		clock.Advance(want)
+	}
+}
+
+// TestSuccessResetsEverything checks that a success wipes both the failure
+// count and the backoff exponent: afterwards a single failure does not back
+// off, and the next backoff uses the base cooldown again.
+func TestSuccessResetsEverything(t *testing.T) {
+	clock := newFakeClock()
+	tr := newTestTracker(clock)
+
+	// Build up to a long cooldown...
+	for round := 0; round < 3; round++ {
+		tr.ReportFailure("k")
+		tr.ReportFailure("k")
+		remaining := tr.BackedOffUntil("k").Sub(clock.Now())
+		clock.Advance(remaining)
+	}
+
+	// ...then a success resets both the count and the exponent.
+	tr.ReportSuccess("k")
+
+	tr.ReportFailure("k")
+	if !tr.Available("k") {
+		t.Error("one failure after success must not back off")
+	}
+
+	tr.ReportFailure("k")
+	got := tr.BackedOffUntil("k").Sub(clock.Now())
+	if got != 5*time.Second {
+		t.Errorf("post-reset cooldown = %v, want base 5s", got)
+	}
+}
+
+// TestKeysAreIndependent checks that backing off one key leaves an
+// unrelated key available.
+func TestKeysAreIndependent(t *testing.T) {
+	tr := newTestTracker(newFakeClock())
+	for range 2 {
+		tr.ReportFailure("a")
+	}
+	if aUsable := tr.Available("a"); aUsable {
+		t.Error("a should be backed off")
+	}
+	if bUsable := tr.Available("b"); !bUsable {
+		t.Error("b must be unaffected")
+	}
+}
+
+// TestDefaultsApplied checks that a zero Config ends up with every package
+// default and a non-nil clock.
+func TestDefaultsApplied(t *testing.T) {
+	cfg := NewTracker(Config{}).cfg
+	defaultsOK := cfg.FailureThreshold == DefaultFailureThreshold &&
+		cfg.BaseCooldown == DefaultBaseCooldown &&
+		cfg.MaxCooldown == DefaultMaxCooldown &&
+		cfg.Multiplier == DefaultMultiplier &&
+		cfg.Clock != nil
+	if !defaultsOK {
+		t.Errorf("defaults not applied: %+v", cfg)
+	}
+}
+
+// TestTrackerConcurrency hammers two keys from eight goroutines. It makes
+// no assertions of its own; it exists so the race detector can flag any
+// unsynchronised access inside the tracker.
+func TestTrackerConcurrency(t *testing.T) {
+	tr := newTestTracker(newFakeClock())
+	keys := []string{"a", "b"}
+
+	var wg sync.WaitGroup
+	for worker := 0; worker < 8; worker++ {
+		key := keys[worker%2]
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for iter := 0; iter < 200; iter++ {
+				tr.ReportFailure(key)
+				tr.Available(key)
+				tr.ReportSuccess(key)
+			}
+		}()
+	}
+	wg.Wait()
+}