From 361999550eee61994b94052b32440dd3df1352f0 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Mon, 1 Jun 2026 00:37:38 +0200 Subject: [PATCH] fix(failover): preserve manual bench against automatic cooldown downgrade recordTransientFailure and benchNow unconditionally set manual=false and reset until to now+cooldown. When the best-effort all-benched failover path re-tries a model an operator manually benched via BenchModel, a subsequent failure downgraded manual=true -> false and could shorten the operator's window to the short auto cooldown. Both functions now read existing state: if it is an active manual bench (manual && now.Before(until)) they bump consecutiveFails but keep manual=true and the later until. Non-manual or expired-manual state still gets the automatic cooldown. Adds TestFailover_ManualBenchSurvivesAutomaticDowngrade covering no-prior, prior-auto, active-manual, and expired-manual cases. Co-Authored-By: Claude Opus 4.8 --- v2/failover.go | 14 ++++++++ v2/failover_test.go | 79 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/v2/failover.go b/v2/failover.go index f3fcef1..9c9fb41 100644 --- a/v2/failover.go +++ b/v2/failover.go @@ -127,7 +127,14 @@ func (h *modelHealth) recordTransientFailure(key string, cooldown time.Duration, h.mu.Lock() defer h.mu.Unlock() st := h.disabled[key] + // Preserve an active manual bench: automatic logic must not clear the + // operator's manual flag or shorten their window. Still count the failure. + manualActive := st.manual && now.Before(st.until) st.consecutiveFails++ + if manualActive { + h.disabled[key] = st + return true, st.until + } if st.consecutiveFails >= benchThreshold { st.until = now.Add(cooldown) st.manual = false @@ -143,7 +150,14 @@ func (h *modelHealth) benchNow(key string, cooldown time.Duration, now time.Time h.mu.Lock() defer h.mu.Unlock() st := h.disabled[key] + // Preserve an active manual bench: automatic logic must not clear the + // operator's manual flag or shorten their window. Still count the failure. + manualActive := st.manual && now.Before(st.until) st.consecutiveFails++ + if manualActive { + h.disabled[key] = st + return st.until + } st.until = now.Add(cooldown) st.manual = false h.disabled[key] = st diff --git a/v2/failover_test.go b/v2/failover_test.go index 7c644cd..e039e5e 100644 --- a/v2/failover_test.go +++ b/v2/failover_test.go @@ -300,3 +300,82 @@ func TestNewFailoverModel_Flattens(t *testing.T) { } } } + +// TestFailover_ManualBenchSurvivesAutomaticDowngrade is a regression test: +// an active manual bench (long window) must NOT be cleared or shortened by the +// automatic recordTransientFailure/benchNow paths. The best-effort all-benched +// failover loop can re-try a manually benched model; if it fails, the automatic +// logic previously overwrote manual=true -> false and shortened until to the +// short auto cooldown. The operator's intent must win. +func TestFailover_ManualBenchSurvivesAutomaticDowngrade(t *testing.T) { + resetHealthForTest() + + const key = "p/manual" + longUntil := time.Now().Add(time.Hour) + shortCooldown := time.Minute + now := time.Now() + + // Operator manually benches the model for a long window. + BenchModel(key, longUntil) + + assertManualPreserved := func(stage string) { + t.Helper() + list := ListBenched() + var got *BenchedModel + for i := range list { + if list[i].Model == key { + got = &list[i] + break + } + } + if got == nil { + t.Fatalf("%s: %q missing from ListBenched; manual bench was cleared", stage, key) + } + if !got.Manual { + t.Errorf("%s: Manual=false, want true (automatic logic downgraded manual bench)", stage) + } + if !got.Until.Equal(longUntil) { + t.Errorf("%s: Until=%v, want %v (automatic logic shortened operator window)", stage, got.Until, longUntil) + } + } + + // (c) Active manual bench hit by the automatic transient path. + benched, until := globalHealth.recordTransientFailure(key, shortCooldown, now) + if !benched { + t.Errorf("recordTransientFailure: benched=false, want true (model stays benched)") + } + if !until.Equal(longUntil) { + t.Errorf("recordTransientFailure: until=%v, want %v (long manual window preserved)", until, longUntil) + } + assertManualPreserved("after recordTransientFailure") + + // (c) Active manual bench hit by the automatic auth-dead path. + until = globalHealth.benchNow(key, shortCooldown, now) + if !until.Equal(longUntil) { + t.Errorf("benchNow: until=%v, want %v (long manual window preserved)", until, longUntil) + } + assertManualPreserved("after benchNow") + + // (d) Expired manual bench: automatic logic IS allowed to take over. + resetHealthForTest() + expired := time.Now().Add(-time.Hour) + BenchModel(key, expired) + until = globalHealth.benchNow(key, shortCooldown, now) + if want := now.Add(shortCooldown); !until.Equal(want) { + t.Errorf("benchNow on expired manual: until=%v, want auto cooldown %v", until, want) + } + + // (a) No prior state and (b) prior automatic bench: automatic cooldown applies. + resetHealthForTest() + benched, until = globalHealth.recordTransientFailure(key, shortCooldown, now) // (a) + if !benched || !until.Equal(now.Add(shortCooldown)) { + t.Errorf("recordTransientFailure(no prior): benched=%v until=%v, want true %v", benched, until, now.Add(shortCooldown)) + } + until = globalHealth.benchNow(key, shortCooldown, now) // (b) prior automatic state + if want := now.Add(shortCooldown); !until.Equal(want) { + t.Errorf("benchNow(prior auto): until=%v, want %v", until, want) + } + if list := ListBenched(); len(list) != 1 || list[0].Manual { + t.Errorf("automatic bench should not be Manual: %+v", list) + } +}