fix(failover): preserve manual bench against automatic cooldown downgrade
CI / Build, Test & Lint (push) Successful in 10m48s

recordTransientFailure and benchNow unconditionally set manual=false and
reset until to now+cooldown. When the best-effort all-benched failover path
re-tries a model an operator manually benched via BenchModel, a subsequent
failure downgraded manual=true -> false and could shorten the operator's
window to the short auto cooldown.

Both functions now read existing state: if it is an active manual bench
(manual && now.Before(until)) they bump consecutiveFails but keep manual=true
and the later until. Non-manual or expired-manual state still gets the
automatic cooldown. Adds TestFailover_ManualBenchSurvivesAutomaticDowngrade
covering no-prior, prior-auto, active-manual, and expired-manual cases.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-01 00:37:38 +02:00
parent ae8e194fad
commit 361999550e
2 changed files with 93 additions and 0 deletions
+79
View File
@@ -300,3 +300,82 @@ func TestNewFailoverModel_Flattens(t *testing.T) {
}
}
}
// TestFailover_ManualBenchSurvivesAutomaticDowngrade is a regression test:
// an active manual bench (long window) must NOT be cleared or shortened by the
// automatic recordTransientFailure/benchNow paths. The best-effort all-benched
// failover loop can re-try a manually benched model; if it fails, the automatic
// logic previously overwrote manual=true -> false and shortened until to the
// short auto cooldown. The operator's intent must win.
func TestFailover_ManualBenchSurvivesAutomaticDowngrade(t *testing.T) {
resetHealthForTest()
const key = "p/manual"
longUntil := time.Now().Add(time.Hour)
shortCooldown := time.Minute
now := time.Now()
// Operator manually benches the model for a long window.
BenchModel(key, longUntil)
assertManualPreserved := func(stage string) {
t.Helper()
list := ListBenched()
var got *BenchedModel
for i := range list {
if list[i].Model == key {
got = &list[i]
break
}
}
if got == nil {
t.Fatalf("%s: %q missing from ListBenched; manual bench was cleared", stage, key)
}
if !got.Manual {
t.Errorf("%s: Manual=false, want true (automatic logic downgraded manual bench)", stage)
}
if !got.Until.Equal(longUntil) {
t.Errorf("%s: Until=%v, want %v (automatic logic shortened operator window)", stage, got.Until, longUntil)
}
}
// (c) Active manual bench hit by the automatic transient path.
benched, until := globalHealth.recordTransientFailure(key, shortCooldown, now)
if !benched {
t.Errorf("recordTransientFailure: benched=false, want true (model stays benched)")
}
if !until.Equal(longUntil) {
t.Errorf("recordTransientFailure: until=%v, want %v (long manual window preserved)", until, longUntil)
}
assertManualPreserved("after recordTransientFailure")
// (c) Active manual bench hit by the automatic auth-dead path.
until = globalHealth.benchNow(key, shortCooldown, now)
if !until.Equal(longUntil) {
t.Errorf("benchNow: until=%v, want %v (long manual window preserved)", until, longUntil)
}
assertManualPreserved("after benchNow")
// (d) Expired manual bench: automatic logic IS allowed to take over.
resetHealthForTest()
expired := time.Now().Add(-time.Hour)
BenchModel(key, expired)
until = globalHealth.benchNow(key, shortCooldown, now)
if want := now.Add(shortCooldown); !until.Equal(want) {
t.Errorf("benchNow on expired manual: until=%v, want auto cooldown %v", until, want)
}
// (a) No prior state and (b) prior automatic bench: automatic cooldown applies.
resetHealthForTest()
benched, until = globalHealth.recordTransientFailure(key, shortCooldown, now) // (a)
if !benched || !until.Equal(now.Add(shortCooldown)) {
t.Errorf("recordTransientFailure(no prior): benched=%v until=%v, want true %v", benched, until, now.Add(shortCooldown))
}
until = globalHealth.benchNow(key, shortCooldown, now) // (b) prior automatic state
if want := now.Add(shortCooldown); !until.Equal(want) {
t.Errorf("benchNow(prior auto): until=%v, want %v", until, want)
}
if list := ListBenched(); len(list) != 1 || list[0].Manual {
t.Errorf("automatic bench should not be Manual: %+v", list)
}
}