fix(failover): preserve manual bench against automatic cooldown downgrade
CI / Build, Test & Lint (push) Successful in 10m48s
CI / Build, Test & Lint (push) Successful in 10m48s
recordTransientFailure and benchNow unconditionally set manual=false and reset until to now+cooldown. When the best-effort all-benched failover path re-tries a model an operator manually benched via BenchModel, a subsequent failure downgraded manual=true -> false and could shorten the operator's window to the short auto cooldown. Both functions now read existing state: if it is an active manual bench (manual && now.Before(until)) they bump consecutiveFails but keep manual=true and the later until. Non-manual or expired-manual state still gets the automatic cooldown. Adds TestFailover_ManualBenchSurvivesAutomaticDowngrade covering no-prior, prior-auto, active-manual, and expired-manual cases. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -300,3 +300,82 @@ func TestNewFailoverModel_Flattens(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestFailover_ManualBenchSurvivesAutomaticDowngrade is a regression test:
|
||||
// an active manual bench (long window) must NOT be cleared or shortened by the
|
||||
// automatic recordTransientFailure/benchNow paths. The best-effort all-benched
|
||||
// failover loop can re-try a manually benched model; if it fails, the automatic
|
||||
// logic previously overwrote manual=true -> false and shortened until to the
|
||||
// short auto cooldown. The operator's intent must win.
|
||||
func TestFailover_ManualBenchSurvivesAutomaticDowngrade(t *testing.T) {
|
||||
resetHealthForTest()
|
||||
|
||||
const key = "p/manual"
|
||||
longUntil := time.Now().Add(time.Hour)
|
||||
shortCooldown := time.Minute
|
||||
now := time.Now()
|
||||
|
||||
// Operator manually benches the model for a long window.
|
||||
BenchModel(key, longUntil)
|
||||
|
||||
assertManualPreserved := func(stage string) {
|
||||
t.Helper()
|
||||
list := ListBenched()
|
||||
var got *BenchedModel
|
||||
for i := range list {
|
||||
if list[i].Model == key {
|
||||
got = &list[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("%s: %q missing from ListBenched; manual bench was cleared", stage, key)
|
||||
}
|
||||
if !got.Manual {
|
||||
t.Errorf("%s: Manual=false, want true (automatic logic downgraded manual bench)", stage)
|
||||
}
|
||||
if !got.Until.Equal(longUntil) {
|
||||
t.Errorf("%s: Until=%v, want %v (automatic logic shortened operator window)", stage, got.Until, longUntil)
|
||||
}
|
||||
}
|
||||
|
||||
// (c) Active manual bench hit by the automatic transient path.
|
||||
benched, until := globalHealth.recordTransientFailure(key, shortCooldown, now)
|
||||
if !benched {
|
||||
t.Errorf("recordTransientFailure: benched=false, want true (model stays benched)")
|
||||
}
|
||||
if !until.Equal(longUntil) {
|
||||
t.Errorf("recordTransientFailure: until=%v, want %v (long manual window preserved)", until, longUntil)
|
||||
}
|
||||
assertManualPreserved("after recordTransientFailure")
|
||||
|
||||
// (c) Active manual bench hit by the automatic auth-dead path.
|
||||
until = globalHealth.benchNow(key, shortCooldown, now)
|
||||
if !until.Equal(longUntil) {
|
||||
t.Errorf("benchNow: until=%v, want %v (long manual window preserved)", until, longUntil)
|
||||
}
|
||||
assertManualPreserved("after benchNow")
|
||||
|
||||
// (d) Expired manual bench: automatic logic IS allowed to take over.
|
||||
resetHealthForTest()
|
||||
expired := time.Now().Add(-time.Hour)
|
||||
BenchModel(key, expired)
|
||||
until = globalHealth.benchNow(key, shortCooldown, now)
|
||||
if want := now.Add(shortCooldown); !until.Equal(want) {
|
||||
t.Errorf("benchNow on expired manual: until=%v, want auto cooldown %v", until, want)
|
||||
}
|
||||
|
||||
// (a) No prior state and (b) prior automatic bench: automatic cooldown applies.
|
||||
resetHealthForTest()
|
||||
benched, until = globalHealth.recordTransientFailure(key, shortCooldown, now) // (a)
|
||||
if !benched || !until.Equal(now.Add(shortCooldown)) {
|
||||
t.Errorf("recordTransientFailure(no prior): benched=%v until=%v, want true %v", benched, until, now.Add(shortCooldown))
|
||||
}
|
||||
until = globalHealth.benchNow(key, shortCooldown, now) // (b) prior automatic state
|
||||
if want := now.Add(shortCooldown); !until.Equal(want) {
|
||||
t.Errorf("benchNow(prior auto): until=%v, want %v", until, want)
|
||||
}
|
||||
if list := ListBenched(); len(list) != 1 || list[0].Manual {
|
||||
t.Errorf("automatic bench should not be Manual: %+v", list)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user