fix(failover): preserve manual bench against automatic cooldown downgrade
CI / Build, Test & Lint (push) Successful in 10m48s
CI / Build, Test & Lint (push) Successful in 10m48s
recordTransientFailure and benchNow unconditionally set manual=false and reset until to now+cooldown. When the best-effort all-benched failover path re-tries a model an operator manually benched via BenchModel, a subsequent failure downgraded manual=true -> false and could shorten the operator's window to the short auto cooldown. Both functions now read existing state: if it is an active manual bench (manual && now.Before(until)) they bump consecutiveFails but keep manual=true and the later until. Non-manual or expired-manual state still gets the automatic cooldown. Adds TestFailover_ManualBenchSurvivesAutomaticDowngrade covering no-prior, prior-auto, active-manual, and expired-manual cases. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -127,7 +127,14 @@ func (h *modelHealth) recordTransientFailure(key string, cooldown time.Duration,
|
|||||||
h.mu.Lock()
|
h.mu.Lock()
|
||||||
defer h.mu.Unlock()
|
defer h.mu.Unlock()
|
||||||
st := h.disabled[key]
|
st := h.disabled[key]
|
||||||
|
// Preserve an active manual bench: automatic logic must not clear the
|
||||||
|
// operator's manual flag or shorten their window. Still count the failure.
|
||||||
|
manualActive := st.manual && now.Before(st.until)
|
||||||
st.consecutiveFails++
|
st.consecutiveFails++
|
||||||
|
if manualActive {
|
||||||
|
h.disabled[key] = st
|
||||||
|
return true, st.until
|
||||||
|
}
|
||||||
if st.consecutiveFails >= benchThreshold {
|
if st.consecutiveFails >= benchThreshold {
|
||||||
st.until = now.Add(cooldown)
|
st.until = now.Add(cooldown)
|
||||||
st.manual = false
|
st.manual = false
|
||||||
@@ -143,7 +150,14 @@ func (h *modelHealth) benchNow(key string, cooldown time.Duration, now time.Time
|
|||||||
h.mu.Lock()
|
h.mu.Lock()
|
||||||
defer h.mu.Unlock()
|
defer h.mu.Unlock()
|
||||||
st := h.disabled[key]
|
st := h.disabled[key]
|
||||||
|
// Preserve an active manual bench: automatic logic must not clear the
|
||||||
|
// operator's manual flag or shorten their window. Still count the failure.
|
||||||
|
manualActive := st.manual && now.Before(st.until)
|
||||||
st.consecutiveFails++
|
st.consecutiveFails++
|
||||||
|
if manualActive {
|
||||||
|
h.disabled[key] = st
|
||||||
|
return st.until
|
||||||
|
}
|
||||||
st.until = now.Add(cooldown)
|
st.until = now.Add(cooldown)
|
||||||
st.manual = false
|
st.manual = false
|
||||||
h.disabled[key] = st
|
h.disabled[key] = st
|
||||||
|
|||||||
@@ -300,3 +300,82 @@ func TestNewFailoverModel_Flattens(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestFailover_ManualBenchSurvivesAutomaticDowngrade is a regression test:
|
||||||
|
// an active manual bench (long window) must NOT be cleared or shortened by the
|
||||||
|
// automatic recordTransientFailure/benchNow paths. The best-effort all-benched
|
||||||
|
// failover loop can re-try a manually benched model; if it fails, the automatic
|
||||||
|
// logic previously overwrote manual=true -> false and shortened until to the
|
||||||
|
// short auto cooldown. The operator's intent must win.
|
||||||
|
func TestFailover_ManualBenchSurvivesAutomaticDowngrade(t *testing.T) {
|
||||||
|
resetHealthForTest()
|
||||||
|
|
||||||
|
const key = "p/manual"
|
||||||
|
longUntil := time.Now().Add(time.Hour)
|
||||||
|
shortCooldown := time.Minute
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
// Operator manually benches the model for a long window.
|
||||||
|
BenchModel(key, longUntil)
|
||||||
|
|
||||||
|
assertManualPreserved := func(stage string) {
|
||||||
|
t.Helper()
|
||||||
|
list := ListBenched()
|
||||||
|
var got *BenchedModel
|
||||||
|
for i := range list {
|
||||||
|
if list[i].Model == key {
|
||||||
|
got = &list[i]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got == nil {
|
||||||
|
t.Fatalf("%s: %q missing from ListBenched; manual bench was cleared", stage, key)
|
||||||
|
}
|
||||||
|
if !got.Manual {
|
||||||
|
t.Errorf("%s: Manual=false, want true (automatic logic downgraded manual bench)", stage)
|
||||||
|
}
|
||||||
|
if !got.Until.Equal(longUntil) {
|
||||||
|
t.Errorf("%s: Until=%v, want %v (automatic logic shortened operator window)", stage, got.Until, longUntil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// (c) Active manual bench hit by the automatic transient path.
|
||||||
|
benched, until := globalHealth.recordTransientFailure(key, shortCooldown, now)
|
||||||
|
if !benched {
|
||||||
|
t.Errorf("recordTransientFailure: benched=false, want true (model stays benched)")
|
||||||
|
}
|
||||||
|
if !until.Equal(longUntil) {
|
||||||
|
t.Errorf("recordTransientFailure: until=%v, want %v (long manual window preserved)", until, longUntil)
|
||||||
|
}
|
||||||
|
assertManualPreserved("after recordTransientFailure")
|
||||||
|
|
||||||
|
// (c) Active manual bench hit by the automatic auth-dead path.
|
||||||
|
until = globalHealth.benchNow(key, shortCooldown, now)
|
||||||
|
if !until.Equal(longUntil) {
|
||||||
|
t.Errorf("benchNow: until=%v, want %v (long manual window preserved)", until, longUntil)
|
||||||
|
}
|
||||||
|
assertManualPreserved("after benchNow")
|
||||||
|
|
||||||
|
// (d) Expired manual bench: automatic logic IS allowed to take over.
|
||||||
|
resetHealthForTest()
|
||||||
|
expired := time.Now().Add(-time.Hour)
|
||||||
|
BenchModel(key, expired)
|
||||||
|
until = globalHealth.benchNow(key, shortCooldown, now)
|
||||||
|
if want := now.Add(shortCooldown); !until.Equal(want) {
|
||||||
|
t.Errorf("benchNow on expired manual: until=%v, want auto cooldown %v", until, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
// (a) No prior state and (b) prior automatic bench: automatic cooldown applies.
|
||||||
|
resetHealthForTest()
|
||||||
|
benched, until = globalHealth.recordTransientFailure(key, shortCooldown, now) // (a)
|
||||||
|
if !benched || !until.Equal(now.Add(shortCooldown)) {
|
||||||
|
t.Errorf("recordTransientFailure(no prior): benched=%v until=%v, want true %v", benched, until, now.Add(shortCooldown))
|
||||||
|
}
|
||||||
|
until = globalHealth.benchNow(key, shortCooldown, now) // (b) prior automatic state
|
||||||
|
if want := now.Add(shortCooldown); !until.Equal(want) {
|
||||||
|
t.Errorf("benchNow(prior auto): until=%v, want %v", until, want)
|
||||||
|
}
|
||||||
|
if list := ListBenched(); len(list) != 1 || list[0].Manual {
|
||||||
|
t.Errorf("automatic bench should not be Manual: %+v", list)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user