fix: address verified gadfly P4c findings (3-cloud fleet)
executus CI / test (pull_request) Successful in 1m39s
executus CI / test (pull_request) Successful in 1m39s
critic (all 3 models — HIGH):
- ExtendOnce was a single global one-shot shared across every run a System monitors, so only the FIRST run to stall got its extension and all others were killed by the backstop. Key the fired-state per run (RunInfo.RunID).
- Kill is now sticky: a `killed` flag short-circuits later ticks so a wavering Escalator returning ExtendBy after a Kill can't un-collapse the deadline; a Kill paired with Nudge/ExtendBy ignores the latter.
- watch() recovers panics from a misbehaving Escalator (logs; the run falls back to its existing deadline) instead of silently killing the watch goroutine.

checkpoint (deepseek — HIGH): handle.Save advanced the throttle clock BEFORE the store write, so a failed save was silently throttled away (caller believes it persisted). Advance lastSave only after a successful persist.

schedule (all 3): compute Next BEFORE Run — a permanently-unparseable cron now skips the job entirely instead of re-running it every tick forever; nil required callbacks return a validate() error instead of a first-tick nil panic; Loop recovers tick panics; the Mark-failure => possible-re-run trade-off is documented (Run must be idempotent).

+ tests for each.

Triaged-but-kept: critic backstopMul<=1 floor (it's a total-runtime multiple, so a floor >1 is intentional, not the reported footgun); checkpoint Load (nil,nil) on miss (documented convention).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+49
-11
@@ -16,6 +16,7 @@ package critic
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -47,25 +48,34 @@ type Escalator interface {
|
||||
OnSoftTimeout(ctx context.Context, info run.RunInfo, p Progress) Decision
|
||||
}
|
||||
|
||||
// ExtendOnce is the default Escalator: the first time a run stalls it extends
|
||||
// the deadline by By (giving a slow-but-healthy run room), then takes no
|
||||
// further action — so a genuinely hung run is later killed by the hard
|
||||
// backstop. A nil/zero By falls back to one soft-timeout's worth.
|
||||
// ExtendOnce is the default Escalator: the first time a given run stalls it
|
||||
// extends that run's deadline by By (giving a slow-but-healthy run room), then
|
||||
// takes no further action for it — so a genuinely hung run is later killed by
|
||||
// the hard backstop. A zero or negative By falls back to one soft-timeout's worth.
|
||||
//
|
||||
// The one-shot is keyed PER RUN (by RunInfo.RunID): a single System shares one
|
||||
// ExtendOnce across every run it monitors, so a global flag would let only the
|
||||
// first run to stall ever get its extension. The fired set grows with the
|
||||
// number of distinct runs that stall — fine for a process's run volume; a host
|
||||
// running unboundedly long can construct a fresh System periodically.
|
||||
type ExtendOnce struct {
|
||||
By time.Duration
|
||||
|
||||
mu sync.Mutex
|
||||
fired bool
|
||||
fired map[string]bool // run ids that have already had their one extension
|
||||
}
|
||||
|
||||
// OnSoftTimeout implements Escalator.
|
||||
func (e *ExtendOnce) OnSoftTimeout(_ context.Context, _ run.RunInfo, p Progress) Decision {
|
||||
func (e *ExtendOnce) OnSoftTimeout(_ context.Context, info run.RunInfo, p Progress) Decision {
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
if e.fired {
|
||||
if e.fired[info.RunID] {
|
||||
return Decision{}
|
||||
}
|
||||
e.fired = true
|
||||
if e.fired == nil {
|
||||
e.fired = map[string]bool{}
|
||||
}
|
||||
e.fired[info.RunID] = true
|
||||
by := e.By
|
||||
if by <= 0 {
|
||||
by = p.Idle // ~one soft timeout
|
||||
@@ -80,6 +90,14 @@ type System struct {
|
||||
backstopMul float64 // hard deadline = softTimeout * backstopMul from start
|
||||
checkInterval time.Duration
|
||||
now func() time.Time
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
func (s *System) log() *slog.Logger {
|
||||
if s.logger != nil {
|
||||
return s.logger
|
||||
}
|
||||
return slog.Default()
|
||||
}
|
||||
|
||||
// New builds a run.Critic. esc is the policy (nil → ExtendOnce). backstopMul is
|
||||
@@ -138,6 +156,7 @@ type handle struct {
|
||||
steer []llm.Message
|
||||
iterations int
|
||||
lastTool string
|
||||
killed bool // sticky: once an Escalator kills, no later decision un-kills it
|
||||
stopped bool
|
||||
stopCh chan struct{}
|
||||
}
|
||||
@@ -185,6 +204,14 @@ func (h *handle) Stop() {
|
||||
// watch fires the Escalator once for each idle period in which the run
|
||||
// crosses its soft timeout, and applies the returned Decision.
|
||||
func (h *handle) watch(ctx context.Context, interval time.Duration) {
|
||||
// A misbehaving Escalator that panics must not silently kill the watch
|
||||
// goroutine (which would leave the run unmonitored for its lifetime). Log
|
||||
// and exit cleanly — the run falls back to the deadline already set.
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
h.sys.log().Error("critic watch panicked; run is now unmonitored", "run", h.info.RunID, "panic", r)
|
||||
}
|
||||
}()
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
@@ -201,6 +228,12 @@ func (h *handle) watch(ctx context.Context, interval time.Duration) {
|
||||
|
||||
func (h *handle) tick(ctx context.Context) {
|
||||
h.mu.Lock()
|
||||
// Kill is sticky: once an Escalator has killed this run, no later tick (and
|
||||
// no later Decision) un-collapses the deadline.
|
||||
if h.killed {
|
||||
h.mu.Unlock()
|
||||
return
|
||||
}
|
||||
idle := h.now().Sub(h.lastActivity)
|
||||
// Only escalate once per idle period: skip if we already escalated for this
|
||||
// exact lastActivity (a fresh step/tool updates lastActivity and re-arms).
|
||||
@@ -216,13 +249,18 @@ func (h *handle) tick(ctx context.Context) {
|
||||
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if h.killed { // a concurrent tick may have killed while OnSoftTimeout ran
|
||||
return
|
||||
}
|
||||
if d.Kill {
|
||||
h.killed = true
|
||||
h.deadline = h.now() // immediate hard deadline → executor cancels
|
||||
return // ignore any Nudge/ExtendBy paired with a Kill
|
||||
}
|
||||
if len(d.Nudge) > 0 {
|
||||
h.steer = append(h.steer, d.Nudge...)
|
||||
}
|
||||
if d.ExtendBy > 0 {
|
||||
h.deadline = h.deadline.Add(d.ExtendBy)
|
||||
}
|
||||
if d.Kill {
|
||||
h.deadline = h.now() // immediate hard deadline → executor cancels
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user