Files
steve 390e6cf905
executus CI / test (pull_request) Successful in 46s
Adversarial Review (Gadfly) / review (pull_request) Successful in 22m30s
run: critic parity — fuller RecordStep + cause-carrying Kill (distinct status)
Completes the run-critic seam so a host adapter (mort's agentcritic) has full
fidelity, closing the two limitations gadfly surfaced on mort #1334.

- RecordStep(iter int, resp *llm.Response): the completed step's model response
  is now passed to the critic (was index-only), so a host that records a trace
  (mort's ProgressRecorder) can show what the agent actually produced, not just
  an iteration count. The executor forwards s.Response; the battery ignores it
  (its Progress is count-based).
- CriticHandle.KillCause() error + ErrCriticKill: the executor now distinguishes
  an explicit critic KILL from a natural backstop expiry. runCtx uses a
  cause-carrying cancel (WithCancelCause + a MaxRuntime timer cancelling with
  DeadlineExceeded); the deadline-watch cancels with ErrCriticKill when
  KillCause()!=nil, else DeadlineExceeded. statusFor reads context.Cause →
  killed / timeout / cancelled are now distinct (were all "cancelled"). The
  battery sets killCause from Decision.KillReason on a Kill.

Tests: statusFor "killed" case (cause=ErrCriticKill, err=Canceled); fake handle
+ battery RecordStep/KillCause signatures. Core stays battery-free.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 16:35:13 -04:00

101 lines
3.1 KiB
Go

package critic
import (
"context"
"sync"
"testing"
"time"
"gitea.stevedudenhoeffer.com/steve/executus/run"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// escFunc adapts a func to an Escalator.
type escFunc func(context.Context, run.RunInfo, Progress) Decision
func (f escFunc) OnSoftTimeout(ctx context.Context, i run.RunInfo, p Progress) Decision {
return f(ctx, i, p)
}
func TestMonitorEscalatesOncePerIdlePeriodAndExtends(t *testing.T) {
var mu sync.Mutex
var calls int
esc := escFunc(func(_ context.Context, _ run.RunInfo, p Progress) Decision {
mu.Lock()
calls++
mu.Unlock()
return Decision{ExtendBy: 50 * time.Millisecond, Nudge: []llm.Message{{Role: llm.RoleUser}}}
})
s := New(esc, 3)
s.checkInterval = 5 * time.Millisecond
h := s.Monitor(context.Background(), run.RunInfo{RunID: "r"}, 20*time.Millisecond)
defer h.Stop()
d0 := h.Deadline()
time.Sleep(60 * time.Millisecond) // cross the soft timeout with no activity
mu.Lock()
c := calls
mu.Unlock()
if c < 1 {
t.Fatalf("expected at least one escalation, got %d", c)
}
// Nudge was queued and is drained once.
if msgs := h.Steer(); len(msgs) == 0 {
t.Error("expected a queued steer nudge")
}
if msgs := h.Steer(); len(msgs) != 0 {
t.Error("steer should drain (be empty on second read)")
}
// Deadline was extended.
if !h.Deadline().After(d0) {
t.Error("deadline should have been extended past the original")
}
// A fresh step re-arms; another idle period escalates again.
h.RecordStep(1, nil)
time.Sleep(60 * time.Millisecond)
mu.Lock()
c2 := calls
mu.Unlock()
if c2 <= c {
t.Errorf("a re-armed idle period should escalate again (%d -> %d)", c, c2)
}
}
func TestKillCollapsesDeadline(t *testing.T) {
esc := escFunc(func(context.Context, run.RunInfo, Progress) Decision {
return Decision{Kill: true, KillReason: "hung"}
})
s := New(esc, 10) // big backstop so only Kill collapses it
s.checkInterval = 5 * time.Millisecond
h := s.Monitor(context.Background(), run.RunInfo{RunID: "r"}, 20*time.Millisecond)
defer h.Stop()
time.Sleep(60 * time.Millisecond)
if h.Deadline().After(time.Now().Add(time.Second)) {
t.Error("Kill should collapse the deadline to ~now")
}
}
func TestExtendOnceOnlyFiresOnce(t *testing.T) {
e := &ExtendOnce{By: time.Minute}
// Same run id: only the first call extends.
d1 := e.OnSoftTimeout(context.Background(), run.RunInfo{RunID: "r1"}, Progress{})
d2 := e.OnSoftTimeout(context.Background(), run.RunInfo{RunID: "r1"}, Progress{})
if d1.ExtendBy != time.Minute {
t.Errorf("first decision should extend, got %+v", d1)
}
if d2.ExtendBy != 0 || d2.Kill {
t.Errorf("second call for the same run should be a no-op, got %+v", d2)
}
// A DIFFERENT run still gets its own one extension (per-run, not global).
if d3 := e.OnSoftTimeout(context.Background(), run.RunInfo{RunID: "r2"}, Progress{}); d3.ExtendBy != time.Minute {
t.Errorf("a different run should get its own extension, got %+v", d3)
}
}
func TestZeroSoftTimeoutNotMonitored(t *testing.T) {
s := New(nil, 3)
if h := s.Monitor(context.Background(), run.RunInfo{}, 0); h != nil {
t.Error("zero soft timeout should return a nil handle (not monitored)")
}
}