run: critic parity — fuller RecordStep + cause-carrying Kill (distinct status)
executus CI / test (pull_request) Successful in 46s
Adversarial Review (Gadfly) / review (pull_request) Successful in 22m30s

Completes the run-critic seam so a host adapter (mort's agentcritic) has full
fidelity, closing the two limitations gadfly surfaced on mort #1334.

- RecordStep(iter int, resp *llm.Response): the completed step's model response
  is now passed to the critic (was index-only), so a host that records a trace
  (mort's ProgressRecorder) can show what the agent actually produced, not just
  an iteration count. The executor forwards s.Response; the battery ignores it
  (its Progress is count-based).
- CriticHandle.KillCause() error + ErrCriticKill: the executor now distinguishes
  an explicit critic KILL from a natural backstop expiry. runCtx uses a
  cause-carrying cancel (WithCancelCause + a MaxRuntime timer cancelling with
  DeadlineExceeded); the deadline-watch cancels with ErrCriticKill when
  KillCause()!=nil, else DeadlineExceeded. statusFor reads context.Cause →
  killed / timeout / cancelled are now distinct (were all "cancelled"). The
  battery sets killCause from Decision.KillReason on a Kill.

Tests: statusFor "killed" case (cause=ErrCriticKill, err=Canceled); fake handle
+ battery RecordStep/KillCause signatures. Core stays battery-free.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-27 16:35:13 -04:00
parent 1a1d5e417b
commit 390e6cf905
7 changed files with 98 additions and 36 deletions
+19 -4
View File
@@ -18,6 +18,7 @@ package critic
import ( import (
"context" "context"
"errors"
"log/slog" "log/slog"
"math" "math"
"sync" "sync"
@@ -162,12 +163,15 @@ type handle struct {
iterations int iterations int
maxSteps int // current tool-dispatch ceiling (base MaxIterations, raised by RaiseStepsBy) maxSteps int // current tool-dispatch ceiling (base MaxIterations, raised by RaiseStepsBy)
lastTool string lastTool string
killed bool // sticky: once an Escalator kills, no later decision un-kills it killed bool // sticky: once an Escalator kills, no later decision un-kills it
killCause error // non-nil once killed; surfaced via KillCause for "killed" status
stopped bool stopped bool
stopCh chan struct{} stopCh chan struct{}
} }
func (h *handle) RecordStep(iter int) { func (h *handle) RecordStep(iter int, _ *llm.Response) {
// This battery's Progress tracks iteration count + activity, not per-step
// payload, so the response is unused here; a richer Escalator could record it.
h.mu.Lock() h.mu.Lock()
h.iterations = iter h.iterations = iter
h.lastActivity = h.now() h.lastActivity = h.now()
@@ -204,6 +208,12 @@ func (h *handle) MaxSteps() int {
return h.maxSteps return h.maxSteps
} }
func (h *handle) KillCause() error {
h.mu.Lock()
defer h.mu.Unlock()
return h.killCause
}
func (h *handle) Stop() { func (h *handle) Stop() {
h.mu.Lock() h.mu.Lock()
if !h.stopped { if !h.stopped {
@@ -266,8 +276,13 @@ func (h *handle) tick(ctx context.Context) {
} }
if d.Kill { if d.Kill {
h.killed = true h.killed = true
h.deadline = h.now() // immediate hard deadline → executor cancels reason := d.KillReason
return // ignore any Nudge/ExtendBy paired with a Kill if reason == "" {
reason = "critic killed the run"
}
h.killCause = errors.New(reason) // surfaced via KillCause → "killed" status
h.deadline = h.now() // immediate hard deadline → executor cancels
return // ignore any Nudge/ExtendBy paired with a Kill
} }
if len(d.Nudge) > 0 { if len(d.Nudge) > 0 {
h.steer = append(h.steer, d.Nudge...) h.steer = append(h.steer, d.Nudge...)
+1 -1
View File
@@ -51,7 +51,7 @@ func TestMonitorEscalatesOncePerIdlePeriodAndExtends(t *testing.T) {
t.Error("deadline should have been extended past the original") t.Error("deadline should have been extended past the original")
} }
// A fresh step re-arms; another idle period escalates again. // A fresh step re-arms; another idle period escalates again.
h.RecordStep(1) h.RecordStep(1, nil)
time.Sleep(60 * time.Millisecond) time.Sleep(60 * time.Millisecond)
mu.Lock() mu.Lock()
c2 := calls c2 := calls
+20 -10
View File
@@ -2,9 +2,11 @@ package run
import ( import (
"context" "context"
"fmt"
"time" "time"
"gitea.stevedudenhoeffer.com/steve/majordomo/agent" "gitea.stevedudenhoeffer.com/steve/majordomo/agent"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
) )
// criticDeadlineCheck is how often the deadline-watch goroutine polls the // criticDeadlineCheck is how often the deadline-watch goroutine polls the
@@ -21,12 +23,15 @@ type criticBinding struct {
} }
// startCritic begins critic monitoring for this run when one is configured and // startCritic begins critic monitoring for this run when one is configured and
// the agent enables it. It launches a goroutine that cancels runCtx (via cancel) // the agent enables it. It launches a goroutine that cancels runCtx (via
// the moment the critic's hard deadline passes — the critic may extend that // cancelCause) the moment the critic's hard deadline passes — the critic may
// deadline, so a healthy-but-slow run is given room while a hung one is killed. // extend that deadline, so a healthy-but-slow run is given room while a hung one
// Returns (nil, no-op stop) when there is no critic. The caller MUST defer the // is killed. When the deadline passes because the critic KILLED the run
// returned stop. // (KillCause() != nil), the cancellation cause is ErrCriticKill (→ status
func (e *Executor) startCritic(runCtx context.Context, cancel context.CancelFunc, ra RunnableAgent, info RunInfo) (*criticBinding, func()) { // "killed"); when the backstop simply expired, it is context.DeadlineExceeded (→
// "timeout"). Returns (nil, no-op stop) when there is no critic. The caller MUST
// defer the returned stop.
func (e *Executor) startCritic(runCtx context.Context, cancelCause context.CancelCauseFunc, ra RunnableAgent, info RunInfo) (*criticBinding, func()) {
noop := func() {} noop := func() {}
if e.cfg.Ports.Critic == nil || !ra.Critic.Enabled { if e.cfg.Ports.Critic == nil || !ra.Critic.Enabled {
return nil, noop return nil, noop
@@ -55,9 +60,14 @@ func (e *Executor) startCritic(runCtx context.Context, cancel context.CancelFunc
return return
case <-t.C: case <-t.C:
// A zero deadline = no hard cap (not yet set); otherwise cancel // A zero deadline = no hard cap (not yet set); otherwise cancel
// once we're at or past it. // once we're at or past it, distinguishing an explicit kill from a
// natural backstop expiry so the run gets the right status.
if d := h.Deadline(); !d.IsZero() && !time.Now().Before(d) { if d := h.Deadline(); !d.IsZero() && !time.Now().Before(d) {
cancel() if cause := h.KillCause(); cause != nil {
cancelCause(fmt.Errorf("%w: %s", ErrCriticKill, cause.Error()))
} else {
cancelCause(context.DeadlineExceeded)
}
return return
} }
} }
@@ -69,9 +79,9 @@ func (e *Executor) startCritic(runCtx context.Context, cancel context.CancelFunc
} }
} }
func (b *criticBinding) recordStep(iter int) { func (b *criticBinding) recordStep(iter int, resp *llm.Response) {
if b != nil { if b != nil {
b.h.RecordStep(iter) b.h.RecordStep(iter, resp)
} }
} }
+8 -2
View File
@@ -23,10 +23,16 @@ type fakeCriticHandle struct {
mu sync.Mutex mu sync.Mutex
steps, tools, stops int steps, tools, stops int
steered int steered int
maxSteps int // 0 => defer to the run's base MaxIterations maxSteps int // 0 => defer to the run's base MaxIterations
killCause error // non-nil simulates a critic kill
} }
func (h *fakeCriticHandle) RecordStep(int) { h.mu.Lock(); h.steps++; h.mu.Unlock() } func (h *fakeCriticHandle) RecordStep(int, *llm.Response) { h.mu.Lock(); h.steps++; h.mu.Unlock() }
func (h *fakeCriticHandle) KillCause() error {
h.mu.Lock()
defer h.mu.Unlock()
return h.killCause
}
func (h *fakeCriticHandle) RecordToolStart(string, string) { func (h *fakeCriticHandle) RecordToolStart(string, string) {
h.mu.Lock() h.mu.Lock()
h.tools++ h.tools++
+19 -10
View File
@@ -197,15 +197,20 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
// cancellation still propagates via MergeCancellation. Created BEFORE the // cancellation still propagates via MergeCancellation. Created BEFORE the
// step observer so the observer forwards the merged run context (not a // step observer so the observer forwards the merged run context (not a
// possibly-cancelled caller ctx) to OnStep consumers. // possibly-cancelled caller ctx) to OnStep consumers.
runCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), maxRuntime) // Cause-carrying cancel so a critic kill, a backstop timeout, and a caller
defer cancel() // cancel land as distinct statuses. MaxRuntime is enforced by a timer that
// cancels with DeadlineExceeded (preserving the old WithTimeout → "timeout").
runCtx, cancelCause := context.WithCancelCause(context.WithoutCancel(ctx))
defer cancelCause(nil)
runTimer := time.AfterFunc(maxRuntime, func() { cancelCause(context.DeadlineExceeded) })
defer runTimer.Stop()
runCtx, mergeCancel := MergeCancellation(runCtx, ctx) runCtx, mergeCancel := MergeCancellation(runCtx, ctx)
defer mergeCancel() defer mergeCancel()
// Critic (optional): monitors the run for a stall, can nudge/extend/kill via // Critic (optional): monitors the run for a stall, can nudge/extend/kill via
// its host Escalator. Its hard deadline is bound to runCtx (cancel on pass). // its host Escalator. Its hard deadline is bound to runCtx (cancel on pass).
// nil-safe: no-op when no critic is configured or the agent doesn't enable it. // nil-safe: no-op when no critic is configured or the agent doesn't enable it.
critic, stopCritic := e.startCritic(runCtx, cancel, ra, info) critic, stopCritic := e.startCritic(runCtx, cancelCause, ra, info)
defer stopCritic() defer stopCritic()
// Step instrumentation: accumulate Result.Steps + fire inv.OnStep, feed the // Step instrumentation: accumulate Result.Steps + fire inv.OnStep, feed the
@@ -222,7 +227,7 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
if rec != nil { if rec != nil {
rec.OnStep(s.Index, s.Response) rec.OnStep(s.Index, s.Response)
} }
critic.recordStep(s.Index) // keep the critic's activity clock fresh critic.recordStep(s.Index, s.Response) // keep the critic's activity clock fresh + carry the step payload
var calls []llm.ToolCall var calls []llm.ToolCall
if s.Response != nil { if s.Response != nil {
calls = s.Response.ToolCalls calls = s.Response.ToolCalls
@@ -273,7 +278,7 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
ag := agent.New(model, e.systemPrompt(ra), opts...) ag := agent.New(model, e.systemPrompt(ra), opts...)
runRes, runErr := ag.Run(runCtx, input, critic.steerOptions()...) runRes, runErr := ag.Run(runCtx, input, critic.steerOptions()...)
status := statusFor(runErr) status := statusFor(runCtx, runErr)
if runRes != nil { if runRes != nil {
res.Output = runRes.Output res.Output = runRes.Output
res.Usage = runRes.Usage res.Usage = runRes.Usage
@@ -289,14 +294,18 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
return res return res
} }
// statusFor maps a run error to a RunStats.Status, distinguishing a deadline // statusFor maps a run error to a RunStats.Status, distinguishing a critic kill
// (timeout) and a cancellation (cancelled — caller cancel or shutdown) from a // (killed), a deadline (timeout), and a cancellation (cancelled — caller cancel
// generic error so audit consumers can tell them apart. // or shutdown) from a generic error so audit consumers can tell them apart. The
func statusFor(runErr error) string { // run context's cancellation cause carries the distinction (ErrCriticKill /
// DeadlineExceeded), since ctx.Err() alone only reports Canceled.
func statusFor(runCtx context.Context, runErr error) string {
switch { switch {
case runErr == nil: case runErr == nil:
return "ok" return "ok"
case errors.Is(runErr, context.DeadlineExceeded): case errors.Is(context.Cause(runCtx), ErrCriticKill):
return "killed"
case errors.Is(runErr, context.DeadlineExceeded) || errors.Is(context.Cause(runCtx), context.DeadlineExceeded):
return "timeout" return "timeout"
case errors.Is(runErr, context.Canceled): case errors.Is(runErr, context.Canceled):
return "cancelled" return "cancelled"
+14 -7
View File
@@ -148,20 +148,27 @@ func TestExecutorNilModelNoPanic(t *testing.T) {
} }
} }
// TestStatusFor maps run errors to RunStats.Status (gadfly F3). // TestStatusFor maps run errors + cancellation cause to RunStats.Status (gadfly F3).
func TestStatusFor(t *testing.T) { func TestStatusFor(t *testing.T) {
bg := context.Background()
// A context cancelled with the critic-kill cause: ctx.Err() is Canceled, but
// context.Cause carries ErrCriticKill → "killed".
killCtx, killCancel := context.WithCancelCause(context.Background())
killCancel(fmt.Errorf("%w: hung", ErrCriticKill))
cases := []struct { cases := []struct {
ctx context.Context
err error err error
want string want string
}{ }{
{nil, "ok"}, {bg, nil, "ok"},
{context.DeadlineExceeded, "timeout"}, {bg, context.DeadlineExceeded, "timeout"},
{context.Canceled, "cancelled"}, {bg, context.Canceled, "cancelled"},
{fmt.Errorf("wrapped: %w", context.DeadlineExceeded), "timeout"}, {bg, fmt.Errorf("wrapped: %w", context.DeadlineExceeded), "timeout"},
{errors.New("boom"), "error"}, {bg, errors.New("boom"), "error"},
{killCtx, context.Canceled, "killed"},
} }
for _, c := range cases { for _, c := range cases {
if got := statusFor(c.err); got != c.want { if got := statusFor(c.ctx, c.err); got != c.want {
t.Errorf("statusFor(%v) = %q, want %q", c.err, got, c.want) t.Errorf("statusFor(%v) = %q, want %q", c.err, got, c.want)
} }
} }
+17 -2
View File
@@ -2,6 +2,7 @@ package run
import ( import (
"context" "context"
"errors"
"time" "time"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm" "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
@@ -9,6 +10,12 @@ import (
"gitea.stevedudenhoeffer.com/steve/executus/deliver" "gitea.stevedudenhoeffer.com/steve/executus/deliver"
) )
// ErrCriticKill is the cancellation cause the executor stamps on a run the
// critic kills, so a critic kill surfaces as a distinct "killed" status (vs a
// backstop "timeout" or a caller "cancelled"). A host CriticHandle signals a
// kill via KillCause(); the executor wraps that reason with this sentinel.
var ErrCriticKill = errors.New("run: critic killed the run")
// Ports are the host seams the run executor consumes. Every field is nil-safe: // Ports are the host seams the run executor consumes. Every field is nil-safe:
// a light host passes the zero Ports and gets a bounded, in-memory run with no // a light host passes the zero Ports and gets a bounded, in-memory run with no
// persistence, audit, budget, critic, delegation, or delivery — which is // persistence, audit, budget, critic, delegation, or delivery — which is
@@ -123,8 +130,10 @@ type Critic interface {
// methods (the critic battery's handle guards its state with a mutex). // methods (the critic battery's handle guards its state with a mutex).
type CriticHandle interface { type CriticHandle interface {
// RecordStep / RecordToolStart keep the critic's activity clock fresh so a // RecordStep / RecordToolStart keep the critic's activity clock fresh so a
// healthy-but-slow run is not mistaken for a hang. // healthy-but-slow run is not mistaken for a hang. RecordStep also carries the
RecordStep(iter int) // completed step's model response (nil-safe) so the critic's Trace can show
// what the agent actually produced, not just an iteration count.
RecordStep(iter int, resp *llm.Response)
RecordToolStart(name, args string) RecordToolStart(name, args string)
// Steer returns any messages the critic wants injected into the loop (a // Steer returns any messages the critic wants injected into the loop (a
// nudge), drained before each step — matches majordomo agent.WithSteer. // nudge), drained before each step — matches majordomo agent.WithSteer.
@@ -137,6 +146,12 @@ type CriticHandle interface {
// healthy-but-long run's iteration budget mid-flight. Return <= 0 to defer to // healthy-but-long run's iteration budget mid-flight. Return <= 0 to defer to
// the run's base MaxIterations. // the run's base MaxIterations.
MaxSteps() int MaxSteps() int
// KillCause returns a non-nil reason iff the critic has decided to KILL this
// run (as opposed to letting the hard-deadline backstop expire). The executor
// reads it when the deadline passes: non-nil → cancel the run with
// ErrCriticKill (status "killed"); nil → the backstop expired naturally
// (status "timeout"). Hosts that never distinguish the two may return nil.
KillCause() error
// Stop ends monitoring when the run finishes. // Stop ends monitoring when the run finishes.
Stop() Stop()
} }