run: fix statusFor — don't relabel a generic error / caller-cancel as timeout (gadfly #11)
executus CI / test (pull_request) Successful in 47s
executus CI / test (push) Successful in 45s

The WithCancelCause+timer rewrite made MaxRuntime surface as Canceled (not
DeadlineExceeded), so statusFor's context.Cause(DeadlineExceeded) check could
relabel (a) a genuine run error as 'timeout' and (b) a caller cancel/deadline as
'timeout' (was 'cancelled'). Convergent gadfly finding (glm-5.2 + cluster).

Fix: keep MaxRuntime as WithTimeout (its DeadlineExceeded propagates → 'timeout',
preserving own-timeout vs caller-cancel), add a NESTED WithCancelCause layer only
for the kill. statusFor consults context.Cause ONLY for ErrCriticKill; everything
else is classified by the run error itself. Tests: generic-error-not-relabeled +
caller-cancel-stays-cancelled.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-27 17:00:26 -04:00
parent 390e6cf905
commit be4bbbcad5
2 changed files with 21 additions and 7 deletions
+15 -7
View File
@@ -197,13 +197,16 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
// cancellation still propagates via MergeCancellation. Created BEFORE the
// step observer so the observer forwards the merged run context (not a
// possibly-cancelled caller ctx) to OnStep consumers.
// Cause-carrying cancel so a critic kill, a backstop timeout, and a caller
// cancel land as distinct statuses. MaxRuntime is enforced by a timer that
// cancels with DeadlineExceeded (preserving the old WithTimeout → "timeout").
runCtx, cancelCause := context.WithCancelCause(context.WithoutCancel(ctx))
// MaxRuntime stays a WithTimeout so its DeadlineExceeded propagates through the
// child chain (→ "timeout"), preserving the run's-own-timeout vs caller-cancel
// distinction. A NESTED cause-carrying layer lets a critic kill surface as a
// distinct "killed" without disturbing that: only an ErrCriticKill cause is
// consulted in statusFor; a generic run error or a caller cancel is classified
// by the run error itself.
timeoutCtx, cancelTimeout := context.WithTimeout(context.WithoutCancel(ctx), maxRuntime)
defer cancelTimeout()
runCtx, cancelCause := context.WithCancelCause(timeoutCtx)
defer cancelCause(nil)
runTimer := time.AfterFunc(maxRuntime, func() { cancelCause(context.DeadlineExceeded) })
defer runTimer.Stop()
runCtx, mergeCancel := MergeCancellation(runCtx, ctx)
defer mergeCancel()
@@ -303,9 +306,14 @@ func statusFor(runCtx context.Context, runErr error) string {
switch {
case runErr == nil:
return "ok"
// Only the kill is recovered from the cancellation cause — a critic kill
// surfaces as a plain Canceled run error, so without this it'd read as
// "cancelled". Everything else is classified by the run error itself, so a
// genuine run error is never relabeled just because the context was later
// cancelled, and a caller cancel/deadline stays "cancelled" (not "timeout").
case errors.Is(context.Cause(runCtx), ErrCriticKill):
return "killed"
case errors.Is(runErr, context.DeadlineExceeded) || errors.Is(context.Cause(runCtx), context.DeadlineExceeded):
case errors.Is(runErr, context.DeadlineExceeded):
return "timeout"
case errors.Is(runErr, context.Canceled):
return "cancelled"
+6
View File
@@ -155,6 +155,10 @@ func TestStatusFor(t *testing.T) {
// context.Cause carries ErrCriticKill → "killed".
killCtx, killCancel := context.WithCancelCause(context.Background())
killCancel(fmt.Errorf("%w: hung", ErrCriticKill))
// A context cancelled with a non-kill cause must NOT relabel a genuine run
// error: a real error stays "error" even though the ctx was later cancelled.
cancelledCtx, cc := context.WithCancelCause(context.Background())
cc(context.DeadlineExceeded)
cases := []struct {
ctx context.Context
err error
@@ -166,6 +170,8 @@ func TestStatusFor(t *testing.T) {
{bg, fmt.Errorf("wrapped: %w", context.DeadlineExceeded), "timeout"},
{bg, errors.New("boom"), "error"},
{killCtx, context.Canceled, "killed"},
{cancelledCtx, errors.New("boom"), "error"}, // generic error not relabeled by cause
{cancelledCtx, context.Canceled, "cancelled"}, // caller cancel stays cancelled, not timeout
}
for _, c := range cases {
if got := statusFor(c.ctx, c.err); got != c.want {