From be4bbbcad54ab6693262b3ed42bf77a531c5790c Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sat, 27 Jun 2026 17:00:26 -0400 Subject: [PATCH] =?UTF-8?q?run:=20fix=20statusFor=20=E2=80=94=20don't=20re?= =?UTF-8?q?label=20a=20generic=20error=20/=20caller-cancel=20as=20timeout?= =?UTF-8?q?=20(gadfly=20#11)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WithCancelCause+timer rewrite made MaxRuntime surface as Canceled (not DeadlineExceeded), so statusFor's context.Cause(DeadlineExceeded) check could relabel (a) a genuine run error as 'timeout' and (b) a caller cancel/deadline as 'timeout' (was 'cancelled'). Convergent gadfly finding (glm-5.2 + cluster). Fix: keep MaxRuntime as WithTimeout (its DeadlineExceeded propagates → 'timeout', preserving own-timeout vs caller-cancel), add a NESTED WithCancelCause layer only for the kill. statusFor consults context.Cause ONLY for ErrCriticKill; everything else is classified by the run error itself. Tests: generic-error-not-relabeled + caller-cancel-stays-cancelled. Co-Authored-By: Claude Opus 4.8 (1M context) --- run/executor.go | 22 +++++++++++++++------- run/executor_test.go | 6 ++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/run/executor.go b/run/executor.go index 61400e0..9cad80e 100644 --- a/run/executor.go +++ b/run/executor.go @@ -197,13 +197,16 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio // cancellation still propagates via MergeCancellation. Created BEFORE the // step observer so the observer forwards the merged run context (not a // possibly-cancelled caller ctx) to OnStep consumers. - // Cause-carrying cancel so a critic kill, a backstop timeout, and a caller - // cancel land as distinct statuses. MaxRuntime is enforced by a timer that - // cancels with DeadlineExceeded (preserving the old WithTimeout → "timeout"). - runCtx, cancelCause := context.WithCancelCause(context.WithoutCancel(ctx)) + // MaxRuntime stays a WithTimeout so its DeadlineExceeded propagates through the + // child chain (→ "timeout"), preserving the run's-own-timeout vs caller-cancel + // distinction. A NESTED cause-carrying layer lets a critic kill surface as a + // distinct "killed" without disturbing that: only an ErrCriticKill cause is + // consulted in statusFor; a generic run error or a caller cancel is classified + // by the run error itself. + timeoutCtx, cancelTimeout := context.WithTimeout(context.WithoutCancel(ctx), maxRuntime) + defer cancelTimeout() + runCtx, cancelCause := context.WithCancelCause(timeoutCtx) defer cancelCause(nil) - runTimer := time.AfterFunc(maxRuntime, func() { cancelCause(context.DeadlineExceeded) }) - defer runTimer.Stop() runCtx, mergeCancel := MergeCancellation(runCtx, ctx) defer mergeCancel() @@ -303,9 +306,14 @@ func statusFor(runCtx context.Context, runErr error) string { switch { case runErr == nil: return "ok" + // Only the kill is recovered from the cancellation cause — a critic kill + // surfaces as a plain Canceled run error, so without this it'd read as + // "cancelled". Everything else is classified by the run error itself, so a + // genuine run error is never relabeled just because the context was later + // cancelled, and a caller cancel/deadline stays "cancelled" (not "timeout"). case errors.Is(context.Cause(runCtx), ErrCriticKill): return "killed" - case errors.Is(runErr, context.DeadlineExceeded) || errors.Is(context.Cause(runCtx), context.DeadlineExceeded): + case errors.Is(runErr, context.DeadlineExceeded): return "timeout" case errors.Is(runErr, context.Canceled): return "cancelled" diff --git a/run/executor_test.go b/run/executor_test.go index 85e498b..16b5038 100644 --- a/run/executor_test.go +++ b/run/executor_test.go @@ -155,6 +155,10 @@ func TestStatusFor(t *testing.T) { // context.Cause carries ErrCriticKill → "killed". killCtx, killCancel := context.WithCancelCause(context.Background()) killCancel(fmt.Errorf("%w: hung", ErrCriticKill)) + // A context cancelled with a non-kill cause must NOT relabel a genuine run + // error: a real error stays "error" even though the ctx was later cancelled. + cancelledCtx, cc := context.WithCancelCause(context.Background()) + cc(context.DeadlineExceeded) cases := []struct { ctx context.Context err error @@ -166,6 +170,8 @@ func TestStatusFor(t *testing.T) { {bg, fmt.Errorf("wrapped: %w", context.DeadlineExceeded), "timeout"}, {bg, errors.New("boom"), "error"}, {killCtx, context.Canceled, "killed"}, + {cancelledCtx, errors.New("boom"), "error"}, // generic error not relabeled by cause + {cancelledCtx, context.Canceled, "cancelled"}, // caller cancel stays cancelled, not timeout } for _, c := range cases { if got := statusFor(c.ctx, c.err); got != c.want {