run: fix statusFor — don't relabel a generic error / caller-cancel as timeout (gadfly #11)
The WithCancelCause+timer rewrite made MaxRuntime surface as Canceled (not DeadlineExceeded), so statusFor's context.Cause(DeadlineExceeded) check could relabel (a) a genuine run error as 'timeout' and (b) a caller cancel/deadline as 'timeout' (was 'cancelled'). Convergent gadfly finding (glm-5.2 + cluster). Fix: keep MaxRuntime as WithTimeout (its DeadlineExceeded propagates → 'timeout', preserving own-timeout vs caller-cancel), add a NESTED WithCancelCause layer only for the kill. statusFor consults context.Cause ONLY for ErrCriticKill; everything else is classified by the run error itself. Tests: generic-error-not-relabeled + caller-cancel-stays-cancelled. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+15
-7
@@ -197,13 +197,16 @@ func (e *Executor) Run(ctx context.Context, ra RunnableAgent, inv tool.Invocatio
|
|||||||
// cancellation still propagates via MergeCancellation. Created BEFORE the
|
// cancellation still propagates via MergeCancellation. Created BEFORE the
|
||||||
// step observer so the observer forwards the merged run context (not a
|
// step observer so the observer forwards the merged run context (not a
|
||||||
// possibly-cancelled caller ctx) to OnStep consumers.
|
// possibly-cancelled caller ctx) to OnStep consumers.
|
||||||
// Cause-carrying cancel so a critic kill, a backstop timeout, and a caller
|
// MaxRuntime stays a WithTimeout so its DeadlineExceeded propagates through the
|
||||||
// cancel land as distinct statuses. MaxRuntime is enforced by a timer that
|
// child chain (→ "timeout"), preserving the run's-own-timeout vs caller-cancel
|
||||||
// cancels with DeadlineExceeded (preserving the old WithTimeout → "timeout").
|
// distinction. A NESTED cause-carrying layer lets a critic kill surface as a
|
||||||
runCtx, cancelCause := context.WithCancelCause(context.WithoutCancel(ctx))
|
// distinct "killed" without disturbing that: only an ErrCriticKill cause is
|
||||||
|
// consulted in statusFor; a generic run error or a caller cancel is classified
|
||||||
|
// by the run error itself.
|
||||||
|
timeoutCtx, cancelTimeout := context.WithTimeout(context.WithoutCancel(ctx), maxRuntime)
|
||||||
|
defer cancelTimeout()
|
||||||
|
runCtx, cancelCause := context.WithCancelCause(timeoutCtx)
|
||||||
defer cancelCause(nil)
|
defer cancelCause(nil)
|
||||||
runTimer := time.AfterFunc(maxRuntime, func() { cancelCause(context.DeadlineExceeded) })
|
|
||||||
defer runTimer.Stop()
|
|
||||||
runCtx, mergeCancel := MergeCancellation(runCtx, ctx)
|
runCtx, mergeCancel := MergeCancellation(runCtx, ctx)
|
||||||
defer mergeCancel()
|
defer mergeCancel()
|
||||||
|
|
||||||
@@ -303,9 +306,14 @@ func statusFor(runCtx context.Context, runErr error) string {
|
|||||||
switch {
|
switch {
|
||||||
case runErr == nil:
|
case runErr == nil:
|
||||||
return "ok"
|
return "ok"
|
||||||
|
// Only the kill is recovered from the cancellation cause — a critic kill
|
||||||
|
// surfaces as a plain Canceled run error, so without this it'd read as
|
||||||
|
// "cancelled". Everything else is classified by the run error itself, so a
|
||||||
|
// genuine run error is never relabeled just because the context was later
|
||||||
|
// cancelled, and a caller cancel/deadline stays "cancelled" (not "timeout").
|
||||||
case errors.Is(context.Cause(runCtx), ErrCriticKill):
|
case errors.Is(context.Cause(runCtx), ErrCriticKill):
|
||||||
return "killed"
|
return "killed"
|
||||||
case errors.Is(runErr, context.DeadlineExceeded) || errors.Is(context.Cause(runCtx), context.DeadlineExceeded):
|
case errors.Is(runErr, context.DeadlineExceeded):
|
||||||
return "timeout"
|
return "timeout"
|
||||||
case errors.Is(runErr, context.Canceled):
|
case errors.Is(runErr, context.Canceled):
|
||||||
return "cancelled"
|
return "cancelled"
|
||||||
|
|||||||
@@ -155,6 +155,10 @@ func TestStatusFor(t *testing.T) {
|
|||||||
// context.Cause carries ErrCriticKill → "killed".
|
// context.Cause carries ErrCriticKill → "killed".
|
||||||
killCtx, killCancel := context.WithCancelCause(context.Background())
|
killCtx, killCancel := context.WithCancelCause(context.Background())
|
||||||
killCancel(fmt.Errorf("%w: hung", ErrCriticKill))
|
killCancel(fmt.Errorf("%w: hung", ErrCriticKill))
|
||||||
|
// A context cancelled with a non-kill cause must NOT relabel a genuine run
|
||||||
|
// error: a real error stays "error" even though the ctx was later cancelled.
|
||||||
|
cancelledCtx, cc := context.WithCancelCause(context.Background())
|
||||||
|
cc(context.DeadlineExceeded)
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
ctx context.Context
|
ctx context.Context
|
||||||
err error
|
err error
|
||||||
@@ -166,6 +170,8 @@ func TestStatusFor(t *testing.T) {
|
|||||||
{bg, fmt.Errorf("wrapped: %w", context.DeadlineExceeded), "timeout"},
|
{bg, fmt.Errorf("wrapped: %w", context.DeadlineExceeded), "timeout"},
|
||||||
{bg, errors.New("boom"), "error"},
|
{bg, errors.New("boom"), "error"},
|
||||||
{killCtx, context.Canceled, "killed"},
|
{killCtx, context.Canceled, "killed"},
|
||||||
|
{cancelledCtx, errors.New("boom"), "error"}, // generic error not relabeled by cause
|
||||||
|
{cancelledCtx, context.Canceled, "cancelled"}, // caller cancel stays cancelled, not timeout
|
||||||
}
|
}
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
if got := statusFor(c.ctx, c.err); got != c.want {
|
if got := statusFor(c.ctx, c.err); got != c.want {
|
||||||
|
|||||||
Reference in New Issue
Block a user