cb4c612461
executus CI / test (pull_request) Successful in 1m45s
All 11 findings were real (3 clusters):

- Failsafe ceiling could pre-empt the critic's backstop (e9c9483f, 9109317b, d5a9bf0d, 76ad171e): CriticAbsoluteMax was 6h, but the host's backstop (MaxRuntime × multiplier, or its own absolute max) can reach 6h+, so the ceiling fired first and reintroduced a premature hard cap. Now CriticAbsoluteMax is a 24h RUNAWAY guard set far beyond any realistic backstop (the host clamps its own backstop to a much smaller absolute max, e.g. mort's 6h convar), so it never pre-empts a healthy supervised run. Comments corrected.
- nil Monitor handle lost the MaxRuntime cap (df016a6f, 9dd42827): a critic-enabled run whose host Monitor returned no handle had no deadline-watch and was bounded only by the generous ceiling. Added an unsupervised-run failsafe that re-wraps runCtx to the nominal MaxRuntime when the critic is enabled but didn't arm. New test TestCriticOwnsDeadline_NilHandleFallsBackToMaxRuntime.
- CriticSoftTimeout vestigial / dead fallback (f7764919, 9805bebe, 6864086f, b2b11721): the soft trigger is now always the resolved MaxRuntime (> 0), so the CriticSoftTimeout field + its startCritic fallback were unreachable. Removed the field entirely; the remaining 90s floor is documented as defensive-only.
- DRY (f30ce827): extracted e.criticOwnsDeadline(ra), now the single predicate used by both Run and startCritic so they can't drift.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Jo75sqmeVPgFUWZQBn179X
129 lines
4.7 KiB
Go
129 lines
4.7 KiB
Go
package run_test
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/executus/run"
|
|
"gitea.stevedudenhoeffer.com/steve/executus/tool"
|
|
)
|
|
|
|
// fakeCritic is the test double plugged into run.Ports.Critic. It carries a
// single pre-built handle, which Monitor returns for every call.
type fakeCritic struct {
	h *fakeCriticHandle // handle handed out by Monitor
}
|
|
|
|
func (c *fakeCritic) Monitor(_ context.Context, _ run.RunInfo, _ time.Duration) run.CriticHandle {
|
|
return c.h
|
|
}
|
|
|
|
// fakeCriticHandle is the handle returned by fakeCritic.Monitor. It tallies
// the calls made on it and carries values that tests preset to steer the run.
type fakeCriticHandle struct {
	mu sync.Mutex // held by the methods while they touch the fields below

	steps   int // incremented by RecordStep
	tools   int // incremented by RecordToolStart
	stops   int // incremented by Stop
	steered int // incremented by Steer

	maxSteps  int   // returned by MaxSteps; 0 => defer to the run's base MaxIterations
	killCause error // returned by KillCause; non-nil simulates a critic kill
}
|
|
|
|
// RecordStep bumps the step counter under the mutex; both arguments are ignored.
func (h *fakeCriticHandle) RecordStep(int, *llm.Response) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.steps++
}
|
|
func (h *fakeCriticHandle) KillCause() error {
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
return h.killCause
|
|
}
|
|
func (h *fakeCriticHandle) RecordToolStart(string, string) {
|
|
h.mu.Lock()
|
|
h.tools++
|
|
h.mu.Unlock()
|
|
}
|
|
// Steer counts the call under the mutex and returns nil: this fake never
// supplies any messages.
func (h *fakeCriticHandle) Steer() []llm.Message {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.steered++
	return nil
}
|
|
// Deadline always returns the zero time.Time, i.e. no hard deadline.
func (h *fakeCriticHandle) Deadline() time.Time {
	var none time.Time
	return none
}
|
|
// MaxSteps returns the preset maxSteps, read while holding the mutex.
func (h *fakeCriticHandle) MaxSteps() int {
	h.mu.Lock()
	limit := h.maxSteps
	h.mu.Unlock()
	return limit
}
|
|
// Stop bumps the stop counter under the mutex so tests can assert how many
// times it was called.
func (h *fakeCriticHandle) Stop() {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.stops++
}
|
|
|
|
// TestCriticRaisesStepCeiling: a critic returning a higher MaxSteps lets the agent
|
|
// run PAST its base MaxIterations (the dynamic step ceiling). With base=1 and no
|
|
// critic the run would hit ErrMaxSteps after the first tool-dispatch step; the
|
|
// critic raises it to 5 so the run completes.
|
|
func TestCriticRaisesStepCeiling(t *testing.T) {
|
|
h := &fakeCriticHandle{maxSteps: 5}
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m",
|
|
// two tool-call steps (unknown tool → tolerated error results), then answer
|
|
fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c1", Name: "noop", Arguments: []byte(`{}`)}}}),
|
|
fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c2", Name: "noop", Arguments: []byte(`{}`)}}}),
|
|
fake.Reply("done after 2 tool steps"),
|
|
)
|
|
m, _ := fp.Model("m")
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
Ports: run.Ports{Critic: &fakeCritic{h: h}},
|
|
// The fake handle's Deadline() is zero (no hard deadline), so the
|
|
// deadline-watch never interferes regardless of the soft trigger.
|
|
})
|
|
res := ex.Run(context.Background(),
|
|
run.RunnableAgent{Name: "x", ModelTier: "m", MaxIterations: 1, Critic: run.CriticConfig{Enabled: true}},
|
|
tool.Invocation{RunID: "r"}, "go")
|
|
if res.Err != nil {
|
|
t.Fatalf("critic raised the ceiling to 5, run should complete past base=1: %v", res.Err)
|
|
}
|
|
if res.Output != "done after 2 tool steps" {
|
|
t.Errorf("output = %q", res.Output)
|
|
}
|
|
}
|
|
|
|
// TestCriticWired: an agent with Critic.Enabled gets monitored — Monitor returns
|
|
// a handle the executor feeds (RecordStep), drains (Steer), and stops.
|
|
func TestCriticWired(t *testing.T) {
|
|
h := &fakeCriticHandle{}
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m", fake.Reply("done"))
|
|
m, _ := fp.Model("m")
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
Ports: run.Ports{Critic: &fakeCritic{h: h}},
|
|
})
|
|
res := ex.Run(context.Background(),
|
|
run.RunnableAgent{Name: "watched", ModelTier: "m", Critic: run.CriticConfig{Enabled: true}},
|
|
tool.Invocation{RunID: "r"}, "go")
|
|
if res.Err != nil {
|
|
t.Fatalf("run error: %v", res.Err)
|
|
}
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
if h.steps < 1 {
|
|
t.Errorf("critic should have seen >=1 step, got %d", h.steps)
|
|
}
|
|
if h.steered < 1 {
|
|
t.Errorf("critic Steer should be drained at least once, got %d", h.steered)
|
|
}
|
|
if h.stops != 1 {
|
|
t.Errorf("critic Stop should be called exactly once, got %d", h.stops)
|
|
}
|
|
}
|
|
|
|
// TestCriticDisabledNotMonitored: Critic.Enabled=false → Monitor never called.
|
|
func TestCriticDisabledNotMonitored(t *testing.T) {
|
|
h := &fakeCriticHandle{}
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m", fake.Reply("done"))
|
|
m, _ := fp.Model("m")
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
Ports: run.Ports{Critic: &fakeCritic{h: h}},
|
|
})
|
|
ex.Run(context.Background(),
|
|
run.RunnableAgent{Name: "x", ModelTier: "m"}, // Critic.Enabled=false
|
|
tool.Invocation{RunID: "r"}, "go")
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
if h.stops != 0 || h.steps != 0 {
|
|
t.Errorf("disabled critic should not be monitored: steps=%d stops=%d", h.steps, h.stops)
|
|
}
|
|
}
|