390e6cf905
Completes the run-critic seam so a host adapter (mort's agentcritic) has full fidelity, closing the two limitations gadfly surfaced on mort #1334. - RecordStep(iter int, resp *llm.Response): the completed step's model response is now passed to the critic (was index-only), so a host that records a trace (mort's ProgressRecorder) can show what the agent actually produced, not just an iteration count. The executor forwards s.Response; the battery ignores it (its Progress is count-based). - CriticHandle.KillCause() error + ErrCriticKill: the executor now distinguishes an explicit critic KILL from a natural backstop expiry. runCtx uses a cause-carrying cancel (WithCancelCause + a MaxRuntime timer cancelling with DeadlineExceeded); the deadline-watch cancels with ErrCriticKill when KillCause()!=nil, else DeadlineExceeded. statusFor reads context.Cause → killed / timeout / cancelled are now distinct (were all "cancelled"). The battery sets killCause from Decision.KillReason on a Kill. Tests: statusFor "killed" case (cause=ErrCriticKill, err=Canceled); fake handle + battery RecordStep/KillCause signatures. Core stays battery-free. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
129 lines
4.6 KiB
Go
129 lines
4.6 KiB
Go
package run_test
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/executus/run"
|
|
"gitea.stevedudenhoeffer.com/steve/executus/tool"
|
|
)
|
|
|
|
// fakeCritic is a test double wired into run.Ports.Critic by the tests in
// this file. It holds one pre-built handle so a test can inspect the handle's
// counters after the run.
type fakeCritic struct {
	h *fakeCriticHandle
}
|
|
|
|
func (c *fakeCritic) Monitor(_ context.Context, _ run.RunInfo, _ time.Duration) run.CriticHandle {
|
|
return c.h
|
|
}
|
|
|
|
// fakeCriticHandle is the handle returned by fakeCritic.Monitor. It counts the
// calls made to it and serves canned values for the step ceiling and the kill
// cause. Every method in this file takes mu before touching a field.
type fakeCriticHandle struct {
	mu sync.Mutex

	// Call counters, bumped by RecordStep, RecordToolStart, Stop and Steer
	// respectively.
	steps   int
	tools   int
	stops   int
	steered int

	maxSteps  int   // 0 => defer to the run's base MaxIterations
	killCause error // non-nil simulates a critic kill
}
|
|
|
|
// RecordStep counts one step. The iteration index and the model response are
// both ignored by this fake.
func (h *fakeCriticHandle) RecordStep(int, *llm.Response) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.steps++
}
|
|
func (h *fakeCriticHandle) KillCause() error {
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
return h.killCause
|
|
}
|
|
func (h *fakeCriticHandle) RecordToolStart(string, string) {
|
|
h.mu.Lock()
|
|
h.tools++
|
|
h.mu.Unlock()
|
|
}
|
|
// Steer counts the call and returns nil: this fake never has messages to
// inject.
func (h *fakeCriticHandle) Steer() []llm.Message {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.steered++
	return nil
}
|
|
// Deadline always returns the zero time.Time, which this fake uses to mean
// "no hard deadline".
func (h *fakeCriticHandle) Deadline() time.Time {
	var none time.Time // no hard deadline
	return none
}
|
|
// MaxSteps returns the configured maxSteps, read under the handle's mutex.
// Zero means "defer to the run's base MaxIterations".
func (h *fakeCriticHandle) MaxSteps() int {
	h.mu.Lock()
	limit := h.maxSteps
	h.mu.Unlock()
	return limit
}
|
|
// Stop counts the call so tests can assert how many times the handle was
// stopped.
func (h *fakeCriticHandle) Stop() {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.stops++
}
|
|
|
|
// TestCriticRaisesStepCeiling: a critic returning a higher MaxSteps lets the agent
|
|
// run PAST its base MaxIterations (the dynamic step ceiling). With base=1 and no
|
|
// critic the run would hit ErrMaxSteps after the first tool-dispatch step; the
|
|
// critic raises it to 5 so the run completes.
|
|
func TestCriticRaisesStepCeiling(t *testing.T) {
|
|
h := &fakeCriticHandle{maxSteps: 5}
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m",
|
|
// two tool-call steps (unknown tool → tolerated error results), then answer
|
|
fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c1", Name: "noop", Arguments: []byte(`{}`)}}}),
|
|
fake.ReplyWith(llm.Response{ToolCalls: []llm.ToolCall{{ID: "c2", Name: "noop", Arguments: []byte(`{}`)}}}),
|
|
fake.Reply("done after 2 tool steps"),
|
|
)
|
|
m, _ := fp.Model("m")
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
Ports: run.Ports{Critic: &fakeCritic{h: h}},
|
|
// large soft timeout so the deadline-watch never interferes in the test
|
|
Defaults: run.Defaults{CriticSoftTimeout: time.Hour},
|
|
})
|
|
res := ex.Run(context.Background(),
|
|
run.RunnableAgent{Name: "x", ModelTier: "m", MaxIterations: 1, Critic: run.CriticConfig{Enabled: true}},
|
|
tool.Invocation{RunID: "r"}, "go")
|
|
if res.Err != nil {
|
|
t.Fatalf("critic raised the ceiling to 5, run should complete past base=1: %v", res.Err)
|
|
}
|
|
if res.Output != "done after 2 tool steps" {
|
|
t.Errorf("output = %q", res.Output)
|
|
}
|
|
}
|
|
|
|
// TestCriticWired: an agent with Critic.Enabled gets monitored — Monitor returns
|
|
// a handle the executor feeds (RecordStep), drains (Steer), and stops.
|
|
func TestCriticWired(t *testing.T) {
|
|
h := &fakeCriticHandle{}
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m", fake.Reply("done"))
|
|
m, _ := fp.Model("m")
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
Ports: run.Ports{Critic: &fakeCritic{h: h}},
|
|
})
|
|
res := ex.Run(context.Background(),
|
|
run.RunnableAgent{Name: "watched", ModelTier: "m", Critic: run.CriticConfig{Enabled: true}},
|
|
tool.Invocation{RunID: "r"}, "go")
|
|
if res.Err != nil {
|
|
t.Fatalf("run error: %v", res.Err)
|
|
}
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
if h.steps < 1 {
|
|
t.Errorf("critic should have seen >=1 step, got %d", h.steps)
|
|
}
|
|
if h.steered < 1 {
|
|
t.Errorf("critic Steer should be drained at least once, got %d", h.steered)
|
|
}
|
|
if h.stops != 1 {
|
|
t.Errorf("critic Stop should be called exactly once, got %d", h.stops)
|
|
}
|
|
}
|
|
|
|
// TestCriticDisabledNotMonitored: Critic.Enabled=false → Monitor never called.
|
|
func TestCriticDisabledNotMonitored(t *testing.T) {
|
|
h := &fakeCriticHandle{}
|
|
fp := fake.New("fake")
|
|
fp.Enqueue("m", fake.Reply("done"))
|
|
m, _ := fp.Model("m")
|
|
ex := run.New(run.Config{
|
|
Registry: tool.NewRegistry(),
|
|
Models: func(ctx context.Context, _ string) (context.Context, llm.Model, error) { return ctx, m, nil },
|
|
Ports: run.Ports{Critic: &fakeCritic{h: h}},
|
|
})
|
|
ex.Run(context.Background(),
|
|
run.RunnableAgent{Name: "x", ModelTier: "m"}, // Critic.Enabled=false
|
|
tool.Invocation{RunID: "r"}, "go")
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
if h.stops != 0 || h.steps != 0 {
|
|
t.Errorf("disabled critic should not be monitored: steps=%d stops=%d", h.steps, h.stops)
|
|
}
|
|
}
|