0147a79d18
Phase 9a (ADR-0014): Registry.RegisterResolver for dynamic tiers; DefineTool[Args] typed tools; Usage cache/reasoning detail fields wired through anthropic/openai/google; WithPromptCaching (Anthropic cache_control); agent supervision hooks (WithMaxStepsFunc, WithSteer, WithCompactor, WithToolErrorLimits + ErrToolLoop); health Bench/Unbench/Snapshot; ChainConfig.Observer failover events. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
176 lines
5.7 KiB
Go
176 lines
5.7 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"strings"
|
|
"sync/atomic"
|
|
"testing"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
|
)
|
|
|
|
// TestMaxStepsFuncExtendsBudget: a supervisor raising the ceiling mid-run
|
|
// lets the loop continue past the static budget.
|
|
func TestMaxStepsFuncExtendsBudget(t *testing.T) {
|
|
fp := fake.New("fp")
|
|
fp.Enqueue("test-model",
|
|
toolCallReply("c1", "add", `{"a":1,"b":1}`),
|
|
toolCallReply("c2", "add", `{"a":2,"b":2}`),
|
|
toolCallReply("c3", "add", `{"a":3,"b":3}`),
|
|
fake.Reply("done"),
|
|
)
|
|
|
|
var ceiling atomic.Int64
|
|
ceiling.Store(2)
|
|
a := New(newModel(t, fp), "",
|
|
WithToolbox(adderToolbox(t)),
|
|
WithMaxSteps(2),
|
|
WithMaxStepsFunc(func() int { return int(ceiling.Load()) }),
|
|
WithStepObserver(func(s Step) {
|
|
if s.Index == 1 {
|
|
ceiling.Store(10) // the "critic" extends the budget
|
|
}
|
|
}),
|
|
)
|
|
res, err := a.Run(context.Background(), "go")
|
|
if err != nil {
|
|
t.Fatalf("Run: %v (budget should have been extended)", err)
|
|
}
|
|
if res.Output != "done" || len(res.Steps) != 4 {
|
|
t.Errorf("output=%q steps=%d", res.Output, len(res.Steps))
|
|
}
|
|
}
|
|
|
|
// TestSteerInjectsMessages: steering messages appear in the conversation
|
|
// before the next model call.
|
|
func TestSteerInjectsMessages(t *testing.T) {
|
|
fp := fake.New("fp")
|
|
fp.Enqueue("test-model",
|
|
toolCallReply("c1", "add", `{"a":1,"b":1}`),
|
|
fake.Reply("ok"),
|
|
)
|
|
|
|
var pending []llm.Message
|
|
pending = append(pending, llm.UserText("SUPERVISOR: wrap it up"))
|
|
a := New(newModel(t, fp), "", WithToolbox(adderToolbox(t)))
|
|
_, err := a.Run(context.Background(), "go", WithSteer(func() []llm.Message {
|
|
out := pending
|
|
pending = nil
|
|
return out
|
|
}))
|
|
if err != nil {
|
|
t.Fatalf("Run: %v", err)
|
|
}
|
|
first := fp.Calls()[0].Request.Messages
|
|
if len(first) != 2 || !strings.Contains(first[1].Text(), "SUPERVISOR") {
|
|
t.Errorf("first call messages = %+v, want steered message", first)
|
|
}
|
|
// Drained: second call must not duplicate it.
|
|
second := fp.Calls()[1].Request.Messages
|
|
count := 0
|
|
for _, m := range second {
|
|
if strings.Contains(m.Text(), "SUPERVISOR") {
|
|
count++
|
|
}
|
|
}
|
|
if count != 1 {
|
|
t.Errorf("steer message appears %d times in second call, want 1", count)
|
|
}
|
|
}
|
|
|
|
// TestCompactorShrinksOutboundContext: the model sees the compacted view;
|
|
// the canonical transcript keeps everything.
|
|
func TestCompactorShrinksOutboundContext(t *testing.T) {
|
|
fp := fake.New("fp")
|
|
fp.Enqueue("test-model", fake.Reply("answer"))
|
|
|
|
history := []llm.Message{
|
|
llm.UserText("old 1"), llm.AssistantText("old reply 1"),
|
|
llm.UserText("old 2"), llm.AssistantText("old reply 2"),
|
|
}
|
|
a := New(newModel(t, fp), "", WithCompactor(func(_ context.Context, msgs []llm.Message) ([]llm.Message, error) {
|
|
// Keep only the last message, prefixed by a synthetic summary.
|
|
return append([]llm.Message{llm.UserText("[summary of earlier conversation]")}, msgs[len(msgs)-1]), nil
|
|
}))
|
|
res, err := a.Run(context.Background(), "new question", WithHistory(history))
|
|
if err != nil {
|
|
t.Fatalf("Run: %v", err)
|
|
}
|
|
sent := fp.Calls()[0].Request.Messages
|
|
if len(sent) != 2 || !strings.Contains(sent[0].Text(), "summary") {
|
|
t.Errorf("sent = %+v, want compacted view", sent)
|
|
}
|
|
if len(res.Messages) != 6 {
|
|
t.Errorf("transcript = %d messages, want full uncompacted history", len(res.Messages))
|
|
}
|
|
}
|
|
|
|
// TestCompactorErrorIsNonFatal: a failing compactor falls back to the
|
|
// original messages.
|
|
func TestCompactorErrorIsNonFatal(t *testing.T) {
|
|
fp := fake.New("fp")
|
|
fp.Enqueue("test-model", fake.Reply("fine"))
|
|
|
|
a := New(newModel(t, fp), "", WithCompactor(func(context.Context, []llm.Message) ([]llm.Message, error) {
|
|
return nil, errors.New("summarizer down")
|
|
}))
|
|
res, err := a.Run(context.Background(), "go")
|
|
if err != nil || res.Output != "fine" {
|
|
t.Errorf("res=%v err=%v", res, err)
|
|
}
|
|
if len(fp.Calls()[0].Request.Messages) != 1 {
|
|
t.Error("original messages must be sent when compaction fails")
|
|
}
|
|
}
|
|
|
|
// TestConsecutiveToolErrorGuard: steps whose tools ALL fail trip the guard.
|
|
func TestConsecutiveToolErrorGuard(t *testing.T) {
|
|
fp := fake.New("fp", fake.WithDefault(func(string, llm.Request) fake.Step {
|
|
return toolCallReply("c", "bomb", `{}`)
|
|
}))
|
|
bomb := llm.NewToolbox("danger", llm.Tool{
|
|
Name: "bomb",
|
|
Handler: func(context.Context, json.RawMessage) (any, error) { return nil, errors.New("always fails") },
|
|
})
|
|
|
|
a := New(newModel(t, fp), "", WithToolbox(bomb), WithToolErrorLimits(2, 0), WithMaxSteps(10))
|
|
res, err := a.Run(context.Background(), "go")
|
|
if !errors.Is(err, ErrToolLoop) {
|
|
t.Fatalf("err = %v, want ErrToolLoop", err)
|
|
}
|
|
if len(res.Steps) != 2 {
|
|
t.Errorf("steps = %d, want guard to trip after 2", len(res.Steps))
|
|
}
|
|
}
|
|
|
|
// TestSameCallRepeatGuard: identical (name+args) calls beyond the limit
|
|
// trip the guard; varied calls do not.
|
|
func TestSameCallRepeatGuard(t *testing.T) {
|
|
fp := fake.New("fp", fake.WithDefault(func(string, llm.Request) fake.Step {
|
|
return toolCallReply("c", "add", `{"a":1,"b":1}`)
|
|
}))
|
|
|
|
a := New(newModel(t, fp), "", WithToolbox(adderToolbox(t)), WithToolErrorLimits(0, 3), WithMaxSteps(10))
|
|
_, err := a.Run(context.Background(), "go")
|
|
if !errors.Is(err, ErrToolLoop) || !strings.Contains(err.Error(), `"add"`) {
|
|
t.Fatalf("err = %v, want repeat-guard ErrToolLoop naming add", err)
|
|
}
|
|
|
|
// Varied arguments never trip it.
|
|
n := 0
|
|
fp2 := fake.New("fp", fake.WithDefault(func(string, llm.Request) fake.Step {
|
|
n++
|
|
if n > 4 {
|
|
return fake.Reply("done")
|
|
}
|
|
return toolCallReply("c", "add", `{"a":1,"b":`+string(rune('0'+n))+`}`)
|
|
}))
|
|
a2 := New(newModel(t, fp2), "", WithToolbox(adderToolbox(t)), WithToolErrorLimits(0, 3), WithMaxSteps(10))
|
|
if _, err := a2.Run(context.Background(), "go"); err != nil {
|
|
t.Errorf("varied calls must not trip the guard: %v", err)
|
|
}
|
|
}
|