Files
majordomo/agent/finalize_test.go
steve 1fd7109a42
CI / Tidy (pull_request) Successful in 9m31s
CI / Build & Test (pull_request) Successful in 10m14s
CI / Tidy (push) Successful in 9m26s
CI / Build & Test (push) Successful in 10m19s
fix(agent): recover front-loaded answer when terminal turn is degenerate
The agent loop took the final answer only from the terminal (no-tool-call)
turn. Models that "front-load" their answer into an earlier turn that also
calls a tool — then close with a trivial pointer like "(Already answered
above.)" — had their real answer discarded and the pointer delivered. This
recurs across several open-weight models (glm-5.2, etc.); well-behaved models
(Claude/GPT) defer their answer to the terminal turn and are unaffected.

finalOutput() now falls back to the last substantive assistant content in the
transcript when the terminal text is weak (empty, or a short back-reference).
The predicate is narrow and back-reference-gated so short-but-correct answers
("42", "It's down, restarting now.") are never overridden; recovery only picks
a prior turn that reads like a real answer, not a preamble. Zero extra model
calls. Terminal-answer behavior for normal runs is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 18:37:38 -04:00

182 lines
5.6 KiB
Go

package agent
import (
"context"
"encoding/json"
"strings"
"testing"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
)
// TestIsWeakFinal checks isWeakFinal against blank input, short back-reference
// closers, terse but genuine answers, and a long reply that merely contains a
// back-reference phrase.
func TestIsWeakFinal(t *testing.T) {
	// Six repetitions of a 36-byte sentence (216 bytes), each opening with
	// "As I said": longer than 120 characters while still carrying the phrase.
	verbose := strings.Repeat("As I said, this is the full answer. ", 6)

	type scenario struct {
		label string
		text  string
		weak  bool
	}
	scenarios := []scenario{
		// Nothing but whitespace, or nothing at all.
		{label: "empty", text: "", weak: true},
		{label: "whitespace", text: " \n\t ", weak: true},
		// Short closers that only point back at an earlier turn.
		{label: "already-answered", text: "(Already answered above.)", weak: true},
		{label: "see-above", text: "see above", weak: true},
		{label: "as-i-said-short", text: "As I said, it's 60 minutes.", weak: true},
		// Short answers that stand on their own must not be flagged.
		{label: "crisp-number", text: "42", weak: false},
		{label: "crisp-yes", text: "Yes.", weak: false},
		{label: "crisp-status", text: "It's down, restarting now.", weak: false},
		// Over 120 characters: expected not weak despite the phrase.
		{label: "long-with-as-i-said", text: verbose, weak: false},
	}

	for i := range scenarios {
		sc := scenarios[i]
		t.Run(sc.label, func(t *testing.T) {
			got := isWeakFinal(sc.text)
			if got == sc.weak {
				return
			}
			t.Errorf("isWeakFinal(%q) = %v, want %v", sc.text, got, sc.weak)
		})
	}
}
// asst builds an assistant-role message for test transcripts. A non-empty text
// becomes the message's single text part; an empty text leaves Parts unset.
// Any tool calls passed in are attached to the message as given.
func asst(text string, tools ...llm.ToolCall) llm.Message {
	msg := llm.Message{
		Role:      llm.RoleAssistant,
		ToolCalls: tools,
	}
	if text == "" {
		return msg
	}
	msg.Parts = []llm.Part{llm.Text(text)}
	return msg
}
// TestFinalOutput runs finalOutput over a table of transcripts that all share
// one shape: a user question, an assistant turn that also calls the cite tool,
// the tool result, and a closing assistant turn. In every case the terminal
// text handed to finalOutput is the text of that closing turn.
func TestFinalOutput(t *testing.T) {
	// 275 bytes after trimming, i.e. comfortably more than 200.
	fullAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6))
	citeCalls := []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}}

	// transcript assembles the four-message conversation used by each case.
	transcript := func(question, toolTurn, closer string) []llm.Message {
		return []llm.Message{
			llm.UserText(question),
			asst(toolTurn, citeCalls...),
			llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
			asst(closer),
		}
	}

	table := []struct {
		name     string
		question string
		toolTurn string // assistant text sent alongside the tool call
		closer   string // closing assistant text, also passed as the terminal text
		want     string
	}{
		{
			name:     "front-loaded answer recovered over back-ref closer",
			question: "q?",
			toolTurn: fullAnswer,
			closer:   "(Already answered above.)",
			want:     fullAnswer,
		},
		{
			name:     "empty terminal recovers prior substantive answer",
			question: "q?",
			toolTurn: fullAnswer,
			closer:   "",
			want:     fullAnswer,
		},
		{
			name:     "healthy terminal answer is unchanged",
			question: "q?",
			toolTurn: "Let me check.",
			closer:   fullAnswer,
			want:     fullAnswer,
		},
		{
			name:     "short crisp answer not overridden by a short preamble prior",
			question: "is it up?",
			toolTurn: "Let me check the server status.",
			closer:   "It's down, restarting now.",
			// The closer is not weak, so it is expected back as-is.
			want: "It's down, restarting now.",
		},
		{
			name:     "weak terminal but only a preamble prior: no recovery",
			question: "q?",
			toolTurn: "Let me look that up for you.",
			closer:   "(see above)",
			// The preamble is excluded, so the terminal text is expected back.
			want: "(see above)",
		},
	}

	for i := range table {
		row := table[i]
		t.Run(row.name, func(t *testing.T) {
			got := finalOutput(transcript(row.question, row.toolTurn, row.closer), row.closer)
			if got == row.want {
				return
			}
			t.Errorf("finalOutput = %q, want %q", got, row.want)
		})
	}
}
// citeToolbox returns a toolbox named "sources" holding a single "cite" tool.
// The tool declares an empty object schema, and its handler ignores both the
// context and the arguments, always answering {"ok": true} with no error.
func citeToolbox(t *testing.T) *llm.Toolbox {
	t.Helper()

	acknowledge := func(context.Context, json.RawMessage) (any, error) {
		return map[string]bool{"ok": true}, nil
	}
	citeTool := llm.Tool{
		Name:        "cite",
		Description: "Record a citation.",
		Parameters:  json.RawMessage(`{"type":"object","properties":{}}`),
		Handler:     acknowledge,
	}
	return llm.NewToolbox("sources", citeTool)
}
// TestRun_RecoversFrontLoadedAnswer reproduces the glm-5.2 shape end-to-end: a
// turn carrying the full answer text AND a tool call, then a degenerate
// terminal turn. The recovered answer must be delivered with no extra model
// call (zero-cost recovery from the transcript).
func TestRun_RecoversFrontLoadedAnswer(t *testing.T) {
longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6))
fp := fake.New("fp")
fp.Enqueue("test-model",
fake.ReplyWith(llm.Response{
Parts: []llm.Part{llm.Text(longAnswer)},
ToolCalls: []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}},
FinishReason: llm.FinishToolCalls,
Usage: llm.Usage{InputTokens: 10, OutputTokens: 5},
}),
fake.Reply("(Already answered above.)"),
)
a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
res, err := a.Run(context.Background(), "is there a meet time limit?")
if err != nil {
t.Fatalf("Run: %v", err)
}
if res.Output != longAnswer {
t.Errorf("Output = %q, want recovered front-loaded answer", res.Output)
}
if n := len(fp.Calls()); n != 2 {
t.Errorf("model calls = %d, want 2 (no extra nudge turn)", n)
}
}
// TestRun_HealthyTerminalUnchanged covers the ordinary flow so it cannot
// regress: the first scripted reply is a cite tool call built by toolCallReply,
// and the answer arrives in the terminal turn. Run must return that terminal
// text verbatim.
func TestRun_HealthyTerminalUnchanged(t *testing.T) {
	const want = "The limit is 60 minutes for free group calls."

	provider := fake.New("fp")
	provider.Enqueue(
		"test-model",
		toolCallReply("c1", "cite", `{}`),
		fake.Reply(want),
	)

	ag := New(newModel(t, provider), "sys", WithToolbox(citeToolbox(t)))
	result, err := ag.Run(context.Background(), "q?")
	if err != nil {
		t.Fatalf("Run: %v", err)
	}

	if got := result.Output; got != want {
		t.Errorf("Output = %q, want terminal answer unchanged", got)
	}
}