fix(agent): recover front-loaded answer when terminal turn is degenerate #1

Merged
steve merged 1 commits from fix/agent-front-loaded-final-output into main 2026-06-26 22:48:10 +00:00
3 changed files with 283 additions and 2 deletions
+4 -2
View File
@@ -313,8 +313,10 @@ func (a *Agent) Run(ctx context.Context, input string, opts ...RunOption) (*Resu
step := Step{Index: stepIdx, Response: resp}
if len(resp.ToolCalls) == 0 {
// Final answer.
result.Output = resp.Text()
// Final answer. Usually this terminal turn's text; but if the model
// front-loaded its answer into an earlier tool-call turn and closed
// with a trivial pointer, recover that earlier content instead.
result.Output = finalOutput(msgs, resp.Text())
result.Steps = append(result.Steps, step)
result.Messages = msgs
a.notify(rc, step)
+98
View File
@@ -0,0 +1,98 @@
package agent
import (
"regexp"
"strings"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// finalOutput selects the user-facing answer when the loop reaches a clean
// terminal turn (one with no tool calls).
//
// Normally that terminal turn's text IS the answer: well-behaved models defer
// their answer to the final, tool-free turn. But some models — notably several
// open-weight ones — "front-load" their full answer into an earlier turn that
// ALSO calls a tool (e.g. answer text alongside a citation call), then close
// with a trivial pointer such as "(Already answered above.)". Returning only
// the terminal text would discard the real answer, which is still present
// earlier in the transcript. When the terminal text is weak (empty, or a short
// back-reference) fall back to the last substantive assistant content in msgs.
//
// msgs must already include the terminal assistant message as its last element
// (the loop appends it before calling this); terminal is that message's text.
func finalOutput(msgs []llm.Message, terminal string) string {
if !isWeakFinal(terminal) {
return terminal
}
if rec, ok := lastSubstantiveAssistantText(msgs, terminal); ok {
return rec
}
return terminal
}
// backRefRe matches a terminal turn that merely points back to an earlier
// message instead of stating the answer ("(Already answered above.)",
// "see above", "as I said", ...).
var backRefRe = regexp.MustCompile(`(?i)(already answered|see above|as (i )?(said|mentioned|stated|noted)|answered (that )?above|per my (previous|earlier))`)
// preambleRe matches intent-announcing prefixes ("Let me search...", "I'll
// check...") so a preamble is never mistaken for the answer during recovery.
var preambleRe = regexp.MustCompile(`(?i)^(let me|let'?s|i'?ll|i will|first[, ]|sure[,. ]|okay[,. ]|on it|checking)`)
const (
// weakFinalMaxChars bounds how long a back-reference closer can be. A
// genuine final answer that merely contains "as I said" mid-sentence is
// longer than this, so it is never treated as weak.
weakFinalMaxChars = 120
// recoverMinChars: a prior assistant turn this long is treated as a real
// answer regardless of how it opens.
recoverMinChars = 200
// recoverFloorChars / recoverRatio gate the borderline band: a shorter
// prior turn must still clearly dwarf the (very short) terminal and not
// look like a preamble.
recoverFloorChars = 80
recoverRatio = 3
)
// isWeakFinal reports whether a terminal turn's text fails to stand on its own
// as the answer: empty/whitespace, or a short pure back-reference.
func isWeakFinal(s string) bool {
t := strings.TrimSpace(s)
if t == "" {
return true
}
return len(t) <= weakFinalMaxChars && backRefRe.MatchString(t)
}
// lastSubstantiveAssistantText scans msgs newest→oldest (skipping the terminal
// turn and empty tool-only turns) for the most recent assistant turn whose text
// reads like a real answer. Returns ("", false) when nothing qualifies.
func lastSubstantiveAssistantText(msgs []llm.Message, terminal string) (string, bool) {
tt := strings.TrimSpace(terminal)
for i := len(msgs) - 1; i >= 0; i-- {
m := msgs[i]
if m.Role != llm.RoleAssistant {
continue
}
txt := strings.TrimSpace(m.Text())
if txt == "" || txt == tt {
continue // the terminal turn itself, or an empty tool-only turn
}
if isSubstantiveAnswer(txt, tt) {
return txt, true
}
}
return "", false
}
// isSubstantiveAnswer reports whether txt (a prior assistant turn) reads like a
// real answer rather than a preamble, relative to the terminal text.
func isSubstantiveAnswer(txt, terminal string) bool {
if len(txt) >= recoverMinChars {
return true
}
return len(txt) >= recoverFloorChars &&
len(txt) >= recoverRatio*len(terminal) &&
!preambleRe.MatchString(txt)
}
+181
View File
@@ -0,0 +1,181 @@
package agent
import (
"context"
"encoding/json"
"strings"
"testing"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
)
func TestIsWeakFinal(t *testing.T) {
long := strings.Repeat("As I said, this is the full answer. ", 6) // >120, contains "as i said"
cases := []struct {
name string
in string
want bool
}{
{"empty", "", true},
{"whitespace", " \n\t ", true},
{"already-answered", "(Already answered above.)", true},
{"see-above", "see above", true},
{"as-i-said-short", "As I said, it's 60 minutes.", true},
{"crisp-number", "42", false},
{"crisp-yes", "Yes.", false},
{"crisp-status", "It's down, restarting now.", false},
{"long-with-as-i-said", long, false}, // >120 chars: not weak despite the phrase
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
if got := isWeakFinal(c.in); got != c.want {
t.Errorf("isWeakFinal(%q) = %v, want %v", c.in, got, c.want)
}
})
}
}
func asst(text string, tools ...llm.ToolCall) llm.Message {
m := llm.Message{Role: llm.RoleAssistant}
if text != "" {
m.Parts = []llm.Part{llm.Text(text)}
}
m.ToolCalls = tools
return m
}
func TestFinalOutput(t *testing.T) {
cite := []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}}
longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6)) // >200
tests := []struct {
name string
msgs []llm.Message
terminal string
want string
}{
{
name: "front-loaded answer recovered over back-ref closer",
msgs: []llm.Message{
llm.UserText("q?"),
asst(longAnswer, cite...),
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
asst("(Already answered above.)"),
},
terminal: "(Already answered above.)",
want: longAnswer,
},
{
name: "empty terminal recovers prior substantive answer",
msgs: []llm.Message{
llm.UserText("q?"),
asst(longAnswer, cite...),
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
asst(""),
},
terminal: "",
want: longAnswer,
},
{
name: "healthy terminal answer is unchanged",
msgs: []llm.Message{
llm.UserText("q?"),
asst("Let me check.", cite...),
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
asst(longAnswer),
},
terminal: longAnswer,
want: longAnswer,
},
{
name: "short crisp answer not overridden by a short preamble prior",
msgs: []llm.Message{
llm.UserText("is it up?"),
asst("Let me check the server status.", cite...),
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
asst("It's down, restarting now."),
},
terminal: "It's down, restarting now.", // not weak → returned as-is
want: "It's down, restarting now.",
},
{
name: "weak terminal but only a preamble prior: no recovery",
msgs: []llm.Message{
llm.UserText("q?"),
asst("Let me look that up for you.", cite...),
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
asst("(see above)"),
},
terminal: "(see above)",
want: "(see above)", // preamble excluded; falls back to terminal
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
if got := finalOutput(tc.msgs, tc.terminal); got != tc.want {
t.Errorf("finalOutput = %q, want %q", got, tc.want)
}
})
}
}
func citeToolbox(t *testing.T) *llm.Toolbox {
t.Helper()
return llm.NewToolbox("sources", llm.Tool{
Name: "cite",
Description: "Record a citation.",
Parameters: json.RawMessage(`{"type":"object","properties":{}}`),
Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
return map[string]bool{"ok": true}, nil
},
})
}
// TestRun_RecoversFrontLoadedAnswer reproduces the glm-5.2 shape end-to-end: a
// turn carrying the full answer text AND a tool call, then a degenerate
// terminal turn. The recovered answer must be delivered with no extra model
// call (zero-cost recovery from the transcript).
func TestRun_RecoversFrontLoadedAnswer(t *testing.T) {
longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6))
fp := fake.New("fp")
fp.Enqueue("test-model",
fake.ReplyWith(llm.Response{
Parts: []llm.Part{llm.Text(longAnswer)},
ToolCalls: []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}},
FinishReason: llm.FinishToolCalls,
Usage: llm.Usage{InputTokens: 10, OutputTokens: 5},
}),
fake.Reply("(Already answered above.)"),
)
a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
res, err := a.Run(context.Background(), "is there a meet time limit?")
if err != nil {
t.Fatalf("Run: %v", err)
}
if res.Output != longAnswer {
t.Errorf("Output = %q, want recovered front-loaded answer", res.Output)
}
if n := len(fp.Calls()); n != 2 {
t.Errorf("model calls = %d, want 2 (no extra nudge turn)", n)
}
}
// TestRun_HealthyTerminalUnchanged guards against regressing the normal case:
// a deferred answer in the terminal turn is delivered verbatim.
func TestRun_HealthyTerminalUnchanged(t *testing.T) {
fp := fake.New("fp")
fp.Enqueue("test-model",
toolCallReply("c1", "cite", `{}`),
fake.Reply("The limit is 60 minutes for free group calls."),
)
a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
res, err := a.Run(context.Background(), "q?")
if err != nil {
t.Fatalf("Run: %v", err)
}
if res.Output != "The limit is 60 minutes for free group calls." {
t.Errorf("Output = %q, want terminal answer unchanged", res.Output)
}
}