fix(agent): recover front-loaded answer when terminal turn is degenerate

The agent loop took the final answer only from the terminal (no-tool-call) turn. Models that "front-load" their answer into an earlier turn that also calls a tool — then close with a trivial pointer like "(Already answered above.)" — had their real answer discarded and the pointer delivered. This recurs across several open-weight models (glm-5.2, etc.); well-behaved models (Claude/GPT) defer their answer to the terminal turn and are unaffected. finalOutput() now falls back to the last substantive assistant content in the transcript when the terminal text is weak (empty, or a short back-reference). The predicate is narrow and back-reference-gated so short-but-correct answers ("42", "It's down, restarting now.") are never overridden; recovery only picks a prior turn that reads like a real answer, not a preamble. Zero extra model calls. Terminal-answer behavior for normal runs is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 18:37:38 -04:00
parent 74474c6da0
commit 1fd7109a42
3 changed files with 283 additions and 2 deletions
@@ -313,8 +313,10 @@ func (a *Agent) Run(ctx context.Context, input string, opts ...RunOption) (*Resu
 		step := Step{Index: stepIdx, Response: resp}

 		if len(resp.ToolCalls) == 0 {
-			// Final answer.
-			result.Output = resp.Text()
+			// Final answer. Usually this terminal turn's text; but if the model
+			// front-loaded its answer into an earlier tool-call turn and closed
+			// with a trivial pointer, recover that earlier content instead.
+			result.Output = finalOutput(msgs, resp.Text())
 			result.Steps = append(result.Steps, step)
 			result.Messages = msgs
 			a.notify(rc, step)
@@ -0,0 +1,98 @@
+package agent
+
+import (
+	"regexp"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+// finalOutput selects the user-facing answer when the loop reaches a clean
+// terminal turn (one with no tool calls).
+//
+// Normally that terminal turn's text IS the answer: well-behaved models defer
+// their answer to the final, tool-free turn. But some models — notably several
+// open-weight ones — "front-load" their full answer into an earlier turn that
+// ALSO calls a tool (e.g. answer text alongside a citation call), then close
+// with a trivial pointer such as "(Already answered above.)". Returning only
+// the terminal text would discard the real answer, which is still present
+// earlier in the transcript. When the terminal text is weak (empty, or a short
+// back-reference) fall back to the last substantive assistant content in msgs.
+//
+// msgs must already include the terminal assistant message as its last element
+// (the loop appends it before calling this); terminal is that message's text.
+func finalOutput(msgs []llm.Message, terminal string) string {
+	if !isWeakFinal(terminal) {
+		return terminal
+	}
+	if rec, ok := lastSubstantiveAssistantText(msgs, terminal); ok {
+		return rec
+	}
+	return terminal
+}
+
+// backRefRe matches a terminal turn that merely points back to an earlier
+// message instead of stating the answer ("(Already answered above.)",
+// "see above", "as I said", ...).
+var backRefRe = regexp.MustCompile(`(?i)(already answered|see above|as (i )?(said|mentioned|stated|noted)|answered (that )?above|per my (previous|earlier))`)
+
+// preambleRe matches intent-announcing prefixes ("Let me search...", "I'll
+// check...") so a preamble is never mistaken for the answer during recovery.
+var preambleRe = regexp.MustCompile(`(?i)^(let me|let'?s|i'?ll|i will|first[, ]|sure[,. ]|okay[,. ]|on it|checking)`)
+
+const (
+	// weakFinalMaxChars bounds how long a back-reference closer can be. A
+	// genuine final answer that merely contains "as I said" mid-sentence is
+	// longer than this, so it is never treated as weak.
+	weakFinalMaxChars = 120
+	// recoverMinChars: a prior assistant turn this long is treated as a real
+	// answer regardless of how it opens.
+	recoverMinChars = 200
+	// recoverFloorChars / recoverRatio gate the borderline band: a shorter
+	// prior turn must still clearly dwarf the (very short) terminal and not
+	// look like a preamble.
+	recoverFloorChars = 80
+	recoverRatio      = 3
+)
+
+// isWeakFinal reports whether a terminal turn's text fails to stand on its own
+// as the answer: empty/whitespace, or a short pure back-reference.
+func isWeakFinal(s string) bool {
+	t := strings.TrimSpace(s)
+	if t == "" {
+		return true
+	}
+	return len(t) <= weakFinalMaxChars && backRefRe.MatchString(t)
+}
+
+// lastSubstantiveAssistantText scans msgs newest→oldest (skipping the terminal
+// turn and empty tool-only turns) for the most recent assistant turn whose text
+// reads like a real answer. Returns ("", false) when nothing qualifies.
+func lastSubstantiveAssistantText(msgs []llm.Message, terminal string) (string, bool) {
+	tt := strings.TrimSpace(terminal)
+	for i := len(msgs) - 1; i >= 0; i-- {
+		m := msgs[i]
+		if m.Role != llm.RoleAssistant {
+			continue
+		}
+		txt := strings.TrimSpace(m.Text())
+		if txt == "" || txt == tt {
+			continue // the terminal turn itself, or an empty tool-only turn
+		}
+		if isSubstantiveAnswer(txt, tt) {
+			return txt, true
+		}
+	}
+	return "", false
+}
+
+// isSubstantiveAnswer reports whether txt (a prior assistant turn) reads like a
+// real answer rather than a preamble, relative to the terminal text.
+func isSubstantiveAnswer(txt, terminal string) bool {
+	if len(txt) >= recoverMinChars {
+		return true
+	}
+	return len(txt) >= recoverFloorChars &&
+		len(txt) >= recoverRatio*len(terminal) &&
+		!preambleRe.MatchString(txt)
+}
@@ -0,0 +1,181 @@
+package agent
+
+import (
+	"context"
+	"encoding/json"
+	"strings"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+)
+
+func TestIsWeakFinal(t *testing.T) {
+	long := strings.Repeat("As I said, this is the full answer. ", 6) // >120, contains "as i said"
+	cases := []struct {
+		name string
+		in   string
+		want bool
+	}{
+		{"empty", "", true},
+		{"whitespace", "   \n\t ", true},
+		{"already-answered", "(Already answered above.)", true},
+		{"see-above", "see above", true},
+		{"as-i-said-short", "As I said, it's 60 minutes.", true},
+		{"crisp-number", "42", false},
+		{"crisp-yes", "Yes.", false},
+		{"crisp-status", "It's down, restarting now.", false},
+		{"long-with-as-i-said", long, false}, // >120 chars: not weak despite the phrase
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := isWeakFinal(c.in); got != c.want {
+				t.Errorf("isWeakFinal(%q) = %v, want %v", c.in, got, c.want)
+			}
+		})
+	}
+}
+
+func asst(text string, tools ...llm.ToolCall) llm.Message {
+	m := llm.Message{Role: llm.RoleAssistant}
+	if text != "" {
+		m.Parts = []llm.Part{llm.Text(text)}
+	}
+	m.ToolCalls = tools
+	return m
+}
+
+func TestFinalOutput(t *testing.T) {
+	cite := []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}}
+	longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6)) // >200
+
+	tests := []struct {
+		name     string
+		msgs     []llm.Message
+		terminal string
+		want     string
+	}{
+		{
+			name: "front-loaded answer recovered over back-ref closer",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst(longAnswer, cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst("(Already answered above.)"),
+			},
+			terminal: "(Already answered above.)",
+			want:     longAnswer,
+		},
+		{
+			name: "empty terminal recovers prior substantive answer",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst(longAnswer, cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst(""),
+			},
+			terminal: "",
+			want:     longAnswer,
+		},
+		{
+			name: "healthy terminal answer is unchanged",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst("Let me check.", cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst(longAnswer),
+			},
+			terminal: longAnswer,
+			want:     longAnswer,
+		},
+		{
+			name: "short crisp answer not overridden by a short preamble prior",
+			msgs: []llm.Message{
+				llm.UserText("is it up?"),
+				asst("Let me check the server status.", cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst("It's down, restarting now."),
+			},
+			terminal: "It's down, restarting now.", // not weak → returned as-is
+			want:     "It's down, restarting now.",
+		},
+		{
+			name: "weak terminal but only a preamble prior: no recovery",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst("Let me look that up for you.", cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst("(see above)"),
+			},
+			terminal: "(see above)",
+			want:     "(see above)", // preamble excluded; falls back to terminal
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := finalOutput(tc.msgs, tc.terminal); got != tc.want {
+				t.Errorf("finalOutput = %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
+
+func citeToolbox(t *testing.T) *llm.Toolbox {
+	t.Helper()
+	return llm.NewToolbox("sources", llm.Tool{
+		Name:        "cite",
+		Description: "Record a citation.",
+		Parameters:  json.RawMessage(`{"type":"object","properties":{}}`),
+		Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
+			return map[string]bool{"ok": true}, nil
+		},
+	})
+}
+
+// TestRun_RecoversFrontLoadedAnswer reproduces the glm-5.2 shape end-to-end: a
+// turn carrying the full answer text AND a tool call, then a degenerate
+// terminal turn. The recovered answer must be delivered with no extra model
+// call (zero-cost recovery from the transcript).
+func TestRun_RecoversFrontLoadedAnswer(t *testing.T) {
+	longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6))
+	fp := fake.New("fp")
+	fp.Enqueue("test-model",
+		fake.ReplyWith(llm.Response{
+			Parts:        []llm.Part{llm.Text(longAnswer)},
+			ToolCalls:    []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}},
+			FinishReason: llm.FinishToolCalls,
+			Usage:        llm.Usage{InputTokens: 10, OutputTokens: 5},
+		}),
+		fake.Reply("(Already answered above.)"),
+	)
+
+	a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
+	res, err := a.Run(context.Background(), "is there a meet time limit?")
+	if err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+	if res.Output != longAnswer {
+		t.Errorf("Output = %q, want recovered front-loaded answer", res.Output)
+	}
+	if n := len(fp.Calls()); n != 2 {
+		t.Errorf("model calls = %d, want 2 (no extra nudge turn)", n)
+	}
+}
+
+// TestRun_HealthyTerminalUnchanged guards against regressing the normal case:
+// a deferred answer in the terminal turn is delivered verbatim.
+func TestRun_HealthyTerminalUnchanged(t *testing.T) {
+	fp := fake.New("fp")
+	fp.Enqueue("test-model",
+		toolCallReply("c1", "cite", `{}`),
+		fake.Reply("The limit is 60 minutes for free group calls."),
+	)
+	a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
+	res, err := a.Run(context.Background(), "q?")
+	if err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+	if res.Output != "The limit is 60 minutes for free group calls." {
+		t.Errorf("Output = %q, want terminal answer unchanged", res.Output)
+	}
+}