2026-06-26 22:48:10 +00:00
3 changed files with 283 additions and 2 deletions
@@ -313,8 +313,10 @@ func (a *Agent) Run(ctx context.Context, input string, opts ...RunOption) (*Resu
 		step := Step{Index: stepIdx, Response: resp}

 		if len(resp.ToolCalls) == 0 {
-			// Final answer.
-			result.Output = resp.Text()
+			// Final answer. Usually this terminal turn's text; but if the model
+			// front-loaded its answer into an earlier tool-call turn and closed
+			// with a trivial pointer, recover that earlier content instead.
+			result.Output = finalOutput(msgs, resp.Text())
 			result.Steps = append(result.Steps, step)
 			result.Messages = msgs
 			a.notify(rc, step)
@@ -0,0 +1,98 @@
+package agent
+
+import (
+	"regexp"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+// finalOutput selects the user-facing answer when the loop reaches a clean
+// terminal turn (one with no tool calls).
+//
+// Normally that terminal turn's text IS the answer: well-behaved models defer
+// their answer to the final, tool-free turn. But some models — notably several
+// open-weight ones — "front-load" their full answer into an earlier turn that
+// ALSO calls a tool (e.g. answer text alongside a citation call), then close
+// with a trivial pointer such as "(Already answered above.)". Returning only
+// the terminal text would discard the real answer, which is still present
+// earlier in the transcript. When the terminal text is weak (empty, or a short
+// back-reference) fall back to the last substantive assistant content in msgs.
+//
+// msgs must already include the terminal assistant message as its last element
+// (the loop appends it before calling this); terminal is that message's text.
+func finalOutput(msgs []llm.Message, terminal string) string {
+	if !isWeakFinal(terminal) {
+		return terminal
+	}
+	if rec, ok := lastSubstantiveAssistantText(msgs, terminal); ok {
+		return rec
+	}
+	return terminal
+}
+
+// backRefRe matches a terminal turn that merely points back to an earlier
+// message instead of stating the answer ("(Already answered above.)",
+// "see above", "as I said", ...).
+var backRefRe = regexp.MustCompile(`(?i)(already answered|see above|as (i )?(said|mentioned|stated|noted)|answered (that )?above|per my (previous|earlier))`)
+
+// preambleRe matches intent-announcing prefixes ("Let me search...", "I'll
+// check...") so a preamble is never mistaken for the answer during recovery.
+var preambleRe = regexp.MustCompile(`(?i)^(let me|let'?s|i'?ll|i will|first[, ]|sure[,. ]|okay[,. ]|on it|checking)`)
+
+const (
+	// weakFinalMaxChars bounds how long a back-reference closer can be. A
+	// genuine final answer that merely contains "as I said" mid-sentence is
+	// longer than this, so it is never treated as weak.
+	weakFinalMaxChars = 120
+	// recoverMinChars: a prior assistant turn this long is treated as a real
+	// answer regardless of how it opens.
+	recoverMinChars = 200
+	// recoverFloorChars / recoverRatio gate the borderline band: a shorter
+	// prior turn must still clearly dwarf the (very short) terminal and not
+	// look like a preamble.
+	recoverFloorChars = 80
+	recoverRatio      = 3
+)
+
+// isWeakFinal reports whether a terminal turn's text fails to stand on its own
+// as the answer: empty/whitespace, or a short pure back-reference.
+func isWeakFinal(s string) bool {
+	t := strings.TrimSpace(s)
+	if t == "" {
+		return true
+	}
+	return len(t) <= weakFinalMaxChars && backRefRe.MatchString(t)
+}
+
+// lastSubstantiveAssistantText scans msgs newest→oldest (skipping the terminal
+// turn and empty tool-only turns) for the most recent assistant turn whose text
+// reads like a real answer. Returns ("", false) when nothing qualifies.
+func lastSubstantiveAssistantText(msgs []llm.Message, terminal string) (string, bool) {
+	tt := strings.TrimSpace(terminal)
+	for i := len(msgs) - 1; i >= 0; i-- {
+		m := msgs[i]
+		if m.Role != llm.RoleAssistant {
+			continue
+		}
+		txt := strings.TrimSpace(m.Text())
+		if txt == "" || txt == tt {
+			continue // the terminal turn itself, or an empty tool-only turn
+		}
+		if isSubstantiveAnswer(txt, tt) {
+			return txt, true
+		}
+	}
+	return "", false
+}
+
+// isSubstantiveAnswer reports whether txt (a prior assistant turn) reads like a
+// real answer rather than a preamble, relative to the terminal text.
+func isSubstantiveAnswer(txt, terminal string) bool {
+	if len(txt) >= recoverMinChars {
+		return true
+	}
+	return len(txt) >= recoverFloorChars &&
+		len(txt) >= recoverRatio*len(terminal) &&
+		!preambleRe.MatchString(txt)
+}
@@ -0,0 +1,181 @@
+package agent
+
+import (
+	"context"
+	"encoding/json"
+	"strings"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+)
+
+func TestIsWeakFinal(t *testing.T) {
+	long := strings.Repeat("As I said, this is the full answer. ", 6) // >120, contains "as i said"
+	cases := []struct {
+		name string
+		in   string
+		want bool
+	}{
+		{"empty", "", true},
+		{"whitespace", "   \n\t ", true},
+		{"already-answered", "(Already answered above.)", true},
+		{"see-above", "see above", true},
+		{"as-i-said-short", "As I said, it's 60 minutes.", true},
+		{"crisp-number", "42", false},
+		{"crisp-yes", "Yes.", false},
+		{"crisp-status", "It's down, restarting now.", false},
+		{"long-with-as-i-said", long, false}, // >120 chars: not weak despite the phrase
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := isWeakFinal(c.in); got != c.want {
+				t.Errorf("isWeakFinal(%q) = %v, want %v", c.in, got, c.want)
+			}
+		})
+	}
+}
+
+func asst(text string, tools ...llm.ToolCall) llm.Message {
+	m := llm.Message{Role: llm.RoleAssistant}
+	if text != "" {
+		m.Parts = []llm.Part{llm.Text(text)}
+	}
+	m.ToolCalls = tools
+	return m
+}
+
+func TestFinalOutput(t *testing.T) {
+	cite := []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}}
+	longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6)) // >200
+
+	tests := []struct {
+		name     string
+		msgs     []llm.Message
+		terminal string
+		want     string
+	}{
+		{
+			name: "front-loaded answer recovered over back-ref closer",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst(longAnswer, cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst("(Already answered above.)"),
+			},
+			terminal: "(Already answered above.)",
+			want:     longAnswer,
+		},
+		{
+			name: "empty terminal recovers prior substantive answer",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst(longAnswer, cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst(""),
+			},
+			terminal: "",
+			want:     longAnswer,
+		},
+		{
+			name: "healthy terminal answer is unchanged",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst("Let me check.", cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst(longAnswer),
+			},
+			terminal: longAnswer,
+			want:     longAnswer,
+		},
+		{
+			name: "short crisp answer not overridden by a short preamble prior",
+			msgs: []llm.Message{
+				llm.UserText("is it up?"),
+				asst("Let me check the server status.", cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst("It's down, restarting now."),
+			},
+			terminal: "It's down, restarting now.", // not weak → returned as-is
+			want:     "It's down, restarting now.",
+		},
+		{
+			name: "weak terminal but only a preamble prior: no recovery",
+			msgs: []llm.Message{
+				llm.UserText("q?"),
+				asst("Let me look that up for you.", cite...),
+				llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
+				asst("(see above)"),
+			},
+			terminal: "(see above)",
+			want:     "(see above)", // preamble excluded; falls back to terminal
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := finalOutput(tc.msgs, tc.terminal); got != tc.want {
+				t.Errorf("finalOutput = %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
+
+func citeToolbox(t *testing.T) *llm.Toolbox {
+	t.Helper()
+	return llm.NewToolbox("sources", llm.Tool{
+		Name:        "cite",
+		Description: "Record a citation.",
+		Parameters:  json.RawMessage(`{"type":"object","properties":{}}`),
+		Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
+			return map[string]bool{"ok": true}, nil
+		},
+	})
+}
+
+// TestRun_RecoversFrontLoadedAnswer reproduces the glm-5.2 shape end-to-end: a
+// turn carrying the full answer text AND a tool call, then a degenerate
+// terminal turn. The recovered answer must be delivered with no extra model
+// call (zero-cost recovery from the transcript).
+func TestRun_RecoversFrontLoadedAnswer(t *testing.T) {
+	longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6))
+	fp := fake.New("fp")
+	fp.Enqueue("test-model",
+		fake.ReplyWith(llm.Response{
+			Parts:        []llm.Part{llm.Text(longAnswer)},
+			ToolCalls:    []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}},
+			FinishReason: llm.FinishToolCalls,
+			Usage:        llm.Usage{InputTokens: 10, OutputTokens: 5},
+		}),
+		fake.Reply("(Already answered above.)"),
+	)
+
+	a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
+	res, err := a.Run(context.Background(), "is there a meet time limit?")
+	if err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+	if res.Output != longAnswer {
+		t.Errorf("Output = %q, want recovered front-loaded answer", res.Output)
+	}
+	if n := len(fp.Calls()); n != 2 {
+		t.Errorf("model calls = %d, want 2 (no extra nudge turn)", n)
+	}
+}
+
+// TestRun_HealthyTerminalUnchanged guards against regressing the normal case:
+// a deferred answer in the terminal turn is delivered verbatim.
+func TestRun_HealthyTerminalUnchanged(t *testing.T) {
+	fp := fake.New("fp")
+	fp.Enqueue("test-model",
+		toolCallReply("c1", "cite", `{}`),
+		fake.Reply("The limit is 60 minutes for free group calls."),
+	)
+	a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
+	res, err := a.Run(context.Background(), "q?")
+	if err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+	if res.Output != "The limit is 60 minutes for free group calls." {
+		t.Errorf("Output = %q, want terminal answer unchanged", res.Output)
+	}
+}