diff --git a/agent/agent.go b/agent/agent.go index 05519ee..fdcc27a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -313,8 +313,10 @@ func (a *Agent) Run(ctx context.Context, input string, opts ...RunOption) (*Resu step := Step{Index: stepIdx, Response: resp} if len(resp.ToolCalls) == 0 { - // Final answer. - result.Output = resp.Text() + // Final answer. Usually this terminal turn's text; but if the model + // front-loaded its answer into an earlier tool-call turn and closed + // with a trivial pointer, recover that earlier content instead. + result.Output = finalOutput(msgs, resp.Text()) result.Steps = append(result.Steps, step) result.Messages = msgs a.notify(rc, step) diff --git a/agent/finalize.go b/agent/finalize.go new file mode 100644 index 0000000..583cc4f --- /dev/null +++ b/agent/finalize.go @@ -0,0 +1,98 @@ +package agent + +import ( + "regexp" + "strings" + + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" +) + +// finalOutput selects the user-facing answer when the loop reaches a clean +// terminal turn (one with no tool calls). +// +// Normally that terminal turn's text IS the answer: well-behaved models defer +// their answer to the final, tool-free turn. But some models — notably several +// open-weight ones — "front-load" their full answer into an earlier turn that +// ALSO calls a tool (e.g. answer text alongside a citation call), then close +// with a trivial pointer such as "(Already answered above.)". Returning only +// the terminal text would discard the real answer, which is still present +// earlier in the transcript. When the terminal text is weak (empty, or a short +// back-reference) fall back to the last substantive assistant content in msgs. +// +// msgs must already include the terminal assistant message as its last element +// (the loop appends it before calling this); terminal is that message's text. +func finalOutput(msgs []llm.Message, terminal string) string { + if !isWeakFinal(terminal) { + return terminal + } + if rec, ok := lastSubstantiveAssistantText(msgs, terminal); ok { + return rec + } + return terminal +} + +// backRefRe matches a terminal turn that merely points back to an earlier +// message instead of stating the answer ("(Already answered above.)", +// "see above", "as I said", ...). +var backRefRe = regexp.MustCompile(`(?i)(already answered|see above|as (i )?(said|mentioned|stated|noted)|answered (that )?above|per my (previous|earlier))`) + +// preambleRe matches intent-announcing prefixes ("Let me search...", "I'll +// check...") so a preamble is never mistaken for the answer during recovery. +var preambleRe = regexp.MustCompile(`(?i)^(let me|let'?s|i'?ll|i will|first[, ]|sure[,. ]|okay[,. ]|on it|checking)`) + +const ( + // weakFinalMaxChars bounds how long a back-reference closer can be. A + // genuine final answer that merely contains "as I said" mid-sentence is + // longer than this, so it is never treated as weak. + weakFinalMaxChars = 120 + // recoverMinChars: a prior assistant turn this long is treated as a real + // answer regardless of how it opens. + recoverMinChars = 200 + // recoverFloorChars / recoverRatio gate the borderline band: a shorter + // prior turn must still clearly dwarf the (very short) terminal and not + // look like a preamble. + recoverFloorChars = 80 + recoverRatio = 3 +) + +// isWeakFinal reports whether a terminal turn's text fails to stand on its own +// as the answer: empty/whitespace, or a short pure back-reference. +func isWeakFinal(s string) bool { + t := strings.TrimSpace(s) + if t == "" { + return true + } + return len(t) <= weakFinalMaxChars && backRefRe.MatchString(t) +} + +// lastSubstantiveAssistantText scans msgs newest→oldest (skipping the terminal +// turn and empty tool-only turns) for the most recent assistant turn whose text +// reads like a real answer. Returns ("", false) when nothing qualifies. +func lastSubstantiveAssistantText(msgs []llm.Message, terminal string) (string, bool) { + tt := strings.TrimSpace(terminal) + for i := len(msgs) - 1; i >= 0; i-- { + m := msgs[i] + if m.Role != llm.RoleAssistant { + continue + } + txt := strings.TrimSpace(m.Text()) + if txt == "" || txt == tt { + continue // the terminal turn itself, or an empty tool-only turn + } + if isSubstantiveAnswer(txt, tt) { + return txt, true + } + } + return "", false +} + +// isSubstantiveAnswer reports whether txt (a prior assistant turn) reads like a +// real answer rather than a preamble, relative to the terminal text. +func isSubstantiveAnswer(txt, terminal string) bool { + if len(txt) >= recoverMinChars { + return true + } + return len(txt) >= recoverFloorChars && + len(txt) >= recoverRatio*len(terminal) && + !preambleRe.MatchString(txt) +} diff --git a/agent/finalize_test.go b/agent/finalize_test.go new file mode 100644 index 0000000..1a1e367 --- /dev/null +++ b/agent/finalize_test.go @@ -0,0 +1,181 @@ +package agent + +import ( + "context" + "encoding/json" + "strings" + "testing" + + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake" +) + +func TestIsWeakFinal(t *testing.T) { + long := strings.Repeat("As I said, this is the full answer. ", 6) // >120, contains "as i said" + cases := []struct { + name string + in string + want bool + }{ + {"empty", "", true}, + {"whitespace", " \n\t ", true}, + {"already-answered", "(Already answered above.)", true}, + {"see-above", "see above", true}, + {"as-i-said-short", "As I said, it's 60 minutes.", true}, + {"crisp-number", "42", false}, + {"crisp-yes", "Yes.", false}, + {"crisp-status", "It's down, restarting now.", false}, + {"long-with-as-i-said", long, false}, // >120 chars: not weak despite the phrase + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := isWeakFinal(c.in); got != c.want { + t.Errorf("isWeakFinal(%q) = %v, want %v", c.in, got, c.want) + } + }) + } +} + +func asst(text string, tools ...llm.ToolCall) llm.Message { + m := llm.Message{Role: llm.RoleAssistant} + if text != "" { + m.Parts = []llm.Part{llm.Text(text)} + } + m.ToolCalls = tools + return m +} + +func TestFinalOutput(t *testing.T) { + cite := []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}} + longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6)) // >200 + + tests := []struct { + name string + msgs []llm.Message + terminal string + want string + }{ + { + name: "front-loaded answer recovered over back-ref closer", + msgs: []llm.Message{ + llm.UserText("q?"), + asst(longAnswer, cite...), + llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}), + asst("(Already answered above.)"), + }, + terminal: "(Already answered above.)", + want: longAnswer, + }, + { + name: "empty terminal recovers prior substantive answer", + msgs: []llm.Message{ + llm.UserText("q?"), + asst(longAnswer, cite...), + llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}), + asst(""), + }, + terminal: "", + want: longAnswer, + }, + { + name: "healthy terminal answer is unchanged", + msgs: []llm.Message{ + llm.UserText("q?"), + asst("Let me check.", cite...), + llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}), + asst(longAnswer), + }, + terminal: longAnswer, + want: longAnswer, + }, + { + name: "short crisp answer not overridden by a short preamble prior", + msgs: []llm.Message{ + llm.UserText("is it up?"), + asst("Let me check the server status.", cite...), + llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}), + asst("It's down, restarting now."), + }, + terminal: "It's down, restarting now.", // not weak → returned as-is + want: "It's down, restarting now.", + }, + { + name: "weak terminal but only a preamble prior: no recovery", + msgs: []llm.Message{ + llm.UserText("q?"), + asst("Let me look that up for you.", cite...), + llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}), + asst("(see above)"), + }, + terminal: "(see above)", + want: "(see above)", // preamble excluded; falls back to terminal + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if got := finalOutput(tc.msgs, tc.terminal); got != tc.want { + t.Errorf("finalOutput = %q, want %q", got, tc.want) + } + }) + } +} + +func citeToolbox(t *testing.T) *llm.Toolbox { + t.Helper() + return llm.NewToolbox("sources", llm.Tool{ + Name: "cite", + Description: "Record a citation.", + Parameters: json.RawMessage(`{"type":"object","properties":{}}`), + Handler: func(_ context.Context, _ json.RawMessage) (any, error) { + return map[string]bool{"ok": true}, nil + }, + }) +} + +// TestRun_RecoversFrontLoadedAnswer reproduces the glm-5.2 shape end-to-end: a +// turn carrying the full answer text AND a tool call, then a degenerate +// terminal turn. The recovered answer must be delivered with no extra model +// call (zero-cost recovery from the transcript). +func TestRun_RecoversFrontLoadedAnswer(t *testing.T) { + longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6)) + fp := fake.New("fp") + fp.Enqueue("test-model", + fake.ReplyWith(llm.Response{ + Parts: []llm.Part{llm.Text(longAnswer)}, + ToolCalls: []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}}, + FinishReason: llm.FinishToolCalls, + Usage: llm.Usage{InputTokens: 10, OutputTokens: 5}, + }), + fake.Reply("(Already answered above.)"), + ) + + a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t))) + res, err := a.Run(context.Background(), "is there a meet time limit?") + if err != nil { + t.Fatalf("Run: %v", err) + } + if res.Output != longAnswer { + t.Errorf("Output = %q, want recovered front-loaded answer", res.Output) + } + if n := len(fp.Calls()); n != 2 { + t.Errorf("model calls = %d, want 2 (no extra nudge turn)", n) + } +} + +// TestRun_HealthyTerminalUnchanged guards against regressing the normal case: +// a deferred answer in the terminal turn is delivered verbatim. +func TestRun_HealthyTerminalUnchanged(t *testing.T) { + fp := fake.New("fp") + fp.Enqueue("test-model", + toolCallReply("c1", "cite", `{}`), + fake.Reply("The limit is 60 minutes for free group calls."), + ) + a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t))) + res, err := a.Run(context.Background(), "q?") + if err != nil { + t.Fatalf("Run: %v", err) + } + if res.Output != "The limit is 60 minutes for free group calls." { + t.Errorf("Output = %q, want terminal answer unchanged", res.Output) + } +}