fix(agent): recover front-loaded answer when terminal turn is degenerate #1
+4
-2
@@ -313,8 +313,10 @@ func (a *Agent) Run(ctx context.Context, input string, opts ...RunOption) (*Resu
|
||||
step := Step{Index: stepIdx, Response: resp}
|
||||
|
||||
if len(resp.ToolCalls) == 0 {
|
||||
// Final answer.
|
||||
result.Output = resp.Text()
|
||||
// Final answer. Usually this terminal turn's text; but if the model
|
||||
// front-loaded its answer into an earlier tool-call turn and closed
|
||||
// with a trivial pointer, recover that earlier content instead.
|
||||
result.Output = finalOutput(msgs, resp.Text())
|
||||
result.Steps = append(result.Steps, step)
|
||||
result.Messages = msgs
|
||||
a.notify(rc, step)
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
)
|
||||
|
||||
// finalOutput selects the user-facing answer when the loop reaches a clean
|
||||
// terminal turn (one with no tool calls).
|
||||
//
|
||||
// Normally that terminal turn's text IS the answer: well-behaved models defer
|
||||
// their answer to the final, tool-free turn. But some models — notably several
|
||||
// open-weight ones — "front-load" their full answer into an earlier turn that
|
||||
// ALSO calls a tool (e.g. answer text alongside a citation call), then close
|
||||
// with a trivial pointer such as "(Already answered above.)". Returning only
|
||||
// the terminal text would discard the real answer, which is still present
|
||||
// earlier in the transcript. When the terminal text is weak (empty, or a short
|
||||
// back-reference) fall back to the last substantive assistant content in msgs.
|
||||
//
|
||||
// msgs must already include the terminal assistant message as its last element
|
||||
// (the loop appends it before calling this); terminal is that message's text.
|
||||
func finalOutput(msgs []llm.Message, terminal string) string {
|
||||
if !isWeakFinal(terminal) {
|
||||
return terminal
|
||||
}
|
||||
if rec, ok := lastSubstantiveAssistantText(msgs, terminal); ok {
|
||||
return rec
|
||||
}
|
||||
return terminal
|
||||
}
|
||||
|
||||
// backRefRe matches a terminal turn that merely points back to an earlier
|
||||
// message instead of stating the answer ("(Already answered above.)",
|
||||
// "see above", "as I said", ...).
|
||||
var backRefRe = regexp.MustCompile(`(?i)(already answered|see above|as (i )?(said|mentioned|stated|noted)|answered (that )?above|per my (previous|earlier))`)
|
||||
|
||||
// preambleRe matches intent-announcing prefixes ("Let me search...", "I'll
|
||||
// check...") so a preamble is never mistaken for the answer during recovery.
|
||||
var preambleRe = regexp.MustCompile(`(?i)^(let me|let'?s|i'?ll|i will|first[, ]|sure[,. ]|okay[,. ]|on it|checking)`)
|
||||
|
||||
const (
|
||||
// weakFinalMaxChars bounds how long a back-reference closer can be. A
|
||||
// genuine final answer that merely contains "as I said" mid-sentence is
|
||||
// longer than this, so it is never treated as weak.
|
||||
weakFinalMaxChars = 120
|
||||
// recoverMinChars: a prior assistant turn this long is treated as a real
|
||||
// answer regardless of how it opens.
|
||||
recoverMinChars = 200
|
||||
// recoverFloorChars / recoverRatio gate the borderline band: a shorter
|
||||
// prior turn must still clearly dwarf the (very short) terminal and not
|
||||
// look like a preamble.
|
||||
recoverFloorChars = 80
|
||||
recoverRatio = 3
|
||||
)
|
||||
|
||||
// isWeakFinal reports whether a terminal turn's text fails to stand on its own
|
||||
// as the answer: empty/whitespace, or a short pure back-reference.
|
||||
func isWeakFinal(s string) bool {
|
||||
t := strings.TrimSpace(s)
|
||||
if t == "" {
|
||||
return true
|
||||
}
|
||||
return len(t) <= weakFinalMaxChars && backRefRe.MatchString(t)
|
||||
}
|
||||
|
||||
// lastSubstantiveAssistantText scans msgs newest→oldest (skipping the terminal
|
||||
// turn and empty tool-only turns) for the most recent assistant turn whose text
|
||||
// reads like a real answer. Returns ("", false) when nothing qualifies.
|
||||
func lastSubstantiveAssistantText(msgs []llm.Message, terminal string) (string, bool) {
|
||||
tt := strings.TrimSpace(terminal)
|
||||
for i := len(msgs) - 1; i >= 0; i-- {
|
||||
m := msgs[i]
|
||||
if m.Role != llm.RoleAssistant {
|
||||
continue
|
||||
}
|
||||
txt := strings.TrimSpace(m.Text())
|
||||
if txt == "" || txt == tt {
|
||||
continue // the terminal turn itself, or an empty tool-only turn
|
||||
}
|
||||
if isSubstantiveAnswer(txt, tt) {
|
||||
return txt, true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// isSubstantiveAnswer reports whether txt (a prior assistant turn) reads like a
|
||||
// real answer rather than a preamble, relative to the terminal text.
|
||||
func isSubstantiveAnswer(txt, terminal string) bool {
|
||||
if len(txt) >= recoverMinChars {
|
||||
return true
|
||||
}
|
||||
return len(txt) >= recoverFloorChars &&
|
||||
len(txt) >= recoverRatio*len(terminal) &&
|
||||
!preambleRe.MatchString(txt)
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
||||
)
|
||||
|
||||
func TestIsWeakFinal(t *testing.T) {
|
||||
long := strings.Repeat("As I said, this is the full answer. ", 6) // >120, contains "as i said"
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
want bool
|
||||
}{
|
||||
{"empty", "", true},
|
||||
{"whitespace", " \n\t ", true},
|
||||
{"already-answered", "(Already answered above.)", true},
|
||||
{"see-above", "see above", true},
|
||||
{"as-i-said-short", "As I said, it's 60 minutes.", true},
|
||||
{"crisp-number", "42", false},
|
||||
{"crisp-yes", "Yes.", false},
|
||||
{"crisp-status", "It's down, restarting now.", false},
|
||||
{"long-with-as-i-said", long, false}, // >120 chars: not weak despite the phrase
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if got := isWeakFinal(c.in); got != c.want {
|
||||
t.Errorf("isWeakFinal(%q) = %v, want %v", c.in, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func asst(text string, tools ...llm.ToolCall) llm.Message {
|
||||
m := llm.Message{Role: llm.RoleAssistant}
|
||||
if text != "" {
|
||||
m.Parts = []llm.Part{llm.Text(text)}
|
||||
}
|
||||
m.ToolCalls = tools
|
||||
return m
|
||||
}
|
||||
|
||||
func TestFinalOutput(t *testing.T) {
|
||||
cite := []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}}
|
||||
longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6)) // >200
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
msgs []llm.Message
|
||||
terminal string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "front-loaded answer recovered over back-ref closer",
|
||||
msgs: []llm.Message{
|
||||
llm.UserText("q?"),
|
||||
asst(longAnswer, cite...),
|
||||
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
|
||||
asst("(Already answered above.)"),
|
||||
},
|
||||
terminal: "(Already answered above.)",
|
||||
want: longAnswer,
|
||||
},
|
||||
{
|
||||
name: "empty terminal recovers prior substantive answer",
|
||||
msgs: []llm.Message{
|
||||
llm.UserText("q?"),
|
||||
asst(longAnswer, cite...),
|
||||
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
|
||||
asst(""),
|
||||
},
|
||||
terminal: "",
|
||||
want: longAnswer,
|
||||
},
|
||||
{
|
||||
name: "healthy terminal answer is unchanged",
|
||||
msgs: []llm.Message{
|
||||
llm.UserText("q?"),
|
||||
asst("Let me check.", cite...),
|
||||
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
|
||||
asst(longAnswer),
|
||||
},
|
||||
terminal: longAnswer,
|
||||
want: longAnswer,
|
||||
},
|
||||
{
|
||||
name: "short crisp answer not overridden by a short preamble prior",
|
||||
msgs: []llm.Message{
|
||||
llm.UserText("is it up?"),
|
||||
asst("Let me check the server status.", cite...),
|
||||
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
|
||||
asst("It's down, restarting now."),
|
||||
},
|
||||
terminal: "It's down, restarting now.", // not weak → returned as-is
|
||||
want: "It's down, restarting now.",
|
||||
},
|
||||
{
|
||||
name: "weak terminal but only a preamble prior: no recovery",
|
||||
msgs: []llm.Message{
|
||||
llm.UserText("q?"),
|
||||
asst("Let me look that up for you.", cite...),
|
||||
llm.ToolResultsMessage(llm.ToolResult{ID: "c1", Name: "cite", Content: "ok"}),
|
||||
asst("(see above)"),
|
||||
},
|
||||
terminal: "(see above)",
|
||||
want: "(see above)", // preamble excluded; falls back to terminal
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := finalOutput(tc.msgs, tc.terminal); got != tc.want {
|
||||
t.Errorf("finalOutput = %q, want %q", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func citeToolbox(t *testing.T) *llm.Toolbox {
|
||||
t.Helper()
|
||||
return llm.NewToolbox("sources", llm.Tool{
|
||||
Name: "cite",
|
||||
Description: "Record a citation.",
|
||||
Parameters: json.RawMessage(`{"type":"object","properties":{}}`),
|
||||
Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
|
||||
return map[string]bool{"ok": true}, nil
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// TestRun_RecoversFrontLoadedAnswer reproduces the glm-5.2 shape end-to-end: a
|
||||
// turn carrying the full answer text AND a tool call, then a degenerate
|
||||
// terminal turn. The recovered answer must be delivered with no extra model
|
||||
// call (zero-cost recovery from the transcript).
|
||||
func TestRun_RecoversFrontLoadedAnswer(t *testing.T) {
|
||||
longAnswer := strings.TrimSpace(strings.Repeat("Free group calls are capped at sixty minutes. ", 6))
|
||||
fp := fake.New("fp")
|
||||
fp.Enqueue("test-model",
|
||||
fake.ReplyWith(llm.Response{
|
||||
Parts: []llm.Part{llm.Text(longAnswer)},
|
||||
ToolCalls: []llm.ToolCall{{ID: "c1", Name: "cite", Arguments: json.RawMessage(`{}`)}},
|
||||
FinishReason: llm.FinishToolCalls,
|
||||
Usage: llm.Usage{InputTokens: 10, OutputTokens: 5},
|
||||
}),
|
||||
fake.Reply("(Already answered above.)"),
|
||||
)
|
||||
|
||||
a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
|
||||
res, err := a.Run(context.Background(), "is there a meet time limit?")
|
||||
if err != nil {
|
||||
t.Fatalf("Run: %v", err)
|
||||
}
|
||||
if res.Output != longAnswer {
|
||||
t.Errorf("Output = %q, want recovered front-loaded answer", res.Output)
|
||||
}
|
||||
if n := len(fp.Calls()); n != 2 {
|
||||
t.Errorf("model calls = %d, want 2 (no extra nudge turn)", n)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRun_HealthyTerminalUnchanged guards against regressing the normal case:
|
||||
// a deferred answer in the terminal turn is delivered verbatim.
|
||||
func TestRun_HealthyTerminalUnchanged(t *testing.T) {
|
||||
fp := fake.New("fp")
|
||||
fp.Enqueue("test-model",
|
||||
toolCallReply("c1", "cite", `{}`),
|
||||
fake.Reply("The limit is 60 minutes for free group calls."),
|
||||
)
|
||||
a := New(newModel(t, fp), "sys", WithToolbox(citeToolbox(t)))
|
||||
res, err := a.Run(context.Background(), "q?")
|
||||
if err != nil {
|
||||
t.Fatalf("Run: %v", err)
|
||||
}
|
||||
if res.Output != "The limit is 60 minutes for free group calls." {
|
||||
t.Errorf("Output = %q, want terminal answer unchanged", res.Output)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user