gadfly/cmd/gadfly/wrapup_test.go

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"testing"

	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
)

// spinToolCall builds a scripted provider step whose response contains no
// final answer, only a single request for the get_diff tool (which succeeds
// and ignores extra args). Tests use it to burn agent steps.
//
// n is embedded in the call arguments as {"_n":<n>} so that consecutive calls
// are not byte-identical; this dodges the agent's same-call loop guard, just
// as a real reviewer issuing distinct tool calls would.
func spinToolCall(n int) fake.Step {
	args := fmt.Sprintf(`{"_n":%d}`, n)

	call := llm.ToolCall{
		ID:        "call",
		Name:      "get_diff",
		Arguments: json.RawMessage(args),
	}

	resp := llm.Response{
		ToolCalls:    []llm.ToolCall{call},
		FinishReason: llm.FinishToolCalls,
		Usage:        llm.Usage{InputTokens: 1, OutputTokens: 1},
	}
	return fake.ReplyWith(resp)
}

// lastUserText reports the text of the last message carried by req, i.e. the
// message a fresh Generate call is reacting to. A request with no messages
// yields the empty string.
func lastUserText(req llm.Request) string {
	if count := len(req.Messages); count > 0 {
		return req.Messages[count-1].Text()
	}
	return ""
}

// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
// it is nudged to wrap up should still finish inside its budget — the steer
// message arrives a few steps before the cap and the model writes its answer.
func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
	t.Setenv("GADFLY_WRAPUP_RESERVE", "4")

	final := "VERDICT: No material issues found."
	nudgeSeen := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		if strings.Contains(lastUserText(req), "almost out of your investigation budget") {
			nudgeSeen = true
			return fake.Reply(final)
		}
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")

	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
	if err != nil {
		t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
	}
	if out != final {
		t.Errorf("expected final review %q, got %q", final, out)
	}
	if !nudgeSeen {
		t.Error("the wrap-up nudge was never delivered to the model")
	}
}

// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
// spins on tools until the cap should NOT hard-fail — the tool-free finalization
// pass forces a final answer out of the transcript.
func TestRunAgent_FinalizationFallback(t *testing.T) {
	t.Setenv("GADFLY_WRAPUP_RESERVE", "2")

	final := "VERDICT: Minor issues\n- something"
	forcedCalled := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		// Only the tool-free finalization pass forbids tools — reply there.
		if req.ToolChoice == "none" {
			forcedCalled = true
			return fake.Reply(final)
		}
		// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")

	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
	if err != nil {
		t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
	}
	if !forcedCalled {
		t.Error("finalization fallback was never invoked")
	}
	if out != final {
		t.Errorf("expected forced final answer %q, got %q", final, out)
	}
}

// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
// yields nothing, runAgent surfaces an error rather than a phantom success.
//
// The scripted model requests tools on every turn and answers the finalization
// pass (recognised by ToolChoice == "none") with whitespace only. Besides
// requiring an error, the test records whether the finalization pass ran at
// all, so it cannot pass vacuously when runAgent fails for some earlier,
// unrelated reason.
func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
	// Pin the reserve so an ambient GADFLY_WRAPUP_RESERVE cannot influence the
	// run; TestWrapUpReserve establishes that an empty value selects the default.
	t.Setenv("GADFLY_WRAPUP_RESERVE", "")

	finalizeCalled := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		if req.ToolChoice == "none" {
			finalizeCalled = true
			return fake.Reply("   ") // finalization produces only whitespace
		}
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	// NOTE(review): the second return value of newRepoFS is discarded, as in
	// the original; confirm against its signature whether it should be checked.
	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")

	// The trailing 4 is presumably the agent's step cap — TODO confirm.
	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
		t.Error("runAgent should error when the finalization fallback also yields no output")
	}
	// Without this check any failure inside runAgent would satisfy the test,
	// including one that never reached the finalization pass.
	if !finalizeCalled {
		t.Error("finalization fallback was never invoked, so the empty-output path was not exercised")
	}
}

// TestWrapUpReserve checks how wrapUpReserve reads GADFLY_WRAPUP_RESERVE: an
// empty value must yield defaultWrapUpReserve, and the value "7" must yield 7.
func TestWrapUpReserve(t *testing.T) {
	const envKey = "GADFLY_WRAPUP_RESERVE"

	// Empty value: fall back to the built-in default.
	t.Setenv(envKey, "")
	got := wrapUpReserve()
	if got != defaultWrapUpReserve {
		t.Errorf("default wrap-up reserve = %d, want %d", got, defaultWrapUpReserve)
	}

	// Explicit numeric value: taken as the reserve.
	t.Setenv(envKey, "7")
	got = wrapUpReserve()
	if got != 7 {
		t.Errorf("wrap-up reserve override = %d, want 7", got)
	}
}