Files
gadfly/cmd/gadfly/wrapup_test.go
T
Steve Dudenhoeffer c0d0152a34 Gadfly: agentic adversarial PR reviewer (initial extraction)
Standalone, Docker-packaged extraction of the agentic PR reviewer that runs in
Gitea Actions: reads the checked-out repo with read-only tools (read_file/grep/
find_files/get_diff), verifies findings before reporting, two-pass review +
adversarial recheck, posts one labeled comment per model. Advisory only.

- cmd/gadfly: reviewer binary (majordomo + Ollama Cloud), zero deps beyond stdlib + majordomo
- entrypoint.sh: container brains — trigger gating, PR clone, model loop (logic out of YAML)
- Dockerfile: multi-stage; build-time module token never reaches the final image
- .gitea/workflows/build-image.yml: tag v* → build & push image
- examples/: ~15-line consumer stub
- system prompt genericized + hardened to re-derive constants/formulas (semantic bugs)

Vibe-coded with Claude Code; see README disclosure. Advisory, never blocks merge.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-25 18:42:20 -04:00

144 lines
4.5 KiB
Go

package main
import (
"context"
"encoding/json"
"fmt"
"strings"
"testing"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
)
// spinToolCall builds a scripted model reply that requests the get_diff tool
// instead of giving a final answer, so each use burns one agent step. The n
// value is embedded in the call arguments so that consecutive replies are not
// byte-identical; per the original author's note, get_diff tolerates the extra
// argument and the variation keeps the agent's same-call loop guard from
// tripping, the way a real reviewer issuing distinct tool calls would.
func spinToolCall(n int) fake.Step {
	args := json.RawMessage(fmt.Sprintf(`{"_n":%d}`, n))
	call := llm.ToolCall{
		ID:        "call",
		Name:      "get_diff",
		Arguments: args,
	}
	resp := llm.Response{
		ToolCalls:    []llm.ToolCall{call},
		FinishReason: llm.FinishToolCalls,
		Usage:        llm.Usage{InputTokens: 1, OutputTokens: 1},
	}
	return fake.ReplyWith(resp)
}
// lastUserText reports the text of the last message in req, i.e. the message a
// fresh Generate call is responding to. It looks only at position, not at the
// message's role, and yields "" when the request carries no messages.
func lastUserText(req llm.Request) string {
	if count := len(req.Messages); count > 0 {
		return req.Messages[count-1].Text()
	}
	return ""
}
// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
// it is nudged to wrap up should still finish inside its budget — the steer
// message arrives a few steps before the cap and the model writes its answer.
//
// The fake model spins on get_diff until the latest message contains the
// wrap-up nudge text, then replies with the final verdict. The test asserts
// that runAgent returns that verdict without error and that the nudge was
// actually observed by the model.
func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
	t.Setenv("GADFLY_WRAPUP_RESERVE", "4")

	final := "VERDICT: No material issues found."
	nudgeSeen := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		// Answer only once the steer message is the newest entry in the transcript.
		if strings.Contains(lastUserText(req), "almost out of your investigation budget") {
			nudgeSeen = true
			return fake.Reply(final)
		}
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}

	// Fail fast on a broken fixture so the cause is reported here rather than
	// surfacing later as a confusing runAgent failure.
	fs, err := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
	if err != nil {
		t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
	}
	if out != final {
		t.Errorf("expected final review %q, got %q", final, out)
	}
	if !nudgeSeen {
		t.Error("the wrap-up nudge was never delivered to the model")
	}
}
// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
// spins on tools until the cap should NOT hard-fail — the tool-free finalization
// pass forces a final answer out of the transcript.
//
// The fake model replies with text only when the request's ToolChoice is
// "none"; every other request gets another get_diff tool call. The test asserts
// that runAgent succeeds, that the tool-free request was made, and that its
// reply is what runAgent returns.
func TestRunAgent_FinalizationFallback(t *testing.T) {
	t.Setenv("GADFLY_WRAPUP_RESERVE", "2")

	final := "VERDICT: Minor issues\n- something"
	forcedCalled := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		// Only the tool-free finalization pass forbids tools — reply there.
		if req.ToolChoice == "none" {
			forcedCalled = true
			return fake.Reply(final)
		}
		// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}

	// Fail fast on a broken fixture so the cause is reported here rather than
	// surfacing later as a confusing runAgent failure.
	fs, err := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
	if err != nil {
		t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
	}
	if !forcedCalled {
		t.Error("finalization fallback was never invoked")
	}
	if out != final {
		t.Errorf("expected forced final answer %q, got %q", final, out)
	}
}
// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
// yields nothing, runAgent surfaces an error rather than a phantom success.
//
// The fake model answers the ToolChoice "none" request with whitespace only and
// answers every other request with another get_diff tool call, so no usable
// text is ever produced.
func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		if req.ToolChoice == "none" {
			return fake.Reply("   ") // finalization produces only whitespace
		}
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}

	// This test expects runAgent to return an error, so a broken fixture must
	// be reported here; otherwise it could satisfy the assertion below for the
	// wrong reason.
	fs, err := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
		t.Error("runAgent should error when the finalization fallback also yields no output")
	}
}
// TestWrapUpReserve checks how wrapUpReserve reads GADFLY_WRAPUP_RESERVE: an
// empty value must yield defaultWrapUpReserve, and the value "7" must yield 7.
func TestWrapUpReserve(t *testing.T) {
	const envKey = "GADFLY_WRAPUP_RESERVE"

	// Empty variable: expect the compiled-in default.
	t.Setenv(envKey, "")
	fallback := wrapUpReserve()
	if fallback != defaultWrapUpReserve {
		t.Errorf("default wrap-up reserve = %d, want %d", fallback, defaultWrapUpReserve)
	}

	// Numeric variable: expect the override to be used.
	t.Setenv(envKey, "7")
	overridden := wrapUpReserve()
	if overridden != 7 {
		t.Errorf("wrap-up reserve override = %d, want 7", overridden)
	}
}