c0d0152a34
Standalone, Docker-packaged extraction of the agentic PR reviewer that runs in Gitea Actions: reads the checked-out repo with read-only tools (read_file/grep/find_files/get_diff), verifies findings before reporting, two-pass review + adversarial recheck, posts one labeled comment per model. Advisory only. - cmd/gadfly: reviewer binary (majordomo + Ollama Cloud), zero deps beyond stdlib + majordomo - entrypoint.sh: container brains — trigger gating, PR clone, model loop (logic out of YAML) - Dockerfile: multi-stage; build-time module token never reaches the final image - .gitea/workflows/build-image.yml: tag v* → build & push image - examples/: ~15-line consumer stub - system prompt genericized + hardened to re-derive constants/formulas (semantic bugs) Vibe-coded with Claude Code; see README disclosure. Advisory, never blocks merge. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
144 lines
4.5 KiB
Go
144 lines
4.5 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"strings"
|
|
"testing"
|
|
|
|
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
|
)
|
|
|
|
// spinToolCall is a response that asks for the get_diff tool (which succeeds and
|
|
// ignores extra args), used to burn agent steps without producing a final
|
|
// answer. The args vary by n so successive calls are not byte-identical — that
|
|
// dodges the agent's same-call loop guard, exactly as a real reviewer making
|
|
// distinct tool calls would.
|
|
func spinToolCall(n int) fake.Step {
|
|
return fake.ReplyWith(llm.Response{
|
|
ToolCalls: []llm.ToolCall{{
|
|
ID: "call",
|
|
Name: "get_diff",
|
|
Arguments: json.RawMessage(fmt.Sprintf(`{"_n":%d}`, n)),
|
|
}},
|
|
FinishReason: llm.FinishToolCalls,
|
|
Usage: llm.Usage{InputTokens: 1, OutputTokens: 1},
|
|
})
|
|
}
|
|
|
|
// lastUserText returns the text of the final message in the request, which is
|
|
// what a fresh Generate call is reacting to.
|
|
func lastUserText(req llm.Request) string {
|
|
if len(req.Messages) == 0 {
|
|
return ""
|
|
}
|
|
return req.Messages[len(req.Messages)-1].Text()
|
|
}
|
|
|
|
// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
|
|
// it is nudged to wrap up should still finish inside its budget — the steer
|
|
// message arrives a few steps before the cap and the model writes its answer.
|
|
func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
|
|
t.Setenv("GADFLY_WRAPUP_RESERVE", "4")
|
|
|
|
final := "VERDICT: No material issues found."
|
|
nudgeSeen := false
|
|
n := 0
|
|
p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
|
|
if strings.Contains(lastUserText(req), "almost out of your investigation budget") {
|
|
nudgeSeen = true
|
|
return fake.Reply(final)
|
|
}
|
|
n++
|
|
return spinToolCall(n)
|
|
}))
|
|
mdl, err := p.Model("mock")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
|
|
|
|
out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
|
|
if err != nil {
|
|
t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
|
|
}
|
|
if out != final {
|
|
t.Errorf("expected final review %q, got %q", final, out)
|
|
}
|
|
if !nudgeSeen {
|
|
t.Error("the wrap-up nudge was never delivered to the model")
|
|
}
|
|
}
|
|
|
|
// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
|
|
// spins on tools until the cap should NOT hard-fail — the tool-free finalization
|
|
// pass forces a final answer out of the transcript.
|
|
func TestRunAgent_FinalizationFallback(t *testing.T) {
|
|
t.Setenv("GADFLY_WRAPUP_RESERVE", "2")
|
|
|
|
final := "VERDICT: Minor issues\n- something"
|
|
forcedCalled := false
|
|
n := 0
|
|
p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
|
|
// Only the tool-free finalization pass forbids tools — reply there.
|
|
if req.ToolChoice == "none" {
|
|
forcedCalled = true
|
|
return fake.Reply(final)
|
|
}
|
|
// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
|
|
n++
|
|
return spinToolCall(n)
|
|
}))
|
|
mdl, err := p.Model("mock")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
|
|
|
|
out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
|
|
if err != nil {
|
|
t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
|
|
}
|
|
if !forcedCalled {
|
|
t.Error("finalization fallback was never invoked")
|
|
}
|
|
if out != final {
|
|
t.Errorf("expected forced final answer %q, got %q", final, out)
|
|
}
|
|
}
|
|
|
|
// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
|
|
// yields nothing, runAgent surfaces an error rather than a phantom success.
|
|
func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
|
|
n := 0
|
|
p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
|
|
if req.ToolChoice == "none" {
|
|
return fake.Reply(" ") // finalization produces only whitespace
|
|
}
|
|
n++
|
|
return spinToolCall(n)
|
|
}))
|
|
mdl, err := p.Model("mock")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
|
|
|
|
if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
|
|
t.Error("runAgent should error when the finalization fallback also yields no output")
|
|
}
|
|
}
|
|
|
|
func TestWrapUpReserve(t *testing.T) {
|
|
t.Setenv("GADFLY_WRAPUP_RESERVE", "")
|
|
if got := wrapUpReserve(); got != defaultWrapUpReserve {
|
|
t.Errorf("default wrap-up reserve = %d, want %d", got, defaultWrapUpReserve)
|
|
}
|
|
t.Setenv("GADFLY_WRAPUP_RESERVE", "7")
|
|
if got := wrapUpReserve(); got != 7 {
|
|
t.Errorf("wrap-up reserve override = %d, want 7", got)
|
|
}
|
|
}
|