Gadfly: agentic adversarial PR reviewer (initial extraction)
Standalone, Docker-packaged extraction of the agentic PR reviewer that runs in Gitea Actions: reads the checked-out repo with read-only tools (read_file/grep/ find_files/get_diff), verifies findings before reporting, two-pass review + adversarial recheck, posts one labeled comment per model. Advisory only. - cmd/gadfly: reviewer binary (majordomo + Ollama Cloud), zero deps beyond stdlib + majordomo - entrypoint.sh: container brains — trigger gating, PR clone, model loop (logic out of YAML) - Dockerfile: multi-stage; build-time module token never reaches the final image - .gitea/workflows/build-image.yml: tag v* → build & push image - examples/: ~15-line consumer stub - system prompt genericized + hardened to re-derive constants/formulas (semantic bugs) Vibe-coded with Claude Code; see README disclosure. Advisory, never blocks merge. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,287 @@
|
||||
// Command gadfly is the agentic backend for the PR adversarial-review
|
||||
// workflow (.gitea/workflows/pr-adversarial-review.yml). Unlike the old
|
||||
// one-shot chat call, it runs a tool-using agent (majordomo + Ollama Cloud)
|
||||
// over the PR's CHECKED-OUT repository: the model can read_file / list_dir /
|
||||
// grep / find_files / get_diff to VERIFY a finding before reporting it, which
|
||||
// kills the "diff-only" false positives (claiming a missing import or a
|
||||
// non-existent method it simply couldn't see).
|
||||
//
|
||||
// It is a pure producer of review text: it reads the diff + the repo and
|
||||
// prints the review markdown to stdout. All Gitea I/O (fetching the diff,
|
||||
// upserting the comment) stays in run.sh, so this binary needs no repo write
|
||||
// access and is straightforward to unit-test.
|
||||
//
|
||||
// Two passes (unless the draft is a clean "no material issues" pass): a
|
||||
// REVIEW pass produces a draft, then an adversarial RECHECK pass independently
|
||||
// re-verifies every finding against the actual files with the same tools and
|
||||
// drops the ones it cannot confirm, recomputing the verdict. This catches the
|
||||
// "confident but wrong" findings that survive a single pass — e.g. claiming an
|
||||
// env var is unset when a wrapper script sets it (see recheck.go).
|
||||
//
|
||||
// Inputs (env):
|
||||
//
|
||||
// OLLAMA_API_KEY Ollama Cloud bearer key (required).
|
||||
// GADFLY_MODEL model id, e.g. "qwen3-coder:480b-cloud" (required).
|
||||
// GADFLY_REPO_DIR path to the checked-out repo (required; the FS sandbox root).
|
||||
// GADFLY_DIFF_FILE path to a file holding the full unified diff (required).
|
||||
// GADFLY_SYSTEM_FILE path to the reviewer system prompt (required).
|
||||
// GADFLY_TITLE PR title (optional).
|
||||
// GADFLY_BODY PR description (optional).
|
||||
// GADFLY_MAX_STEPS review-pass step cap (optional, default 24).
|
||||
// GADFLY_WRAPUP_RESERVE steps before the cap at which the agent is told to
|
||||
// stop investigating and write its answer (optional,
|
||||
// default 4). Plus a tool-free finalization fallback
|
||||
// guarantees a step-exhausted pass still emits output.
|
||||
// GADFLY_RECHECK set to 0/false to skip the recheck pass (optional, default on).
|
||||
// GADFLY_RECHECK_MAX_STEPS recheck-pass step cap (optional, default 16).
|
||||
// GADFLY_TIMEOUT_SECS overall deadline in seconds, shared by both passes (optional, default 300).
|
||||
// GADFLY_MAX_DIFF_CHARS diff chars embedded in the prompt (optional, default 60000;
|
||||
// the full diff is always available via the get_diff tool).
|
||||
//
|
||||
// On success it prints the review to stdout and exits 0. On a usage/config or
|
||||
// model error it prints a diagnostic to stderr and exits non-zero; run.sh then
|
||||
// posts a "reviewer failed" notice (advisory — never fails the CI job).
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
|
||||
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultMaxSteps = 24
|
||||
defaultTimeoutSecs = 300
|
||||
defaultMaxDiffChars = 60000
|
||||
// defaultWrapUpReserve is how many steps before the cap the agent is told
|
||||
// to stop investigating and write its final answer. Reserving a margin is
|
||||
// what keeps a thorough reviewer from spending its whole budget on tool
|
||||
// calls and then hard-failing with "max steps reached without a final
|
||||
// answer" — it always has a few steps left to wrap up.
|
||||
defaultWrapUpReserve = 4
|
||||
)
|
||||
|
||||
// wrapUpInstruction is steered into a running agent once it comes within the
|
||||
// wrap-up reserve of its step cap: a forceful nudge to stop calling tools and
|
||||
// emit the final answer using only what it has already gathered.
|
||||
const wrapUpInstruction = "⚠️ You are almost out of your investigation budget — only a few tool steps remain. " +
|
||||
"STOP calling tools now and write your FINAL answer immediately, using only what you have already verified. " +
|
||||
"Do not begin any new investigation. If a finding could not be confirmed, drop it or mark it explicitly as unverified. " +
|
||||
"Output the review in the required format right now."
|
||||
|
||||
// finalizeInstruction is the user message sent on the tool-free fallback pass
|
||||
// when the agent exhausted its budget (or tripped a loop guard) without ever
|
||||
// producing a final answer. It forces the model to synthesize whatever it has.
|
||||
const finalizeInstruction = "You have run out of investigation steps. Do NOT call any tools. " +
|
||||
"Based solely on what you have already gathered above, write your final answer now in the required format. " +
|
||||
"If you could not confirm some findings, omit them or mark them as unverified, but produce the answer."
|
||||
|
||||
func main() {
|
||||
if err := run(); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "gadfly:", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func run() error {
|
||||
apiKey := os.Getenv("OLLAMA_API_KEY")
|
||||
if apiKey == "" {
|
||||
return errors.New("OLLAMA_API_KEY is required")
|
||||
}
|
||||
model := os.Getenv("GADFLY_MODEL")
|
||||
repoDir := os.Getenv("GADFLY_REPO_DIR")
|
||||
diffFile := os.Getenv("GADFLY_DIFF_FILE")
|
||||
systemFile := os.Getenv("GADFLY_SYSTEM_FILE")
|
||||
if model == "" || repoDir == "" || diffFile == "" || systemFile == "" {
|
||||
return errors.New("GADFLY_MODEL, GADFLY_REPO_DIR, GADFLY_DIFF_FILE and GADFLY_SYSTEM_FILE are all required")
|
||||
}
|
||||
|
||||
diffBytes, err := os.ReadFile(diffFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read diff file: %w", err)
|
||||
}
|
||||
diff := string(diffBytes)
|
||||
if strings.TrimSpace(diff) == "" {
|
||||
return errors.New("empty diff; nothing to review")
|
||||
}
|
||||
|
||||
systemBytes, err := os.ReadFile(systemFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read system prompt: %w", err)
|
||||
}
|
||||
|
||||
fsTools, err := newRepoFS(repoDir, diff)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
mdl, err := ollama.Cloud(ollama.WithToken(apiKey)).Model(model)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build model %q: %w", model, err)
|
||||
}
|
||||
|
||||
timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
// Pass 1 — review: produce the draft.
|
||||
draft, err := runAgent(ctx, mdl, fsTools, string(systemBytes), buildTask(diff),
|
||||
envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
|
||||
if err != nil {
|
||||
return fmt.Errorf("review pass: %w", err)
|
||||
}
|
||||
|
||||
// Pass 2 — recheck: adversarially re-verify the draft's findings and drop
|
||||
// the unconfirmed ones. Skipped for a clean draft (nothing to verify) or
|
||||
// when disabled. A recheck failure is non-fatal — we emit the unverified
|
||||
// draft rather than losing the review entirely.
|
||||
final := draft
|
||||
if shouldRecheck(draft) {
|
||||
rechecked, rerr := runAgent(ctx, mdl, fsTools, recheckSystemPrompt, buildRecheckTask(draft, diff),
|
||||
envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps))
|
||||
if rerr != nil {
|
||||
fmt.Fprintln(os.Stderr, "gadfly: recheck pass failed; emitting unverified draft:", rerr)
|
||||
} else {
|
||||
final = rechecked
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println(final)
|
||||
return nil
|
||||
}
|
||||
|
||||
// runAgent runs one agent pass (its own fresh toolbox over the sandbox) and
|
||||
// returns the final answer. An empty answer is an error — the caller decides
|
||||
// whether that is fatal (review pass) or recoverable (recheck pass). A
|
||||
// non-empty answer that ended on a budget/guard error is still returned: the
|
||||
// model wrote its output, then ran out of steps.
|
||||
//
|
||||
// Two mechanisms keep a step-hungry model from hard-failing with no output:
|
||||
// 1. A wrap-up steer: once the run comes within wrapUpReserve steps of the
|
||||
// cap, a forceful "stop calling tools, write your final answer" message is
|
||||
// injected so the model spends its remaining steps finalizing.
|
||||
// 2. A finalization fallback: if the loop still ends empty (the model ignored
|
||||
// the nudge, or a loop guard tripped), one tool-free model call forces a
|
||||
// final answer out of the transcript already gathered.
|
||||
func runAgent(ctx context.Context, mdl llm.Model, fsTools *repoFS, system, task string, maxSteps int) (string, error) {
|
||||
box, err := fsTools.toolbox()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
loop := agent.New(mdl, system,
|
||||
agent.WithToolbox(box),
|
||||
agent.WithMaxSteps(maxSteps),
|
||||
// Guard rails: stop the model from spinning on failing or identical
|
||||
// tool calls instead of writing its answer.
|
||||
agent.WithToolErrorLimits(4, 4),
|
||||
)
|
||||
|
||||
wrapUpAt := maxSteps - wrapUpReserve()
|
||||
if wrapUpAt < 1 {
|
||||
wrapUpAt = 1
|
||||
}
|
||||
var completed int // steps finished so far (updated after each step)
|
||||
nudged := false
|
||||
|
||||
res, runErr := loop.Run(ctx, task,
|
||||
agent.OnStep(func(s agent.Step) { completed = s.Index + 1 }),
|
||||
agent.WithSteer(func() []llm.Message {
|
||||
if !nudged && completed >= wrapUpAt {
|
||||
nudged = true
|
||||
return []llm.Message{llm.UserText(wrapUpInstruction)}
|
||||
}
|
||||
return nil
|
||||
}),
|
||||
)
|
||||
|
||||
out := ""
|
||||
if res != nil {
|
||||
out = strings.TrimSpace(res.Output)
|
||||
}
|
||||
if out != "" {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// No final answer. If we still have budget on the clock and a transcript to
|
||||
// work from, force a tool-free finalization rather than losing the pass.
|
||||
if res != nil && len(res.Messages) > 0 && ctx.Err() == nil {
|
||||
if forced := forceFinalAnswer(ctx, mdl, system, res.Messages); forced != "" {
|
||||
return forced, nil
|
||||
}
|
||||
}
|
||||
|
||||
if runErr != nil {
|
||||
return "", runErr
|
||||
}
|
||||
return "", errors.New("agent produced no output")
|
||||
}
|
||||
|
||||
// forceFinalAnswer makes one tool-free model call to squeeze a final answer out
|
||||
// of an agent that exhausted its step budget without producing one. Tools are
|
||||
// forbidden (ToolChoice "none") so the model must synthesize from the transcript
|
||||
// instead of investigating further. Best-effort: any error or empty reply
|
||||
// returns "" and the caller falls back to its normal empty-output handling.
|
||||
func forceFinalAnswer(ctx context.Context, mdl llm.Model, system string, transcript []llm.Message) string {
|
||||
msgs := append(append([]llm.Message(nil), transcript...), llm.UserText(finalizeInstruction))
|
||||
resp, err := mdl.Generate(ctx, llm.Request{
|
||||
System: system,
|
||||
Messages: msgs,
|
||||
ToolChoice: "none",
|
||||
})
|
||||
if err != nil || resp == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(resp.Text())
|
||||
}
|
||||
|
||||
// wrapUpReserve is how many steps before the cap the wrap-up nudge fires,
|
||||
// overridable via GADFLY_WRAPUP_RESERVE.
|
||||
func wrapUpReserve() int {
|
||||
return envInt("GADFLY_WRAPUP_RESERVE", defaultWrapUpReserve)
|
||||
}
|
||||
|
||||
// buildTask assembles the user message: PR metadata plus the unified diff,
|
||||
// truncated for the prompt (the full diff stays available via get_diff).
|
||||
func buildTask(diff string) string {
|
||||
title := os.Getenv("GADFLY_TITLE")
|
||||
body := os.Getenv("GADFLY_BODY")
|
||||
|
||||
maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)
|
||||
truncNote := ""
|
||||
if maxDiff > 0 && len(diff) > maxDiff {
|
||||
diff = diff[:maxDiff]
|
||||
truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; call get_diff for the full text.]", maxDiff)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
if title != "" {
|
||||
fmt.Fprintf(&b, "PR title: %s\n\n", title)
|
||||
}
|
||||
if strings.TrimSpace(body) != "" {
|
||||
fmt.Fprintf(&b, "PR description:\n%s\n\n", body)
|
||||
}
|
||||
b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your tools (read_file, grep, find_files) to verify it against the actual checked-out code — do not rely on the diff alone.\n\n")
|
||||
fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// envInt reads an integer env var, falling back to def when unset or unparseable.
|
||||
func envInt(name string, def int) int {
|
||||
v := strings.TrimSpace(os.Getenv(name))
|
||||
if v == "" {
|
||||
return def
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil || n <= 0 {
|
||||
return def
|
||||
}
|
||||
return n
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// defaultRecheckMaxSteps bounds the verification pass. It is smaller than the
|
||||
// review pass: re-checking a handful of existing findings needs fewer steps
|
||||
// than discovering them.
|
||||
const defaultRecheckMaxSteps = 16
|
||||
|
||||
// recheckSystemPrompt drives the second, adversarial verification pass. The
|
||||
// model is given a DRAFT review and must independently confirm each finding
|
||||
// against the real code before letting it survive — the antidote to a
|
||||
// single-pass reviewer that reads a couple of files, mis-connects them, and
|
||||
// posts a confident but wrong "blocking" verdict.
|
||||
const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review of the
|
||||
"mort" project (a large Go Discord bot). You are given a DRAFT review produced
|
||||
by another model. Your job is NOT to write a new review — it is to confirm or
|
||||
reject each finding in the draft against the ACTUAL code, then output the
|
||||
corrected review.
|
||||
|
||||
You have the same read-only repository tools as the original reviewer:
|
||||
- read_file(path[, start_line, limit]), list_dir([path]), grep(pattern[, path,
|
||||
max_results]), find_files(name[, max_results]), get_diff().
|
||||
|
||||
For EVERY finding in the draft:
|
||||
1. Independently reproduce the reasoning by reading the actual files with your
|
||||
tools — do not trust the draft's claim, and do not trust the diff hunk alone.
|
||||
2. KEEP the finding only if you can positively confirm it against the code.
|
||||
3. DROP the finding if you cannot confirm it, or if the code contradicts it.
|
||||
|
||||
Watch especially for findings that ignore the "glue" around a change — the most
|
||||
common false positive. Before keeping a claim that something is "missing",
|
||||
"undefined", "never set", "not exported", or "won't compile", GREP THE WHOLE
|
||||
REPO for it: the thing is very often satisfied in a place the original reviewer
|
||||
didn't look — a shell script or Makefile that sets an env var, a CI YAML, an
|
||||
adjacent file, generated code, or a wrapper that maps one name to another. A
|
||||
finding that an env var X is unset is wrong if any script invokes the program
|
||||
with "X=... prog". Check before you keep.
|
||||
|
||||
Output rules:
|
||||
- Output the corrected review in the SAME format as the draft: a one-line
|
||||
VERDICT ("No material issues found", "Minor issues", or "Blocking issues
|
||||
found"), then the surviving findings as bullets with path:line and impact.
|
||||
- Recompute the VERDICT from what SURVIVES. If every finding was dropped, the
|
||||
verdict is "No material issues found".
|
||||
- Do NOT invent new findings; this is a verification gate, not a fresh review.
|
||||
- Do NOT include meta-commentary about the verification process or which
|
||||
findings you dropped — output only the final, corrected review markdown.
|
||||
- When done investigating, STOP calling tools and reply with the review.`
|
||||
|
||||
// recheckEnabled reports whether the verification pass should run. On unless
|
||||
// GADFLY_RECHECK is explicitly a falsey value.
|
||||
func recheckEnabled() bool {
|
||||
switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_RECHECK"))) {
|
||||
case "0", "false", "no", "off":
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// shouldRecheck decides whether to run the verification pass for a given draft.
|
||||
// A clean "no material issues" draft has nothing to verify, so it is skipped
|
||||
// even when rechecking is enabled — saving a whole model pass on clean PRs.
|
||||
func shouldRecheck(draft string) bool {
|
||||
if !recheckEnabled() {
|
||||
return false
|
||||
}
|
||||
if strings.Contains(strings.ToLower(draft), "no material issues") {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// buildRecheckTask is the verification pass's user message: the draft review to
|
||||
// scrutinize, with the full diff available via get_diff (and embedded here,
|
||||
// truncated, to save a tool call).
|
||||
func buildRecheckTask(draft, diff string) string {
|
||||
maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)
|
||||
truncNote := ""
|
||||
if maxDiff > 0 && len(diff) > maxDiff {
|
||||
diff = diff[:maxDiff]
|
||||
truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff for the full text.]", maxDiff)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString("Verify the following DRAFT review against the actual code, drop every finding you cannot confirm, and output the corrected review.\n\n")
|
||||
b.WriteString("## Draft review\n\n")
|
||||
b.WriteString(draft)
|
||||
b.WriteString("\n\n## PR diff under review\n\n")
|
||||
fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
|
||||
return b.String()
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
||||
)
|
||||
|
||||
func TestShouldRecheck(t *testing.T) {
|
||||
t.Setenv("GADFLY_RECHECK", "") // default on
|
||||
|
||||
if shouldRecheck("VERDICT: Blocking issues found\n- something is wrong") != true {
|
||||
t.Error("a draft with findings should be rechecked")
|
||||
}
|
||||
if shouldRecheck("No material issues found.") != false {
|
||||
t.Error("a clean draft should skip recheck")
|
||||
}
|
||||
if shouldRecheck("### review\n\nNo material issues found.\n") != false {
|
||||
t.Error("clean draft detection should be case/whitespace tolerant")
|
||||
}
|
||||
|
||||
// Explicit disable wins even when there are findings.
|
||||
t.Setenv("GADFLY_RECHECK", "0")
|
||||
if shouldRecheck("Blocking issues found\n- x") != false {
|
||||
t.Error("GADFLY_RECHECK=0 must disable recheck")
|
||||
}
|
||||
t.Setenv("GADFLY_RECHECK", "false")
|
||||
if shouldRecheck("Blocking issues found\n- x") != false {
|
||||
t.Error("GADFLY_RECHECK=false must disable recheck")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecheckEnabled(t *testing.T) {
|
||||
for _, v := range []string{"", "1", "true", "yes", "anything"} {
|
||||
t.Setenv("GADFLY_RECHECK", v)
|
||||
if !recheckEnabled() {
|
||||
t.Errorf("GADFLY_RECHECK=%q should be enabled", v)
|
||||
}
|
||||
}
|
||||
for _, v := range []string{"0", "false", "no", "off", "OFF", " False "} {
|
||||
t.Setenv("GADFLY_RECHECK", v)
|
||||
if recheckEnabled() {
|
||||
t.Errorf("GADFLY_RECHECK=%q should be disabled", v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildRecheckTask(t *testing.T) {
|
||||
t.Setenv("GADFLY_MAX_DIFF_CHARS", "")
|
||||
draft := "VERDICT: Blocking issues found\n- foo.go:1 broken"
|
||||
out := buildRecheckTask(draft, "diff --git a/x b/x\n+y\n")
|
||||
if !strings.Contains(out, draft) {
|
||||
t.Error("recheck task must include the draft review")
|
||||
}
|
||||
if !strings.Contains(out, "Verify") || !strings.Contains(out, "drop every finding you cannot confirm") {
|
||||
t.Errorf("recheck task missing the verify instruction:\n%s", out)
|
||||
}
|
||||
if !strings.Contains(out, "diff --git") {
|
||||
t.Error("recheck task should include the diff")
|
||||
}
|
||||
}
|
||||
|
||||
// fakeModel builds a fake majordomo model that always replies with the given
|
||||
// text (no tool calls), so the agent loop ends on its first step.
|
||||
func fakeModel(t *testing.T, reply string) llm.Model {
|
||||
t.Helper()
|
||||
p := fake.New("fake", fake.WithDefault(func(string, llm.Request) fake.Step {
|
||||
return fake.Reply(reply)
|
||||
}))
|
||||
m, err := p.Model("mock")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func TestRunAgent_ReturnsOutput(t *testing.T) {
|
||||
fs, err := newRepoFS(t.TempDir(), "diff")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
mdl := fakeModel(t, " corrected review: No material issues found. ")
|
||||
out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4)
|
||||
if err != nil {
|
||||
t.Fatalf("runAgent: %v", err)
|
||||
}
|
||||
if out != "corrected review: No material issues found." {
|
||||
t.Errorf("runAgent should return trimmed model output, got %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunAgent_EmptyIsError(t *testing.T) {
|
||||
fs, _ := newRepoFS(t.TempDir(), "diff")
|
||||
mdl := fakeModel(t, " ")
|
||||
if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
|
||||
t.Error("runAgent should error on empty model output")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,388 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
)
|
||||
|
||||
// Tool output bounds. The reviewer is a chat agent with a finite context, so
|
||||
// every tool caps how much it can pull in one call — a runaway read_file or
|
||||
// grep would blow the window and stall the loop.
|
||||
const (
|
||||
maxFileBytes = 64 * 1024 // per read_file call
|
||||
maxReadLines = 800 // per read_file call
|
||||
maxGrepResults = 200 // per grep call
|
||||
maxFindResults = 200 // per find_files call
|
||||
maxLineLen = 400 // truncate any single returned line to this
|
||||
)
|
||||
|
||||
// skipDirs are never descended into by grep / find_files — noise and bulk that
|
||||
// a code reviewer never needs and that would swamp the results.
|
||||
var skipDirs = map[string]bool{
|
||||
".git": true,
|
||||
"node_modules": true,
|
||||
"vendor": true,
|
||||
}
|
||||
|
||||
// repoFS is a read-only, sandboxed view of the checked-out repository. Every
|
||||
// path argument from the model is resolved against root and rejected if it
|
||||
// escapes (symlink or `..` traversal), so a hostile diff can never make the
|
||||
// reviewer read outside the checkout.
|
||||
type repoFS struct {
|
||||
root string // absolute, symlink-resolved repo root
|
||||
diff string // the full PR unified diff (served by get_diff)
|
||||
}
|
||||
|
||||
// newRepoFS resolves root to an absolute, symlink-free path.
|
||||
func newRepoFS(root, diff string) (*repoFS, error) {
|
||||
abs, err := filepath.Abs(root)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("resolve repo dir: %w", err)
|
||||
}
|
||||
// EvalSymlinks so prefix containment checks survive a symlinked root
|
||||
// (e.g. macOS /tmp -> /private/tmp).
|
||||
if resolved, err := filepath.EvalSymlinks(abs); err == nil {
|
||||
abs = resolved
|
||||
}
|
||||
info, err := os.Stat(abs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("repo dir %q: %w", root, err)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return nil, fmt.Errorf("repo dir %q is not a directory", root)
|
||||
}
|
||||
return &repoFS{root: abs, diff: diff}, nil
|
||||
}
|
||||
|
||||
// resolve maps a model-supplied relative path to an absolute path inside the
|
||||
// sandbox, rejecting anything that escapes root. An empty path means root.
|
||||
func (r *repoFS) resolve(rel string) (string, error) {
|
||||
rel = strings.TrimSpace(rel)
|
||||
rel = strings.TrimPrefix(rel, "./")
|
||||
if rel == "" || rel == "." {
|
||||
return r.root, nil
|
||||
}
|
||||
if filepath.IsAbs(rel) {
|
||||
// Allow an absolute path only if it already points inside the sandbox.
|
||||
clean := filepath.Clean(rel)
|
||||
if err := r.contains(clean); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return clean, nil
|
||||
}
|
||||
joined := filepath.Clean(filepath.Join(r.root, rel))
|
||||
if err := r.contains(joined); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return joined, nil
|
||||
}
|
||||
|
||||
// contains verifies abs is root or lives beneath it.
|
||||
func (r *repoFS) contains(abs string) error {
|
||||
if abs == r.root {
|
||||
return nil
|
||||
}
|
||||
if !strings.HasPrefix(abs, r.root+string(os.PathSeparator)) {
|
||||
return fmt.Errorf("path escapes the repository sandbox")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// toolbox builds the read-only review toolbox over this sandbox.
|
||||
func (r *repoFS) toolbox() (*llm.Toolbox, error) {
|
||||
box := llm.NewToolbox("gadfly")
|
||||
tools := []llm.Tool{
|
||||
r.readFileTool(),
|
||||
r.listDirTool(),
|
||||
r.grepTool(),
|
||||
r.findFilesTool(),
|
||||
r.getDiffTool(),
|
||||
}
|
||||
for _, t := range tools {
|
||||
if err := box.Add(t); err != nil {
|
||||
return nil, fmt.Errorf("add tool %q: %w", t.Name, err)
|
||||
}
|
||||
}
|
||||
return box, nil
|
||||
}
|
||||
|
||||
type readFileArgs struct {
|
||||
Path string `json:"path" description:"Repository-relative path of the file to read, e.g. pkg/logic/agentexec/pipeline.go"`
|
||||
StartLine int `json:"start_line,omitempty" description:"Optional 1-based line to start from (default 1)."`
|
||||
Limit int `json:"limit,omitempty" description:"Optional max number of lines to return (default/maximum 800)."`
|
||||
}
|
||||
|
||||
func (r *repoFS) readFileTool() llm.Tool {
|
||||
return llm.DefineTool[readFileArgs](
|
||||
"read_file",
|
||||
"Read a file from the repository at its current checked-out state, with line numbers. Use this to verify the surrounding code, imports, and symbols a diff hunk touches before reporting an issue.",
|
||||
func(_ context.Context, args readFileArgs) (any, error) {
|
||||
abs, err := r.resolve(args.Path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
info, err := os.Stat(abs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stat %q: %w", args.Path, err)
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil, fmt.Errorf("%q is a directory; use list_dir", args.Path)
|
||||
}
|
||||
f, err := os.Open(abs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open %q: %w", args.Path, err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
start := args.StartLine
|
||||
if start < 1 {
|
||||
start = 1
|
||||
}
|
||||
limit := args.Limit
|
||||
if limit <= 0 || limit > maxReadLines {
|
||||
limit = maxReadLines
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
sc := bufio.NewScanner(f)
|
||||
sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
|
||||
lineNo := 0
|
||||
emitted := 0
|
||||
for sc.Scan() {
|
||||
lineNo++
|
||||
if lineNo < start {
|
||||
continue
|
||||
}
|
||||
if emitted >= limit || b.Len() >= maxFileBytes {
|
||||
fmt.Fprintf(&b, "... (truncated at line %d; call read_file again with start_line=%d for more)\n", lineNo, lineNo)
|
||||
break
|
||||
}
|
||||
line := sc.Text()
|
||||
if len(line) > maxLineLen {
|
||||
line = line[:maxLineLen] + "…"
|
||||
}
|
||||
fmt.Fprintf(&b, "%d\t%s\n", lineNo, line)
|
||||
emitted++
|
||||
}
|
||||
if err := sc.Err(); err != nil {
|
||||
return nil, fmt.Errorf("read %q: %w", args.Path, err)
|
||||
}
|
||||
if emitted == 0 {
|
||||
return fmt.Sprintf("(%s has no lines at/after %d; file has %d lines)", args.Path, start, lineNo), nil
|
||||
}
|
||||
return b.String(), nil
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
type listDirArgs struct {
|
||||
Path string `json:"path,omitempty" description:"Optional repository-relative directory (default: repo root)."`
|
||||
}
|
||||
|
||||
func (r *repoFS) listDirTool() llm.Tool {
|
||||
return llm.DefineTool[listDirArgs](
|
||||
"list_dir",
|
||||
"List the entries of a directory in the repository (directories marked with a trailing /). Use it to discover where code lives before reading.",
|
||||
func(_ context.Context, args listDirArgs) (any, error) {
|
||||
abs, err := r.resolve(args.Path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
entries, err := os.ReadDir(abs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list %q: %w", args.Path, err)
|
||||
}
|
||||
names := make([]string, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if e.IsDir() {
|
||||
name += "/"
|
||||
}
|
||||
names = append(names, name)
|
||||
}
|
||||
sort.Strings(names)
|
||||
if len(names) == 0 {
|
||||
return "(empty directory)", nil
|
||||
}
|
||||
return strings.Join(names, "\n"), nil
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
type grepArgs struct {
|
||||
Pattern string `json:"pattern" description:"A Go (RE2) regular expression to search for."`
|
||||
Path string `json:"path,omitempty" description:"Optional repository-relative file or subdirectory to scope the search (default: whole repo)."`
|
||||
MaxResults int `json:"max_results,omitempty" description:"Optional cap on matching lines returned (default/maximum 200)."`
|
||||
}
|
||||
|
||||
func (r *repoFS) grepTool() llm.Tool {
|
||||
return llm.DefineTool[grepArgs](
|
||||
"grep",
|
||||
"Search the repository's text files for a regular expression and return matching `path:line: text`. Use it to check whether a symbol, import, or call exists elsewhere before claiming a cross-file problem.",
|
||||
func(_ context.Context, args grepArgs) (any, error) {
|
||||
if strings.TrimSpace(args.Pattern) == "" {
|
||||
return nil, fmt.Errorf("pattern is required")
|
||||
}
|
||||
re, err := regexp.Compile(args.Pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid regexp: %w", err)
|
||||
}
|
||||
base, err := r.resolve(args.Path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
limit := args.MaxResults
|
||||
if limit <= 0 || limit > maxGrepResults {
|
||||
limit = maxGrepResults
|
||||
}
|
||||
|
||||
var out []string
|
||||
truncated := false
|
||||
walkErr := filepath.WalkDir(base, func(path string, d os.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return nil // skip unreadable entries
|
||||
}
|
||||
if d.IsDir() {
|
||||
if skipDirs[d.Name()] && path != base {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(out) >= limit {
|
||||
truncated = true
|
||||
return filepath.SkipAll
|
||||
}
|
||||
matchesInFile(path, r.root, re, limit, &out)
|
||||
return nil
|
||||
})
|
||||
if walkErr != nil {
|
||||
return nil, fmt.Errorf("search: %w", walkErr)
|
||||
}
|
||||
if len(out) > limit {
|
||||
out = out[:limit]
|
||||
truncated = true
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return "(no matches)", nil
|
||||
}
|
||||
res := strings.Join(out, "\n")
|
||||
if truncated {
|
||||
res += fmt.Sprintf("\n... (truncated at %d matches; narrow the pattern or path)", limit)
|
||||
}
|
||||
return res, nil
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
// matchesInFile appends "relpath:line: text" for each regexp match in a single
|
||||
// text file, stopping once the global cap is reached. Binary files (NUL in the
|
||||
// first chunk) and oversized files are skipped.
|
||||
func matchesInFile(path, root string, re *regexp.Regexp, limit int, out *[]string) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
rel, relErr := filepath.Rel(root, path)
|
||||
if relErr != nil {
|
||||
rel = path
|
||||
}
|
||||
sc := bufio.NewScanner(f)
|
||||
sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
|
||||
lineNo := 0
|
||||
for sc.Scan() {
|
||||
if len(*out) >= limit {
|
||||
return
|
||||
}
|
||||
lineNo++
|
||||
line := sc.Text()
|
||||
if lineNo == 1 && strings.IndexByte(line, 0) >= 0 {
|
||||
return // looks binary
|
||||
}
|
||||
if re.MatchString(line) {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if len(trimmed) > maxLineLen {
|
||||
trimmed = trimmed[:maxLineLen] + "…"
|
||||
}
|
||||
*out = append(*out, fmt.Sprintf("%s:%d: %s", rel, lineNo, trimmed))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type findFilesArgs struct {
|
||||
Name string `json:"name" description:"Case-insensitive substring of the file path to match, e.g. \"pipeline.go\" or \"agentexec/\"."`
|
||||
MaxResults int `json:"max_results,omitempty" description:"Optional cap on paths returned (default/maximum 200)."`
|
||||
}
|
||||
|
||||
func (r *repoFS) findFilesTool() llm.Tool {
|
||||
return llm.DefineTool[findFilesArgs](
|
||||
"find_files",
|
||||
"Find files whose repository-relative path contains a case-insensitive substring. Use it to locate a file by name when you don't know its directory.",
|
||||
func(_ context.Context, args findFilesArgs) (any, error) {
|
||||
needle := strings.ToLower(strings.TrimSpace(args.Name))
|
||||
if needle == "" {
|
||||
return nil, fmt.Errorf("name is required")
|
||||
}
|
||||
limit := args.MaxResults
|
||||
if limit <= 0 || limit > maxFindResults {
|
||||
limit = maxFindResults
|
||||
}
|
||||
var out []string
|
||||
truncated := false
|
||||
_ = filepath.WalkDir(r.root, func(path string, d os.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
if d.IsDir() {
|
||||
if skipDirs[d.Name()] && path != r.root {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(out) >= limit {
|
||||
truncated = true
|
||||
return filepath.SkipAll
|
||||
}
|
||||
rel, relErr := filepath.Rel(r.root, path)
|
||||
if relErr != nil {
|
||||
return nil
|
||||
}
|
||||
if strings.Contains(strings.ToLower(rel), needle) {
|
||||
out = append(out, rel)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
sort.Strings(out)
|
||||
if len(out) == 0 {
|
||||
return "(no files matched)", nil
|
||||
}
|
||||
res := strings.Join(out, "\n")
|
||||
if truncated {
|
||||
res += fmt.Sprintf("\n... (truncated at %d files; narrow the name)", limit)
|
||||
}
|
||||
return res, nil
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
func (r *repoFS) getDiffTool() llm.Tool {
|
||||
return llm.DefineTool[struct{}](
|
||||
"get_diff",
|
||||
"Return the complete unified diff under review. The diff is also included (possibly truncated) in the task message; call this to get the full, untruncated text.",
|
||||
func(_ context.Context, _ struct{}) (any, error) {
|
||||
if strings.TrimSpace(r.diff) == "" {
|
||||
return "(empty diff)", nil
|
||||
}
|
||||
return r.diff, nil
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// buildFixtureRepo lays down a small repo tree for the toolbox tests and
|
||||
// returns its root.
|
||||
func buildFixtureRepo(t *testing.T) string {
|
||||
t.Helper()
|
||||
root := t.TempDir()
|
||||
write := func(rel, content string) {
|
||||
p := filepath.Join(root, rel)
|
||||
if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(p, []byte(content), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
write("pkg/foo/foo.go", "package foo\n\nfunc Hello() string {\n\treturn \"hi\"\n}\n")
|
||||
write("pkg/foo/bar.go", "package foo\n\n// TODO: refactor\nvar Answer = 42\n")
|
||||
write("README.md", "# Fixture\n\nHello world.\n")
|
||||
write(".git/config", "[core]\n\tbare = false\n") // must be skipped by grep/find
|
||||
write("secret.txt", "this file lives at the repo root\n")
|
||||
return root
|
||||
}
|
||||
|
||||
// call invokes a tool from the sandbox's toolbox by name with JSON args and
|
||||
// returns the result string (or the error).
|
||||
func call(t *testing.T, fs *repoFS, name string, args map[string]any) (string, error) {
|
||||
t.Helper()
|
||||
box, err := fs.toolbox()
|
||||
if err != nil {
|
||||
t.Fatalf("toolbox: %v", err)
|
||||
}
|
||||
tool, ok := box.Get(name)
|
||||
if !ok {
|
||||
t.Fatalf("tool %q not in toolbox", name)
|
||||
}
|
||||
raw, err := json.Marshal(args)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
out, herr := tool.Handler(context.Background(), raw)
|
||||
if herr != nil {
|
||||
return "", herr
|
||||
}
|
||||
s, _ := out.(string)
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func TestRepoFS_ResolveSandbox(t *testing.T) {
|
||||
root := buildFixtureRepo(t)
|
||||
fs, err := newRepoFS(root, "")
|
||||
if err != nil {
|
||||
t.Fatalf("newRepoFS: %v", err)
|
||||
}
|
||||
|
||||
// In-bounds paths resolve.
|
||||
if _, err := fs.resolve("pkg/foo/foo.go"); err != nil {
|
||||
t.Errorf("in-bounds path rejected: %v", err)
|
||||
}
|
||||
if got, err := fs.resolve(""); err != nil || got != fs.root {
|
||||
t.Errorf("empty path should be root: got %q err %v", got, err)
|
||||
}
|
||||
|
||||
// Escapes are rejected.
|
||||
for _, bad := range []string{"../outside", "../../etc/passwd", "pkg/../../escape", "/etc/passwd"} {
|
||||
if _, err := fs.resolve(bad); err == nil {
|
||||
t.Errorf("path %q escaped the sandbox but was allowed", bad)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadFileTool(t *testing.T) {
|
||||
root := buildFixtureRepo(t)
|
||||
fs, _ := newRepoFS(root, "")
|
||||
|
||||
out, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go"})
|
||||
if err != nil {
|
||||
t.Fatalf("read_file: %v", err)
|
||||
}
|
||||
if !strings.Contains(out, "func Hello()") {
|
||||
t.Errorf("expected file body, got:\n%s", out)
|
||||
}
|
||||
if !strings.Contains(out, "1\t") {
|
||||
t.Errorf("expected line numbers, got:\n%s", out)
|
||||
}
|
||||
|
||||
// Line slicing.
|
||||
out, err = call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go", "start_line": 3, "limit": 1})
|
||||
if err != nil {
|
||||
t.Fatalf("read_file slice: %v", err)
|
||||
}
|
||||
if !strings.Contains(out, "func Hello()") || strings.Contains(out, "package foo") {
|
||||
t.Errorf("slice should start at line 3 only, got:\n%s", out)
|
||||
}
|
||||
|
||||
// Reading a directory is an error directing to list_dir.
|
||||
if _, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo"}); err == nil {
|
||||
t.Error("reading a directory should error")
|
||||
}
|
||||
|
||||
// Escape is rejected.
|
||||
if _, err := call(t, fs, "read_file", map[string]any{"path": "../escape"}); err == nil {
|
||||
t.Error("read_file should reject sandbox escape")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListDirTool(t *testing.T) {
|
||||
root := buildFixtureRepo(t)
|
||||
fs, _ := newRepoFS(root, "")
|
||||
|
||||
out, err := call(t, fs, "list_dir", map[string]any{"path": "pkg/foo"})
|
||||
if err != nil {
|
||||
t.Fatalf("list_dir: %v", err)
|
||||
}
|
||||
for _, want := range []string{"foo.go", "bar.go"} {
|
||||
if !strings.Contains(out, want) {
|
||||
t.Errorf("list_dir missing %q in:\n%s", want, out)
|
||||
}
|
||||
}
|
||||
|
||||
// Root listing marks directories with a trailing slash.
|
||||
out, _ = call(t, fs, "list_dir", map[string]any{})
|
||||
if !strings.Contains(out, "pkg/") {
|
||||
t.Errorf("expected pkg/ (dir with trailing slash) in root listing:\n%s", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGrepTool(t *testing.T) {
|
||||
root := buildFixtureRepo(t)
|
||||
fs, _ := newRepoFS(root, "")
|
||||
|
||||
out, err := call(t, fs, "grep", map[string]any{"pattern": "func Hello"})
|
||||
if err != nil {
|
||||
t.Fatalf("grep: %v", err)
|
||||
}
|
||||
if !strings.Contains(out, "pkg/foo/foo.go:") {
|
||||
t.Errorf("grep should locate the func, got:\n%s", out)
|
||||
}
|
||||
|
||||
// .git is skipped.
|
||||
out, _ = call(t, fs, "grep", map[string]any{"pattern": "bare = false"})
|
||||
if strings.Contains(out, ".git/") {
|
||||
t.Errorf("grep must not descend into .git, got:\n%s", out)
|
||||
}
|
||||
|
||||
// No matches is a clean message, not an error.
|
||||
out, err = call(t, fs, "grep", map[string]any{"pattern": "zzz_no_such_token_zzz"})
|
||||
if err != nil || !strings.Contains(out, "no matches") {
|
||||
t.Errorf("expected clean no-match, got %q err %v", out, err)
|
||||
}
|
||||
|
||||
// Invalid regexp surfaces as an error.
|
||||
if _, err := call(t, fs, "grep", map[string]any{"pattern": "([unterminated"}); err == nil {
|
||||
t.Error("invalid regexp should error")
|
||||
}
|
||||
|
||||
// Scoped grep honors the path.
|
||||
out, _ = call(t, fs, "grep", map[string]any{"pattern": "Answer", "path": "pkg/foo/bar.go"})
|
||||
if !strings.Contains(out, "bar.go:") {
|
||||
t.Errorf("scoped grep missed the match:\n%s", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindFilesTool(t *testing.T) {
|
||||
root := buildFixtureRepo(t)
|
||||
fs, _ := newRepoFS(root, "")
|
||||
|
||||
out, err := call(t, fs, "find_files", map[string]any{"name": "foo.go"})
|
||||
if err != nil {
|
||||
t.Fatalf("find_files: %v", err)
|
||||
}
|
||||
if !strings.Contains(out, "pkg/foo/foo.go") {
|
||||
t.Errorf("find_files missed foo.go:\n%s", out)
|
||||
}
|
||||
|
||||
// Case-insensitive substring on the path.
|
||||
out, _ = call(t, fs, "find_files", map[string]any{"name": "PKG/FOO"})
|
||||
if !strings.Contains(out, "pkg/foo/") {
|
||||
t.Errorf("find_files should be case-insensitive on the path:\n%s", out)
|
||||
}
|
||||
|
||||
// .git entries are not surfaced.
|
||||
out, _ = call(t, fs, "find_files", map[string]any{"name": "config"})
|
||||
if strings.Contains(out, ".git/") {
|
||||
t.Errorf("find_files must skip .git, got:\n%s", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetDiffTool(t *testing.T) {
|
||||
root := buildFixtureRepo(t)
|
||||
const diff = "diff --git a/x b/x\n+added line\n"
|
||||
fs, _ := newRepoFS(root, diff)
|
||||
|
||||
out, err := call(t, fs, "get_diff", map[string]any{})
|
||||
if err != nil {
|
||||
t.Fatalf("get_diff: %v", err)
|
||||
}
|
||||
if out != diff {
|
||||
t.Errorf("get_diff returned %q, want %q", out, diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewRepoFS_BadRoot(t *testing.T) {
|
||||
// A file (not a directory) is rejected.
|
||||
f := filepath.Join(t.TempDir(), "afile")
|
||||
if err := os.WriteFile(f, []byte("x"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := newRepoFS(f, ""); err == nil {
|
||||
t.Error("newRepoFS should reject a non-directory root")
|
||||
}
|
||||
if _, err := newRepoFS(filepath.Join(t.TempDir(), "missing"), ""); err == nil {
|
||||
t.Error("newRepoFS should reject a missing root")
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure the toolbox exposes exactly the expected tools (guards against an
|
||||
// accidental rename breaking the system prompt's tool references).
|
||||
func TestToolbox_Names(t *testing.T) {
|
||||
fs, _ := newRepoFS(t.TempDir(), "")
|
||||
box, err := fs.toolbox()
|
||||
if err != nil {
|
||||
t.Fatalf("toolbox: %v", err)
|
||||
}
|
||||
got := map[string]bool{}
|
||||
for _, tl := range box.Tools() {
|
||||
got[tl.Name] = true
|
||||
}
|
||||
for _, want := range []string{"read_file", "list_dir", "grep", "find_files", "get_diff"} {
|
||||
if !got[want] {
|
||||
t.Errorf("toolbox missing tool %q", want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,143 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
||||
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
|
||||
)
|
||||
|
||||
// spinToolCall is a response that asks for the get_diff tool (which succeeds and
|
||||
// ignores extra args), used to burn agent steps without producing a final
|
||||
// answer. The args vary by n so successive calls are not byte-identical — that
|
||||
// dodges the agent's same-call loop guard, exactly as a real reviewer making
|
||||
// distinct tool calls would.
|
||||
func spinToolCall(n int) fake.Step {
|
||||
return fake.ReplyWith(llm.Response{
|
||||
ToolCalls: []llm.ToolCall{{
|
||||
ID: "call",
|
||||
Name: "get_diff",
|
||||
Arguments: json.RawMessage(fmt.Sprintf(`{"_n":%d}`, n)),
|
||||
}},
|
||||
FinishReason: llm.FinishToolCalls,
|
||||
Usage: llm.Usage{InputTokens: 1, OutputTokens: 1},
|
||||
})
|
||||
}
|
||||
|
||||
// lastUserText returns the text of the final message in the request, which is
|
||||
// what a fresh Generate call is reacting to.
|
||||
func lastUserText(req llm.Request) string {
|
||||
if len(req.Messages) == 0 {
|
||||
return ""
|
||||
}
|
||||
return req.Messages[len(req.Messages)-1].Text()
|
||||
}
|
||||
|
||||
// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
|
||||
// it is nudged to wrap up should still finish inside its budget — the steer
|
||||
// message arrives a few steps before the cap and the model writes its answer.
|
||||
func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
|
||||
t.Setenv("GADFLY_WRAPUP_RESERVE", "4")
|
||||
|
||||
final := "VERDICT: No material issues found."
|
||||
nudgeSeen := false
|
||||
n := 0
|
||||
p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
|
||||
if strings.Contains(lastUserText(req), "almost out of your investigation budget") {
|
||||
nudgeSeen = true
|
||||
return fake.Reply(final)
|
||||
}
|
||||
n++
|
||||
return spinToolCall(n)
|
||||
}))
|
||||
mdl, err := p.Model("mock")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
|
||||
|
||||
out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
|
||||
if err != nil {
|
||||
t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
|
||||
}
|
||||
if out != final {
|
||||
t.Errorf("expected final review %q, got %q", final, out)
|
||||
}
|
||||
if !nudgeSeen {
|
||||
t.Error("the wrap-up nudge was never delivered to the model")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
|
||||
// spins on tools until the cap should NOT hard-fail — the tool-free finalization
|
||||
// pass forces a final answer out of the transcript.
|
||||
func TestRunAgent_FinalizationFallback(t *testing.T) {
|
||||
t.Setenv("GADFLY_WRAPUP_RESERVE", "2")
|
||||
|
||||
final := "VERDICT: Minor issues\n- something"
|
||||
forcedCalled := false
|
||||
n := 0
|
||||
p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
|
||||
// Only the tool-free finalization pass forbids tools — reply there.
|
||||
if req.ToolChoice == "none" {
|
||||
forcedCalled = true
|
||||
return fake.Reply(final)
|
||||
}
|
||||
// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
|
||||
n++
|
||||
return spinToolCall(n)
|
||||
}))
|
||||
mdl, err := p.Model("mock")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
|
||||
|
||||
out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
|
||||
if err != nil {
|
||||
t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
|
||||
}
|
||||
if !forcedCalled {
|
||||
t.Error("finalization fallback was never invoked")
|
||||
}
|
||||
if out != final {
|
||||
t.Errorf("expected forced final answer %q, got %q", final, out)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
|
||||
// yields nothing, runAgent surfaces an error rather than a phantom success.
|
||||
func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
|
||||
n := 0
|
||||
p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
|
||||
if req.ToolChoice == "none" {
|
||||
return fake.Reply(" ") // finalization produces only whitespace
|
||||
}
|
||||
n++
|
||||
return spinToolCall(n)
|
||||
}))
|
||||
mdl, err := p.Model("mock")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
|
||||
|
||||
if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
|
||||
t.Error("runAgent should error when the finalization fallback also yields no output")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWrapUpReserve(t *testing.T) {
|
||||
t.Setenv("GADFLY_WRAPUP_RESERVE", "")
|
||||
if got := wrapUpReserve(); got != defaultWrapUpReserve {
|
||||
t.Errorf("default wrap-up reserve = %d, want %d", got, defaultWrapUpReserve)
|
||||
}
|
||||
t.Setenv("GADFLY_WRAPUP_RESERVE", "7")
|
||||
if got := wrapUpReserve(); got != 7 {
|
||||
t.Errorf("wrap-up reserve override = %d, want 7", got)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user