Gadfly: agentic adversarial PR reviewer (initial extraction)

Standalone, Docker-packaged extraction of the agentic PR reviewer that runs in
Gitea Actions: reads the checked-out repo with read-only tools (read_file/grep/
find_files/get_diff), verifies findings before reporting, two-pass review +
adversarial recheck, posts one labeled comment per model. Advisory only.

- cmd/gadfly: reviewer binary (majordomo + Ollama Cloud), zero deps beyond stdlib + majordomo
- entrypoint.sh: container brains — trigger gating, PR clone, model loop (logic out of YAML)
- Dockerfile: multi-stage; build-time module token never reaches the final image
- .gitea/workflows/build-image.yml: tag v* → build & push image
- examples/: ~15-line consumer stub
- system prompt genericized + hardened to re-derive constants/formulas (semantic bugs)

Vibe-coded with Claude Code; see README disclosure. Advisory, never blocks merge.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Steve Dudenhoeffer
2026-06-25 18:42:20 -04:00
commit c0d0152a34
18 changed files with 1879 additions and 0 deletions
+287
View File
@@ -0,0 +1,287 @@
// Command gadfly is the agentic backend for the PR adversarial-review
// workflow (.gitea/workflows/pr-adversarial-review.yml). Unlike the old
// one-shot chat call, it runs a tool-using agent (majordomo + Ollama Cloud)
// over the PR's CHECKED-OUT repository: the model can read_file / list_dir /
// grep / find_files / get_diff to VERIFY a finding before reporting it, which
// kills the "diff-only" false positives (claiming a missing import or a
// non-existent method it simply couldn't see).
//
// It is a pure producer of review text: it reads the diff + the repo and
// prints the review markdown to stdout. All Gitea I/O (fetching the diff,
// upserting the comment) stays in run.sh, so this binary needs no repo write
// access and is straightforward to unit-test.
//
// Two passes (unless the draft is a clean "no material issues" pass): a
// REVIEW pass produces a draft, then an adversarial RECHECK pass independently
// re-verifies every finding against the actual files with the same tools and
// drops the ones it cannot confirm, recomputing the verdict. This catches the
// "confident but wrong" findings that survive a single pass — e.g. claiming an
// env var is unset when a wrapper script sets it (see recheck.go).
//
// Inputs (env):
//
// OLLAMA_API_KEY Ollama Cloud bearer key (required).
// GADFLY_MODEL model id, e.g. "qwen3-coder:480b-cloud" (required).
// GADFLY_REPO_DIR path to the checked-out repo (required; the FS sandbox root).
// GADFLY_DIFF_FILE path to a file holding the full unified diff (required).
// GADFLY_SYSTEM_FILE path to the reviewer system prompt (required).
// GADFLY_TITLE PR title (optional).
// GADFLY_BODY PR description (optional).
// GADFLY_MAX_STEPS review-pass step cap (optional, default 24).
// GADFLY_WRAPUP_RESERVE steps before the cap at which the agent is told to
// stop investigating and write its answer (optional,
// default 4). Plus a tool-free finalization fallback
// guarantees a step-exhausted pass still emits output.
// GADFLY_RECHECK set to 0/false/no/off (case-insensitive) to skip the recheck pass (optional, default on).
// GADFLY_RECHECK_MAX_STEPS recheck-pass step cap (optional, default 16).
// GADFLY_TIMEOUT_SECS overall deadline in seconds, shared by both passes (optional, default 300).
// GADFLY_MAX_DIFF_CHARS diff chars embedded in the prompt (optional, default 60000;
// the full diff is always available via the get_diff tool).
//
// On success it prints the review to stdout and exits 0. On a usage/config or
// model error it prints a diagnostic to stderr and exits non-zero; run.sh then
// posts a "reviewer failed" notice (advisory — never fails the CI job).
package main
import (
"context"
"errors"
"fmt"
"os"
"strconv"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
)
const (
	// defaultMaxSteps is the review-pass step cap used when GADFLY_MAX_STEPS
	// is unset or not a positive integer.
	defaultMaxSteps = 24
	// defaultTimeoutSecs is the overall deadline in seconds, shared by both
	// passes, used when GADFLY_TIMEOUT_SECS is unset or not a positive integer.
	defaultTimeoutSecs = 300
	// defaultMaxDiffChars bounds how much of the diff is embedded in the
	// prompt when GADFLY_MAX_DIFF_CHARS is unset or not a positive integer.
	defaultMaxDiffChars = 60000
	// defaultWrapUpReserve is how many steps before the cap the agent is told
	// to stop investigating and write its final answer. Reserving a margin is
	// what keeps a thorough reviewer from spending its whole budget on tool
	// calls and then hard-failing with "max steps reached without a final
	// answer" — it always has a few steps left to wrap up.
	defaultWrapUpReserve = 4
)
// wrapUpInstruction is the user message steered into a running agent once its
// completed-step count comes within the wrap-up reserve of the step cap: a
// forceful nudge to stop calling tools and emit the final answer using only
// what it has already gathered. runAgent sends it at most once per pass.
const wrapUpInstruction = "⚠️ You are almost out of your investigation budget — only a few tool steps remain. " +
	"STOP calling tools now and write your FINAL answer immediately, using only what you have already verified. " +
	"Do not begin any new investigation. If a finding could not be confirmed, drop it or mark it explicitly as unverified. " +
	"Output the review in the required format right now."
// finalizeInstruction is the user message appended to the transcript for the
// tool-free fallback call made when the agent exhausted its budget (or tripped
// a loop guard) without ever producing a final answer. It forces the model to
// synthesize an answer from whatever it has already gathered.
const finalizeInstruction = "You have run out of investigation steps. Do NOT call any tools. " +
	"Based solely on what you have already gathered above, write your final answer now in the required format. " +
	"If you could not confirm some findings, omit them or mark them as unverified, but produce the answer."
// main runs the reviewer and maps any failure to a "gadfly:"-prefixed
// diagnostic on stderr plus exit status 1.
func main() {
	err := run()
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, "gadfly:", err)
	os.Exit(1)
}
// run performs one complete review and prints the resulting markdown to
// stdout.
//
// It validates the required environment, loads the diff and the system prompt,
// builds the sandboxed repo tools and the model, and then runs up to two agent
// passes under a single shared deadline (GADFLY_TIMEOUT_SECS):
//
//  1. the review pass, whose failure is fatal;
//  2. the recheck pass, which only runs when shouldRecheck approves the draft
//     and whose failure is logged to stderr while the unverified draft is
//     printed instead.
//
// It returns a non-nil error for any configuration, I/O, or review-pass
// failure.
func run() error {
	apiKey := os.Getenv("OLLAMA_API_KEY")
	if apiKey == "" {
		return errors.New("OLLAMA_API_KEY is required")
	}

	var (
		model      = os.Getenv("GADFLY_MODEL")
		repoDir    = os.Getenv("GADFLY_REPO_DIR")
		diffFile   = os.Getenv("GADFLY_DIFF_FILE")
		systemFile = os.Getenv("GADFLY_SYSTEM_FILE")
	)
	for _, required := range []string{model, repoDir, diffFile, systemFile} {
		if required == "" {
			return errors.New("GADFLY_MODEL, GADFLY_REPO_DIR, GADFLY_DIFF_FILE and GADFLY_SYSTEM_FILE are all required")
		}
	}

	rawDiff, err := os.ReadFile(diffFile)
	if err != nil {
		return fmt.Errorf("read diff file: %w", err)
	}
	diff := string(rawDiff)
	if strings.TrimSpace(diff) == "" {
		return errors.New("empty diff; nothing to review")
	}

	rawSystem, err := os.ReadFile(systemFile)
	if err != nil {
		return fmt.Errorf("read system prompt: %w", err)
	}

	sandbox, err := newRepoFS(repoDir, diff)
	if err != nil {
		return err
	}

	mdl, err := ollama.Cloud(ollama.WithToken(apiKey)).Model(model)
	if err != nil {
		return fmt.Errorf("build model %q: %w", model, err)
	}

	// One deadline covers both passes, so a slow review pass leaves less time
	// for the recheck.
	deadline := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), deadline)
	defer cancel()

	// Pass 1 — review: produce the draft.
	draft, err := runAgent(ctx, mdl, sandbox, string(rawSystem), buildTask(diff),
		envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
	if err != nil {
		return fmt.Errorf("review pass: %w", err)
	}

	// Pass 2 — recheck: adversarially re-verify the draft's findings. A failed
	// recheck is non-fatal; the unverified draft is better than no review.
	review := draft
	if shouldRecheck(draft) {
		rechecked, rerr := runAgent(ctx, mdl, sandbox, recheckSystemPrompt, buildRecheckTask(draft, diff),
			envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps))
		switch {
		case rerr != nil:
			fmt.Fprintln(os.Stderr, "gadfly: recheck pass failed; emitting unverified draft:", rerr)
		default:
			review = rechecked
		}
	}

	fmt.Println(review)
	return nil
}
// runAgent executes a single agent pass over a freshly built toolbox and
// returns the model's trimmed final answer.
//
// A non-empty answer is returned with a nil error even when the loop itself
// ended with an error (the model wrote its output and then ran out of steps).
// An empty answer is always an error; the caller decides whether that is fatal
// (review pass) or recoverable (recheck pass).
//
// Two safeguards keep a step-hungry model from ending with nothing:
//
//  1. Wrap-up steer: once the number of completed steps reaches
//     maxSteps - wrapUpReserve() (clamped to at least 1), wrapUpInstruction is
//     injected exactly once so the remaining steps go to finalizing.
//  2. Finalization fallback: if the loop still ends without output, a
//     transcript exists, and ctx is not yet done, forceFinalAnswer makes one
//     tool-free call to squeeze an answer out of what was gathered.
func runAgent(ctx context.Context, mdl llm.Model, fsTools *repoFS, system, task string, maxSteps int) (string, error) {
	box, err := fsTools.toolbox()
	if err != nil {
		return "", err
	}

	loop := agent.New(mdl, system,
		agent.WithToolbox(box),
		agent.WithMaxSteps(maxSteps),
		// Guard rails: stop the model from spinning on failing or identical
		// tool calls instead of writing its answer.
		agent.WithToolErrorLimits(4, 4),
	)

	nudgeAt := maxSteps - wrapUpReserve()
	if nudgeAt < 1 {
		nudgeAt = 1
	}

	var (
		stepsDone int  // steps finished so far, refreshed after every step
		nudgeSent bool // the wrap-up message is delivered at most once
	)
	trackStep := func(s agent.Step) { stepsDone = s.Index + 1 }
	steer := func() []llm.Message {
		if nudgeSent || stepsDone < nudgeAt {
			return nil
		}
		nudgeSent = true
		return []llm.Message{llm.UserText(wrapUpInstruction)}
	}

	res, runErr := loop.Run(ctx, task, agent.OnStep(trackStep), agent.WithSteer(steer))

	if res != nil {
		if answer := strings.TrimSpace(res.Output); answer != "" {
			return answer, nil
		}
		// No final answer. With time left on the clock and a transcript to
		// work from, force a tool-free finalization rather than lose the pass.
		if len(res.Messages) > 0 && ctx.Err() == nil {
			if forced := forceFinalAnswer(ctx, mdl, system, res.Messages); forced != "" {
				return forced, nil
			}
		}
	}

	if runErr != nil {
		return "", runErr
	}
	return "", errors.New("agent produced no output")
}
// forceFinalAnswer issues one tool-free model call to extract a final answer
// from an agent that used up its step budget without producing one.
//
// The request carries a copy of the transcript followed by finalizeInstruction
// and sets ToolChoice to "none", so the model has to synthesize from what it
// already gathered. The call is best-effort: an error, a nil response, or an
// all-whitespace reply yields "" and the caller falls back to its normal
// empty-output handling.
func forceFinalAnswer(ctx context.Context, mdl llm.Model, system string, transcript []llm.Message) string {
	// Copy first so the caller's transcript slice is never appended to in place.
	history := make([]llm.Message, 0, len(transcript)+1)
	history = append(history, transcript...)
	history = append(history, llm.UserText(finalizeInstruction))

	req := llm.Request{
		System:     system,
		Messages:   history,
		ToolChoice: "none",
	}
	resp, err := mdl.Generate(ctx, req)
	if err != nil {
		return ""
	}
	if resp == nil {
		return ""
	}
	return strings.TrimSpace(resp.Text())
}
// wrapUpReserve returns how many steps before the cap the wrap-up nudge fires.
// The value comes from GADFLY_WRAPUP_RESERVE and falls back to
// defaultWrapUpReserve when the variable is unset, unparseable, or not a
// positive integer (envInt's rules), so the result is always >= 1.
func wrapUpReserve() int {
	return envInt("GADFLY_WRAPUP_RESERVE", defaultWrapUpReserve)
}
// buildTask assembles the review pass's user message: optional PR metadata
// (GADFLY_TITLE, GADFLY_BODY), the verification instruction, and the unified
// diff inside a ```diff fence.
//
// The embedded diff is limited to GADFLY_MAX_DIFF_CHARS bytes (default
// defaultMaxDiffChars). When it is cut, the cut is moved back to the nearest
// UTF-8 character boundary so the prompt never ends in a broken multi-byte
// sequence, and a note tells the model to call get_diff for the full text.
func buildTask(diff string) string {
	title := os.Getenv("GADFLY_TITLE")
	body := os.Getenv("GADFLY_BODY")
	maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)

	truncNote := ""
	if maxDiff > 0 && len(diff) > maxDiff {
		// A UTF-8 continuation byte has the bit pattern 10xxxxxx. If the byte
		// at the cut is one, the cut lands inside a character, so step back to
		// that character's first byte. A character has at most three
		// continuation bytes, which bounds the walk even on malformed input.
		cut := maxDiff
		for back := 0; back < 3 && cut > 0 && diff[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		diff = diff[:cut]
		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; call get_diff for the full text.]", maxDiff)
	}

	var b strings.Builder
	if title != "" {
		fmt.Fprintf(&b, "PR title: %s\n\n", title)
	}
	if strings.TrimSpace(body) != "" {
		fmt.Fprintf(&b, "PR description:\n%s\n\n", body)
	}
	b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your tools (read_file, grep, find_files) to verify it against the actual checked-out code — do not rely on the diff alone.\n\n")
	fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
	return b.String()
}
// envInt returns the integer value of environment variable name.
//
// Surrounding whitespace is ignored. def is returned when the variable is
// unset or blank, is not a valid integer, or parses to zero or a negative
// number, so callers only ever see a positive override or their own default.
func envInt(name string, def int) int {
	raw := strings.TrimSpace(os.Getenv(name))
	if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
		return parsed
	}
	return def
}
+97
View File
@@ -0,0 +1,97 @@
package main
import (
"fmt"
"os"
"strings"
)
// defaultRecheckMaxSteps is the recheck-pass step cap used when
// GADFLY_RECHECK_MAX_STEPS is unset or not a positive integer. It is smaller
// than the review pass's cap: re-checking a handful of existing findings needs
// fewer steps than discovering them.
const defaultRecheckMaxSteps = 16
// recheckSystemPrompt drives the second, adversarial verification pass. The
// model is given a DRAFT review and must independently confirm each finding
// against the real code before letting it survive — the antidote to a
// single-pass reviewer that reads a couple of files, mis-connects them, and
// posts a confident but wrong "blocking" verdict.
//
// The prompt is deliberately project-agnostic: the reviewer runs against any
// checked-out repository, so it must not describe a specific codebase.
const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review of a
pull request. You are given a DRAFT review produced by another model. Your job
is NOT to write a new review — it is to confirm or reject each finding in the
draft against the ACTUAL code, then output the corrected review.
You have the same read-only repository tools as the original reviewer:
- read_file(path[, start_line, limit]), list_dir([path]), grep(pattern[, path,
max_results]), find_files(name[, max_results]), get_diff().
For EVERY finding in the draft:
1. Independently reproduce the reasoning by reading the actual files with your
tools — do not trust the draft's claim, and do not trust the diff hunk alone.
2. KEEP the finding only if you can positively confirm it against the code.
3. DROP the finding if you cannot confirm it, or if the code contradicts it.
Watch especially for findings that ignore the "glue" around a change — the most
common false positive. Before keeping a claim that something is "missing",
"undefined", "never set", "not exported", or "won't compile", GREP THE WHOLE
REPO for it: the thing is very often satisfied in a place the original reviewer
didn't look — a shell script or Makefile that sets an env var, a CI YAML, an
adjacent file, generated code, or a wrapper that maps one name to another. A
finding that an env var X is unset is wrong if any script invokes the program
with "X=... prog". Check before you keep.
Output rules:
- Output the corrected review in the SAME format as the draft: a one-line
VERDICT ("No material issues found", "Minor issues", or "Blocking issues
found"), then the surviving findings as bullets with path:line and impact.
- Recompute the VERDICT from what SURVIVES. If every finding was dropped, the
verdict is "No material issues found".
- Do NOT invent new findings; this is a verification gate, not a fresh review.
- Do NOT include meta-commentary about the verification process or which
findings you dropped — output only the final, corrected review markdown.
- When done investigating, STOP calling tools and reply with the review.`
// recheckEnabled reports whether the verification pass is switched on.
//
// It is on by default. Only an explicit falsey GADFLY_RECHECK value — "0",
// "false", "no" or "off", compared case-insensitively after trimming
// whitespace — turns it off; any other value (including empty) leaves it on.
func recheckEnabled() bool {
	setting := strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_RECHECK")))
	disabled := setting == "0" || setting == "false" || setting == "no" || setting == "off"
	return !disabled
}
// shouldRecheck reports whether the verification pass should run for draft.
//
// It returns false when rechecking is disabled (see recheckEnabled), and also
// when the draft contains the phrase "no material issues" in any letter case:
// a clean draft has nothing to verify, so a whole model pass is saved on clean
// PRs. The match is a plain substring search over the entire draft.
func shouldRecheck(draft string) bool {
	return recheckEnabled() &&
		!strings.Contains(strings.ToLower(draft), "no material issues")
}
// buildRecheckTask builds the verification pass's user message: the
// instruction, the draft review to scrutinize, and the PR diff inside a
// ```diff fence (embedded to save a tool call; the full text stays available
// via get_diff).
//
// The embedded diff is limited to GADFLY_MAX_DIFF_CHARS bytes (default
// defaultMaxDiffChars). When it is cut, the cut is moved back to the nearest
// UTF-8 character boundary so the prompt never ends in a broken multi-byte
// sequence, and a note points the model at get_diff.
func buildRecheckTask(draft, diff string) string {
	maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)

	truncNote := ""
	if maxDiff > 0 && len(diff) > maxDiff {
		// A UTF-8 continuation byte has the bit pattern 10xxxxxx. If the byte
		// at the cut is one, the cut lands inside a character, so step back to
		// that character's first byte. A character has at most three
		// continuation bytes, which bounds the walk even on malformed input.
		cut := maxDiff
		for back := 0; back < 3 && cut > 0 && diff[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		diff = diff[:cut]
		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff for the full text.]", maxDiff)
	}

	var b strings.Builder
	b.WriteString("Verify the following DRAFT review against the actual code, drop every finding you cannot confirm, and output the corrected review.\n\n")
	b.WriteString("## Draft review\n\n")
	b.WriteString(draft)
	b.WriteString("\n\n## PR diff under review\n\n")
	fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
	return b.String()
}
+101
View File
@@ -0,0 +1,101 @@
package main
import (
"context"
"strings"
"testing"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
)
// TestShouldRecheck checks the recheck decision for drafts with findings,
// clean drafts, and explicit opt-out via GADFLY_RECHECK.
func TestShouldRecheck(t *testing.T) {
	cases := []struct {
		env     string // GADFLY_RECHECK value; "" means default (on)
		draft   string
		want    bool
		failure string
	}{
		{"", "VERDICT: Blocking issues found\n- something is wrong", true, "a draft with findings should be rechecked"},
		{"", "No material issues found.", false, "a clean draft should skip recheck"},
		{"", "### review\n\nNo material issues found.\n", false, "clean draft detection should be case/whitespace tolerant"},
		// Explicit disable wins even when there are findings.
		{"0", "Blocking issues found\n- x", false, "GADFLY_RECHECK=0 must disable recheck"},
		{"false", "Blocking issues found\n- x", false, "GADFLY_RECHECK=false must disable recheck"},
	}
	for _, tc := range cases {
		t.Setenv("GADFLY_RECHECK", tc.env)
		if got := shouldRecheck(tc.draft); got != tc.want {
			t.Error(tc.failure)
		}
	}
}
// TestRecheckEnabled pins which GADFLY_RECHECK values keep the recheck pass on
// and which (case- and whitespace-insensitively) switch it off.
func TestRecheckEnabled(t *testing.T) {
	cases := []struct {
		value string
		want  bool
	}{
		{"", true}, {"1", true}, {"true", true}, {"yes", true}, {"anything", true},
		{"0", false}, {"false", false}, {"no", false}, {"off", false}, {"OFF", false}, {" False ", false},
	}
	for _, tc := range cases {
		t.Setenv("GADFLY_RECHECK", tc.value)
		got := recheckEnabled()
		switch {
		case tc.want && !got:
			t.Errorf("GADFLY_RECHECK=%q should be enabled", tc.value)
		case !tc.want && got:
			t.Errorf("GADFLY_RECHECK=%q should be disabled", tc.value)
		}
	}
}
// TestBuildRecheckTask checks that the recheck prompt carries the draft, the
// verify instruction, and the diff.
func TestBuildRecheckTask(t *testing.T) {
	t.Setenv("GADFLY_MAX_DIFF_CHARS", "")

	const draft = "VERDICT: Blocking issues found\n- foo.go:1 broken"
	task := buildRecheckTask(draft, "diff --git a/x b/x\n+y\n")

	if !strings.Contains(task, draft) {
		t.Error("recheck task must include the draft review")
	}
	hasInstruction := strings.Contains(task, "Verify") &&
		strings.Contains(task, "drop every finding you cannot confirm")
	if !hasInstruction {
		t.Errorf("recheck task missing the verify instruction:\n%s", task)
	}
	if !strings.Contains(task, "diff --git") {
		t.Error("recheck task should include the diff")
	}
}
// fakeModel returns a fake majordomo model whose default handler always
// replies with reply and never requests a tool, so an agent loop driven by it
// ends on its first step. Any setup failure aborts the calling test.
func fakeModel(t *testing.T, reply string) llm.Model {
	t.Helper()

	alwaysReply := func(string, llm.Request) fake.Step {
		return fake.Reply(reply)
	}
	provider := fake.New("fake", fake.WithDefault(alwaysReply))

	model, err := provider.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	return model
}
// TestRunAgent_ReturnsOutput checks that runAgent hands back the model's reply
// with surrounding whitespace trimmed.
func TestRunAgent_ReturnsOutput(t *testing.T) {
	const (
		reply = "  corrected review: No material issues found.  "
		want  = "corrected review: No material issues found."
	)

	sandbox, err := newRepoFS(t.TempDir(), "diff")
	if err != nil {
		t.Fatal(err)
	}

	got, err := runAgent(context.Background(), fakeModel(t, reply), sandbox, "sys", "task", 4)
	if err != nil {
		t.Fatalf("runAgent: %v", err)
	}
	if got != want {
		t.Errorf("runAgent should return trimmed model output, got %q", got)
	}
}
// TestRunAgent_EmptyIsError checks that an all-whitespace model reply is
// reported as an error rather than returned as an empty review.
func TestRunAgent_EmptyIsError(t *testing.T) {
	// Fail fast on a broken fixture instead of passing a nil sandbox on, which
	// would make the assertion below pass or fail for the wrong reason.
	fs, err := newRepoFS(t.TempDir(), "diff")
	if err != nil {
		t.Fatal(err)
	}
	mdl := fakeModel(t, "   ")
	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
		t.Error("runAgent should error on empty model output")
	}
}
+388
View File
@@ -0,0 +1,388 @@
package main
import (
"bufio"
"context"
"fmt"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// Tool output bounds. The reviewer is a chat agent with a finite context, so
// every tool caps how much it can pull in one call — a runaway read_file or
// grep would blow the window and stall the loop. Sizes are in bytes; the
// model may ask for less than a cap but never more.
const (
	maxFileBytes   = 64 * 1024 // per read_file call
	maxReadLines   = 800       // per read_file call
	maxGrepResults = 200       // per grep call
	maxFindResults = 200       // per find_files call
	maxLineLen     = 400       // truncate any single returned line to this
)
// skipDirs are never descended into by grep / find_files — noise and bulk that
// a code reviewer never needs and that would swamp the results. Matching is by
// directory base name at any depth; the directory a walk starts from is
// exempt, so it is still searched when named explicitly.
var skipDirs = map[string]bool{
	".git":         true,
	"node_modules": true,
	"vendor":       true,
}
// repoFS is a read-only, sandboxed view of the checked-out repository. Every
// path argument from the model is resolved against root and rejected if it
// escapes — by `..` traversal, by an absolute path, or through a symlink whose
// target lies outside root — so a hostile diff can never make the reviewer
// read outside the checkout.
type repoFS struct {
	root string // absolute, symlink-resolved repo root
	diff string // the full PR unified diff (served by get_diff)
}

// newRepoFS builds a sandbox rooted at root, which is made absolute and
// symlink-free. It returns an error when root cannot be made absolute, does
// not exist, or is not a directory.
func newRepoFS(root, diff string) (*repoFS, error) {
	abs, err := filepath.Abs(root)
	if err != nil {
		return nil, fmt.Errorf("resolve repo dir: %w", err)
	}
	// EvalSymlinks so containment checks survive a symlinked root
	// (e.g. macOS /tmp -> /private/tmp). A failure here is not reported
	// directly: the Stat below surfaces a missing or unreadable root.
	if resolved, err := filepath.EvalSymlinks(abs); err == nil {
		abs = resolved
	}
	info, err := os.Stat(abs)
	if err != nil {
		return nil, fmt.Errorf("repo dir %q: %w", root, err)
	}
	if !info.IsDir() {
		return nil, fmt.Errorf("repo dir %q is not a directory", root)
	}
	return &repoFS{root: abs, diff: diff}, nil
}

// resolve maps a model-supplied path to an absolute path inside the sandbox.
//
// An empty path, "." or "./" means the root. Relative paths are joined onto
// root; an absolute path is accepted only if it already points inside root.
// The cleaned path must be lexically inside root AND, once symlinks are
// followed, still inside root. For a path that exists, the symlink-free real
// path is returned; for one that does not exist yet, the cleaned path is
// returned so the caller's own stat/open reports "not found".
func (r *repoFS) resolve(rel string) (string, error) {
	rel = strings.TrimSpace(rel)
	rel = strings.TrimPrefix(rel, "./")
	if rel == "" || rel == "." {
		return r.root, nil
	}

	candidate := filepath.Clean(rel)
	if !filepath.IsAbs(rel) {
		candidate = filepath.Clean(filepath.Join(r.root, rel))
	}
	// Lexical check first: rejects `..` traversal and foreign absolute paths
	// without touching the filesystem.
	if err := r.contains(candidate); err != nil {
		return "", err
	}
	return r.realPath(candidate)
}

// realPath follows symlinks in lexical (already lexically contained) and
// rejects the path if it leads outside root. If lexical itself does not
// resolve (missing file, dangling link), its nearest resolvable ancestor is
// checked instead, so a missing leaf under a symlinked-out directory is still
// refused.
func (r *repoFS) realPath(lexical string) (string, error) {
	probe := lexical
	for {
		resolved, err := filepath.EvalSymlinks(probe)
		if err == nil {
			if cerr := r.contains(resolved); cerr != nil {
				return "", cerr
			}
			if probe == lexical {
				return resolved, nil
			}
			// Only an ancestor resolved: the target does not exist (yet).
			return lexical, nil
		}
		parent := filepath.Dir(probe)
		if probe == r.root || parent == probe {
			// Even the root failed to resolve; nothing below it can be trusted.
			return "", fmt.Errorf("resolve repo path: %w", err)
		}
		probe = parent
	}
}

// contains verifies abs is root or lives beneath it. The comparison is purely
// lexical, so callers must pass a cleaned (and, for symlink safety, resolved)
// absolute path.
func (r *repoFS) contains(abs string) error {
	// filepath.Rel also handles a root of "/" correctly, which a
	// root+separator prefix test does not.
	rel, err := filepath.Rel(r.root, abs)
	if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
		return fmt.Errorf("path escapes the repository sandbox")
	}
	return nil
}
// toolbox returns a new "gadfly" toolbox holding the five read-only review
// tools bound to this sandbox: read_file, list_dir, grep, find_files and
// get_diff. It fails if any tool cannot be added.
func (r *repoFS) toolbox() (*llm.Toolbox, error) {
	box := llm.NewToolbox("gadfly")
	for _, tool := range []llm.Tool{
		r.readFileTool(),
		r.listDirTool(),
		r.grepTool(),
		r.findFilesTool(),
		r.getDiffTool(),
	} {
		if err := box.Add(tool); err != nil {
			return nil, fmt.Errorf("add tool %q: %w", tool.Name, err)
		}
	}
	return box, nil
}
// readFileArgs are the model-supplied arguments of the read_file tool. Only
// Path is required; a StartLine below 1 is treated as 1, and a Limit that is
// non-positive or above maxReadLines is treated as maxReadLines.
type readFileArgs struct {
	Path      string `json:"path" description:"Repository-relative path of the file to read, e.g. pkg/logic/agentexec/pipeline.go"`
	StartLine int    `json:"start_line,omitempty" description:"Optional 1-based line to start from (default 1)."`
	Limit     int    `json:"limit,omitempty" description:"Optional max number of lines to return (default/maximum 800)."`
}
// readFileTool defines the read_file tool: it returns a window of a file in
// the sandbox as "<line number>\t<text>" lines.
//
// The window starts at StartLine and ends after Limit lines or once roughly
// maxFileBytes of output have been produced, whichever comes first. When the
// window ends before the file does, a trailing "... (truncated ...)" line
// tells the model which start_line to continue from. Errors are returned for
// paths outside the sandbox, missing files, directories, and read failures.
func (r *repoFS) readFileTool() llm.Tool {
	return llm.DefineTool[readFileArgs](
		"read_file",
		"Read a file from the repository at its current checked-out state, with line numbers. Use this to verify the surrounding code, imports, and symbols a diff hunk touches before reporting an issue.",
		func(_ context.Context, args readFileArgs) (any, error) {
			abs, err := r.resolve(args.Path)
			if err != nil {
				return nil, err
			}
			info, err := os.Stat(abs)
			if err != nil {
				return nil, fmt.Errorf("stat %q: %w", args.Path, err)
			}
			if info.IsDir() {
				return nil, fmt.Errorf("%q is a directory; use list_dir", args.Path)
			}
			f, err := os.Open(abs)
			if err != nil {
				return nil, fmt.Errorf("open %q: %w", args.Path, err)
			}
			defer f.Close()
			// Normalize the requested window to the supported bounds.
			start := args.StartLine
			if start < 1 {
				start = 1
			}
			limit := args.Limit
			if limit <= 0 || limit > maxReadLines {
				limit = maxReadLines
			}
			var b strings.Builder
			sc := bufio.NewScanner(f)
			// Allow lines up to 4 MiB; a longer line ends the scan with an
			// error that is reported after the loop.
			sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
			lineNo := 0
			emitted := 0
			for sc.Scan() {
				lineNo++
				if lineNo < start {
					continue
				}
				// Checked only once a further line has been scanned, so the
				// marker appears only when there really is more to read, and
				// lineNo is exactly the line to resume from.
				if emitted >= limit || b.Len() >= maxFileBytes {
					fmt.Fprintf(&b, "... (truncated at line %d; call read_file again with start_line=%d for more)\n", lineNo, lineNo)
					break
				}
				line := sc.Text()
				// NOTE(review): the cut is at a byte offset, so it can split a
				// multi-byte UTF-8 character.
				if len(line) > maxLineLen {
					line = line[:maxLineLen] + "…"
				}
				fmt.Fprintf(&b, "%d\t%s\n", lineNo, line)
				emitted++
			}
			if err := sc.Err(); err != nil {
				return nil, fmt.Errorf("read %q: %w", args.Path, err)
			}
			// Nothing in the window: the file is empty or start is past its end.
			if emitted == 0 {
				return fmt.Sprintf("(%s has no lines at/after %d; file has %d lines)", args.Path, start, lineNo), nil
			}
			return b.String(), nil
		},
	)
}
// listDirArgs are the model-supplied arguments of the list_dir tool. An empty
// Path lists the repository root.
type listDirArgs struct {
	Path string `json:"path,omitempty" description:"Optional repository-relative directory (default: repo root)."`
}
// listDirTool defines the list_dir tool: it returns the entries of one
// directory in the sandbox, sorted, one per line, with directories carrying a
// trailing "/". An empty directory yields "(empty directory)". Errors are
// returned for paths outside the sandbox and for unreadable directories.
func (r *repoFS) listDirTool() llm.Tool {
	return llm.DefineTool[listDirArgs](
		"list_dir",
		"List the entries of a directory in the repository (directories marked with a trailing /). Use it to discover where code lives before reading.",
		func(_ context.Context, args listDirArgs) (any, error) {
			dir, err := r.resolve(args.Path)
			if err != nil {
				return nil, err
			}
			entries, err := os.ReadDir(dir)
			if err != nil {
				return nil, fmt.Errorf("list %q: %w", args.Path, err)
			}
			if len(entries) == 0 {
				return "(empty directory)", nil
			}

			listing := make([]string, len(entries))
			for i, entry := range entries {
				listing[i] = entry.Name()
				if entry.IsDir() {
					listing[i] += "/"
				}
			}
			// Sort after the slash is appended so ordering is by displayed name.
			sort.Strings(listing)
			return strings.Join(listing, "\n"), nil
		},
	)
}
// grepArgs are the model-supplied arguments of the grep tool. Pattern is
// required; an empty Path searches the whole repository, and a MaxResults that
// is non-positive or above maxGrepResults is treated as maxGrepResults.
type grepArgs struct {
	Pattern    string `json:"pattern" description:"A Go (RE2) regular expression to search for."`
	Path       string `json:"path,omitempty" description:"Optional repository-relative file or subdirectory to scope the search (default: whole repo)."`
	MaxResults int    `json:"max_results,omitempty" description:"Optional cap on matching lines returned (default/maximum 200)."`
}
// grepTool defines the grep tool: it searches regular files under a file or
// directory in the sandbox for a Go (RE2) regular expression and returns the
// matches as "path:line: text" lines, with paths relative to the repo root.
//
// Directories listed in skipDirs are not descended into (unless one is the
// search base itself). Only regular files are opened: symlinks and other
// special entries are skipped, so a symlink committed in the PR cannot be used
// to read a file outside the sandbox. At most MaxResults matches (capped at
// maxGrepResults) are returned. No matches yields "(no matches)". Errors are
// returned for an empty or invalid pattern and for a base path outside the
// sandbox.
func (r *repoFS) grepTool() llm.Tool {
	return llm.DefineTool[grepArgs](
		"grep",
		"Search the repository's text files for a regular expression and return matching `path:line: text`. Use it to check whether a symbol, import, or call exists elsewhere before claiming a cross-file problem.",
		func(_ context.Context, args grepArgs) (any, error) {
			if strings.TrimSpace(args.Pattern) == "" {
				return nil, fmt.Errorf("pattern is required")
			}
			re, err := regexp.Compile(args.Pattern)
			if err != nil {
				return nil, fmt.Errorf("invalid regexp: %w", err)
			}
			base, err := r.resolve(args.Path)
			if err != nil {
				return nil, err
			}
			limit := args.MaxResults
			if limit <= 0 || limit > maxGrepResults {
				limit = maxGrepResults
			}

			var out []string
			truncated := false
			walkErr := filepath.WalkDir(base, func(path string, d os.DirEntry, err error) error {
				if err != nil {
					return nil // skip unreadable entries
				}
				if d.IsDir() {
					if skipDirs[d.Name()] && path != base {
						return filepath.SkipDir
					}
					return nil
				}
				// WalkDir does not follow symlinks but still reports them as
				// non-directory entries, and opening one would follow it to
				// wherever it points. Search regular files only.
				if !d.Type().IsRegular() {
					return nil
				}
				if len(out) >= limit {
					truncated = true
					return filepath.SkipAll
				}
				matchesInFile(path, r.root, re, limit, &out)
				return nil
			})
			if walkErr != nil {
				return nil, fmt.Errorf("search: %w", walkErr)
			}
			// Defensive: matchesInFile already stops at the cap.
			if len(out) > limit {
				out = out[:limit]
				truncated = true
			}
			if len(out) == 0 {
				return "(no matches)", nil
			}
			res := strings.Join(out, "\n")
			if truncated {
				res += fmt.Sprintf("\n... (truncated at %d matches; narrow the pattern or path)", limit)
			}
			return res, nil
		},
	)
}
// matchesInFile scans one file line by line and appends "relpath:line: text"
// to *out for every line that matches re, where relpath is path relative to
// root (or path itself if it cannot be made relative) and text is the line
// with surrounding whitespace trimmed and cut to maxLineLen bytes.
//
// Scanning stops as soon as *out holds limit entries. A file whose first line
// contains a NUL byte is treated as binary and skipped. The function is
// best-effort and reports nothing: an unopenable file is skipped, and a scan
// error (for example a line longer than the 4 MiB buffer) silently ends the
// scan, keeping the matches found so far.
func matchesInFile(path, root string, re *regexp.Regexp, limit int, out *[]string) {
	file, err := os.Open(path)
	if err != nil {
		return
	}
	defer file.Close()

	display := path
	if rel, relErr := filepath.Rel(root, path); relErr == nil {
		display = rel
	}

	scanner := bufio.NewScanner(file)
	scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
	for lineNo := 1; scanner.Scan(); lineNo++ {
		if len(*out) >= limit {
			return
		}
		text := scanner.Text()
		if lineNo == 1 && strings.IndexByte(text, 0) >= 0 {
			return // looks binary
		}
		if !re.MatchString(text) {
			continue
		}
		shown := strings.TrimSpace(text)
		if len(shown) > maxLineLen {
			shown = shown[:maxLineLen] + "…"
		}
		*out = append(*out, fmt.Sprintf("%s:%d: %s", display, lineNo, shown))
	}
}
// findFilesArgs are the model-supplied arguments of the find_files tool. Name
// is required; a MaxResults that is non-positive or above maxFindResults is
// treated as maxFindResults.
type findFilesArgs struct {
	Name       string `json:"name" description:"Case-insensitive substring of the file path to match, e.g. \"pipeline.go\" or \"agentexec/\"."`
	MaxResults int    `json:"max_results,omitempty" description:"Optional cap on paths returned (default/maximum 200)."`
}
// findFilesTool defines the find_files tool: it walks the whole repository and
// returns, sorted and one per line, the root-relative paths of non-directory
// entries whose path contains Name as a case-insensitive substring.
//
// Directories listed in skipDirs are not descended into. At most MaxResults
// paths (capped at maxFindResults) are returned. No matches yields
// "(no files matched)"; an empty Name is an error.
func (r *repoFS) findFilesTool() llm.Tool {
	return llm.DefineTool[findFilesArgs](
		"find_files",
		"Find files whose repository-relative path contains a case-insensitive substring. Use it to locate a file by name when you don't know its directory.",
		func(_ context.Context, args findFilesArgs) (any, error) {
			needle := strings.ToLower(strings.TrimSpace(args.Name))
			if needle == "" {
				return nil, fmt.Errorf("name is required")
			}
			limit := args.MaxResults
			if limit <= 0 || limit > maxFindResults {
				limit = maxFindResults
			}
			var out []string
			truncated := false
			// The walk's own error is deliberately discarded: the callback
			// never returns one, and unreadable entries are skipped below.
			_ = filepath.WalkDir(r.root, func(path string, d os.DirEntry, err error) error {
				if err != nil {
					return nil
				}
				if d.IsDir() {
					if skipDirs[d.Name()] && path != r.root {
						return filepath.SkipDir
					}
					return nil
				}
				// The cap is tested before the name match, so the truncation
				// note is added as soon as any further file is visited after
				// the cap is reached, whether or not that file would match.
				if len(out) >= limit {
					truncated = true
					return filepath.SkipAll
				}
				rel, relErr := filepath.Rel(r.root, path)
				if relErr != nil {
					return nil
				}
				if strings.Contains(strings.ToLower(rel), needle) {
					out = append(out, rel)
				}
				return nil
			})
			sort.Strings(out)
			if len(out) == 0 {
				return "(no files matched)", nil
			}
			res := strings.Join(out, "\n")
			if truncated {
				res += fmt.Sprintf("\n... (truncated at %d files; narrow the name)", limit)
			}
			return res, nil
		},
	)
}
// getDiffTool defines the argument-less get_diff tool: it returns the full,
// untruncated unified diff the sandbox was created with, or "(empty diff)"
// when that diff is blank.
func (r *repoFS) getDiffTool() llm.Tool {
	serveDiff := func(context.Context, struct{}) (any, error) {
		if diff := r.diff; strings.TrimSpace(diff) != "" {
			return diff, nil
		}
		return "(empty diff)", nil
	}
	return llm.DefineTool[struct{}](
		"get_diff",
		"Return the complete unified diff under review. The diff is also included (possibly truncated) in the task message; call this to get the full, untruncated text.",
		serveDiff,
	)
}
+243
View File
@@ -0,0 +1,243 @@
package main
import (
"context"
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"
)
// buildFixtureRepo creates a small repository tree in a fresh temporary
// directory for the toolbox tests and returns its root. The tree holds two Go
// files under pkg/foo, a README, a root-level text file, and a .git/config
// that grep / find_files are expected to skip. Any filesystem error aborts the
// calling test.
func buildFixtureRepo(t *testing.T) string {
	t.Helper()
	root := t.TempDir()

	fixtures := []struct{ rel, content string }{
		{"pkg/foo/foo.go", "package foo\n\nfunc Hello() string {\n\treturn \"hi\"\n}\n"},
		{"pkg/foo/bar.go", "package foo\n\n// TODO: refactor\nvar Answer = 42\n"},
		{"README.md", "# Fixture\n\nHello world.\n"},
		{".git/config", "[core]\n\tbare = false\n"}, // must be skipped by grep/find
		{"secret.txt", "this file lives at the repo root\n"},
	}
	for _, f := range fixtures {
		target := filepath.Join(root, f.rel)
		if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
			t.Fatal(err)
		}
		if err := os.WriteFile(target, []byte(f.content), 0o644); err != nil {
			t.Fatal(err)
		}
	}
	return root
}
// call looks up the named tool in a fresh toolbox for fs, invokes its handler
// with args encoded as JSON, and returns the handler's result as a string. A
// handler error is returned to the caller; a missing tool or a toolbox/JSON
// failure aborts the test. A non-string result comes back as "".
func call(t *testing.T, fs *repoFS, name string, args map[string]any) (string, error) {
	t.Helper()

	box, err := fs.toolbox()
	if err != nil {
		t.Fatalf("toolbox: %v", err)
	}
	tool, found := box.Get(name)
	if !found {
		t.Fatalf("tool %q not in toolbox", name)
	}

	payload, err := json.Marshal(args)
	if err != nil {
		t.Fatal(err)
	}

	result, callErr := tool.Handler(context.Background(), payload)
	if callErr != nil {
		return "", callErr
	}
	text, _ := result.(string)
	return text, nil
}
func TestRepoFS_ResolveSandbox(t *testing.T) {
root := buildFixtureRepo(t)
fs, err := newRepoFS(root, "")
if err != nil {
t.Fatalf("newRepoFS: %v", err)
}
// In-bounds paths resolve.
if _, err := fs.resolve("pkg/foo/foo.go"); err != nil {
t.Errorf("in-bounds path rejected: %v", err)
}
if got, err := fs.resolve(""); err != nil || got != fs.root {
t.Errorf("empty path should be root: got %q err %v", got, err)
}
// Escapes are rejected.
for _, bad := range []string{"../outside", "../../etc/passwd", "pkg/../../escape", "/etc/passwd"} {
if _, err := fs.resolve(bad); err == nil {
t.Errorf("path %q escaped the sandbox but was allowed", bad)
}
}
}
// TestReadFileTool exercises the read_file tool against the fixture repo:
// a whole-file read (body plus line numbers), a start_line/limit slice,
// and the two rejection cases (directory path, sandbox escape).
func TestReadFileTool(t *testing.T) {
	root := buildFixtureRepo(t)
	// Fail with a clear message instead of a nil dereference inside call if
	// the sandbox cannot be constructed.
	fs, err := newRepoFS(root, "")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go"})
	if err != nil {
		t.Fatalf("read_file: %v", err)
	}
	if !strings.Contains(out, "func Hello()") {
		t.Errorf("expected file body, got:\n%s", out)
	}
	if !strings.Contains(out, "1\t") {
		t.Errorf("expected line numbers, got:\n%s", out)
	}

	// Line slicing: line 3 of foo.go is the func declaration, so a one-line
	// slice starting there must not include the package clause from line 1.
	out, err = call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go", "start_line": 3, "limit": 1})
	if err != nil {
		t.Fatalf("read_file slice: %v", err)
	}
	if !strings.Contains(out, "func Hello()") || strings.Contains(out, "package foo") {
		t.Errorf("slice should start at line 3 only, got:\n%s", out)
	}

	// Reading a directory is an error directing to list_dir.
	if _, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo"}); err == nil {
		t.Error("reading a directory should error")
	}

	// Escape is rejected.
	if _, err := call(t, fs, "read_file", map[string]any{"path": "../escape"}); err == nil {
		t.Error("read_file should reject sandbox escape")
	}
}
// TestListDirTool checks that list_dir reports the files of a subdirectory
// and that a listing with no path argument shows directories with a trailing
// slash.
func TestListDirTool(t *testing.T) {
	root := buildFixtureRepo(t)
	fs, err := newRepoFS(root, "")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := call(t, fs, "list_dir", map[string]any{"path": "pkg/foo"})
	if err != nil {
		t.Fatalf("list_dir: %v", err)
	}
	for _, want := range []string{"foo.go", "bar.go"} {
		if !strings.Contains(out, want) {
			t.Errorf("list_dir missing %q in:\n%s", want, out)
		}
	}

	// Root listing marks directories with a trailing slash. Surface a tool
	// error directly rather than as a confusing "missing pkg/" failure.
	out, err = call(t, fs, "list_dir", map[string]any{})
	if err != nil {
		t.Fatalf("list_dir root: %v", err)
	}
	if !strings.Contains(out, "pkg/") {
		t.Errorf("expected pkg/ (dir with trailing slash) in root listing:\n%s", out)
	}
}
// TestGrepTool covers the grep tool: locating a match, skipping .git,
// the clean "no matches" response, invalid-regexp errors, and scoping the
// search to a single path.
func TestGrepTool(t *testing.T) {
	root := buildFixtureRepo(t)
	fs, err := newRepoFS(root, "")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := call(t, fs, "grep", map[string]any{"pattern": "func Hello"})
	if err != nil {
		t.Fatalf("grep: %v", err)
	}
	if !strings.Contains(out, "pkg/foo/foo.go:") {
		t.Errorf("grep should locate the func, got:\n%s", out)
	}

	// .git is skipped. The error must be checked: a failed call yields an
	// empty string, which would make the negative assertion pass vacuously.
	out, err = call(t, fs, "grep", map[string]any{"pattern": "bare = false"})
	if err != nil {
		t.Errorf("grep for .git-only content: %v", err)
	} else if strings.Contains(out, ".git/") {
		t.Errorf("grep must not descend into .git, got:\n%s", out)
	}

	// No matches is a clean message, not an error.
	out, err = call(t, fs, "grep", map[string]any{"pattern": "zzz_no_such_token_zzz"})
	if err != nil || !strings.Contains(out, "no matches") {
		t.Errorf("expected clean no-match, got %q err %v", out, err)
	}

	// Invalid regexp surfaces as an error.
	if _, err := call(t, fs, "grep", map[string]any{"pattern": "([unterminated"}); err == nil {
		t.Error("invalid regexp should error")
	}

	// Scoped grep honors the path.
	out, err = call(t, fs, "grep", map[string]any{"pattern": "Answer", "path": "pkg/foo/bar.go"})
	if err != nil {
		t.Errorf("scoped grep: %v", err)
	} else if !strings.Contains(out, "bar.go:") {
		t.Errorf("scoped grep missed the match:\n%s", out)
	}
}
// TestFindFilesTool covers the find_files tool: a plain file-name lookup,
// case-insensitive matching against the path, and the guarantee that
// entries under .git are never reported.
func TestFindFilesTool(t *testing.T) {
	root := buildFixtureRepo(t)
	fs, err := newRepoFS(root, "")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := call(t, fs, "find_files", map[string]any{"name": "foo.go"})
	if err != nil {
		t.Fatalf("find_files: %v", err)
	}
	if !strings.Contains(out, "pkg/foo/foo.go") {
		t.Errorf("find_files missed foo.go:\n%s", out)
	}

	// Case-insensitive substring on the path.
	out, err = call(t, fs, "find_files", map[string]any{"name": "PKG/FOO"})
	if err != nil {
		t.Errorf("find_files (case-insensitive): %v", err)
	} else if !strings.Contains(out, "pkg/foo/") {
		t.Errorf("find_files should be case-insensitive on the path:\n%s", out)
	}

	// .git entries are not surfaced. In the fixture only .git/config matches
	// "config". The error is checked so that a failed call (empty output)
	// cannot satisfy the negative assertion by accident.
	// NOTE(review): assumes find_files reports "nothing found" as a normal
	// result rather than an error, as grep does — confirm against the tool.
	out, err = call(t, fs, "find_files", map[string]any{"name": "config"})
	if err != nil {
		t.Errorf("find_files for .git-only name: %v", err)
	} else if strings.Contains(out, ".git/") {
		t.Errorf("find_files must skip .git, got:\n%s", out)
	}
}
// TestGetDiffTool checks that get_diff hands back, byte for byte, the diff
// text the sandbox was constructed with.
func TestGetDiffTool(t *testing.T) {
	root := buildFixtureRepo(t)
	const diff = "diff --git a/x b/x\n+added line\n"
	fs, err := newRepoFS(root, diff)
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}

	out, err := call(t, fs, "get_diff", map[string]any{})
	if err != nil {
		t.Fatalf("get_diff: %v", err)
	}
	if out != diff {
		t.Errorf("get_diff returned %q, want %q", out, diff)
	}
}
// TestNewRepoFS_BadRoot verifies that newRepoFS refuses a root that is a
// regular file and a root that does not exist.
func TestNewRepoFS_BadRoot(t *testing.T) {
	// Case 1: the root path names a regular file, not a directory.
	filePath := filepath.Join(t.TempDir(), "afile")
	if writeErr := os.WriteFile(filePath, []byte("x"), 0o644); writeErr != nil {
		t.Fatal(writeErr)
	}
	_, fileErr := newRepoFS(filePath, "")
	if fileErr == nil {
		t.Error("newRepoFS should reject a non-directory root")
	}

	// Case 2: the root path was never created.
	missingPath := filepath.Join(t.TempDir(), "missing")
	_, missingErr := newRepoFS(missingPath, "")
	if missingErr == nil {
		t.Error("newRepoFS should reject a missing root")
	}
}
// TestToolbox_Names ensures the toolbox exposes every tool name listed below
// (guards against an accidental rename breaking the system prompt's tool
// references). It is a presence check only: tools beyond this list are not
// flagged.
func TestToolbox_Names(t *testing.T) {
	fs, err := newRepoFS(t.TempDir(), "")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}
	box, err := fs.toolbox()
	if err != nil {
		t.Fatalf("toolbox: %v", err)
	}

	// Index the registered tool names for O(1) membership checks.
	got := map[string]bool{}
	for _, tl := range box.Tools() {
		got[tl.Name] = true
	}
	for _, want := range []string{"read_file", "list_dir", "grep", "find_files", "get_diff"} {
		if !got[want] {
			t.Errorf("toolbox missing tool %q", want)
		}
	}
}
+143
View File
@@ -0,0 +1,143 @@
package main
import (
"context"
"encoding/json"
"fmt"
"strings"
"testing"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
)
// spinToolCall builds a scripted model step whose response requests the
// get_diff tool and carries no final answer, so each use burns one agent
// step. n is embedded in the call arguments (as "_n") so that successive
// calls are not byte-identical, which keeps them from tripping the agent's
// same-call loop guard — the way a real reviewer's distinct tool calls would.
// get_diff succeeds and ignores the extra argument.
func spinToolCall(n int) fake.Step {
	args := fmt.Sprintf(`{"_n":%d}`, n)
	toolCall := llm.ToolCall{
		ID:        "call",
		Name:      "get_diff",
		Arguments: json.RawMessage(args),
	}
	resp := llm.Response{
		ToolCalls:    []llm.ToolCall{toolCall},
		FinishReason: llm.FinishToolCalls,
		Usage:        llm.Usage{InputTokens: 1, OutputTokens: 1},
	}
	return fake.ReplyWith(resp)
}
// lastUserText returns the text of the last message in req — the message a
// fresh Generate call is reacting to — or "" when req carries no messages.
func lastUserText(req llm.Request) string {
	if last := len(req.Messages) - 1; last >= 0 {
		return req.Messages[last].Text()
	}
	return ""
}
// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
// it is nudged to wrap up should still finish inside its budget — the steer
// message arrives a few steps before the cap and the model writes its answer.
//
// The fake model spins on get_diff until the last message contains the
// wrap-up nudge text, then replies with the final review. The test runs with
// a step budget of 12 and GADFLY_WRAPUP_RESERVE pinned to 4.
func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
	t.Setenv("GADFLY_WRAPUP_RESERVE", "4")
	final := "VERDICT: No material issues found."
	nudgeSeen := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		if strings.Contains(lastUserText(req), "almost out of your investigation budget") {
			nudgeSeen = true
			return fake.Reply(final)
		}
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	// Check the constructor error so a setup failure is reported as such
	// rather than as a nil dereference inside runAgent.
	fs, err := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}
	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
	if err != nil {
		t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
	}
	if out != final {
		t.Errorf("expected final review %q, got %q", final, out)
	}
	if !nudgeSeen {
		t.Error("the wrap-up nudge was never delivered to the model")
	}
}
// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
// spins on tools until the cap should NOT hard-fail — the tool-free finalization
// pass forces a final answer out of the transcript.
//
// The fake model answers only when the request's ToolChoice is "none" and
// otherwise keeps requesting get_diff. The test runs with a step budget of 6
// and GADFLY_WRAPUP_RESERVE pinned to 2.
func TestRunAgent_FinalizationFallback(t *testing.T) {
	t.Setenv("GADFLY_WRAPUP_RESERVE", "2")
	final := "VERDICT: Minor issues\n- something"
	forcedCalled := false
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		// Only the tool-free finalization pass forbids tools — reply there.
		if req.ToolChoice == "none" {
			forcedCalled = true
			return fake.Reply(final)
		}
		// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	// Check the constructor error so a setup failure is reported as such
	// rather than as a nil dereference inside runAgent.
	fs, err := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}
	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
	if err != nil {
		t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
	}
	if !forcedCalled {
		t.Error("finalization fallback was never invoked")
	}
	if out != final {
		t.Errorf("expected forced final answer %q, got %q", final, out)
	}
}
// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
// yields nothing, runAgent surfaces an error rather than a phantom success.
//
// The fake model requests get_diff on every normal turn and answers the
// ToolChoice "none" request with whitespace only. The step budget is 4.
func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
	n := 0
	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
		if req.ToolChoice == "none" {
			return fake.Reply("   ") // finalization produces only whitespace
		}
		n++
		return spinToolCall(n)
	}))
	mdl, err := p.Model("mock")
	if err != nil {
		t.Fatal(err)
	}
	// A sandbox that failed to build would also make runAgent misbehave;
	// rule that out so the error asserted below is the one under test.
	fs, err := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
	if err != nil {
		t.Fatalf("newRepoFS: %v", err)
	}
	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
		t.Error("runAgent should error when the finalization fallback also yields no output")
	}
}
// TestWrapUpReserve checks how wrapUpReserve reads GADFLY_WRAPUP_RESERVE: an
// empty value yields defaultWrapUpReserve, and a numeric value ("7") is used
// as the reserve.
func TestWrapUpReserve(t *testing.T) {
	const envKey = "GADFLY_WRAPUP_RESERVE"

	// Empty value: fall back to the built-in default.
	t.Setenv(envKey, "")
	reserve := wrapUpReserve()
	if reserve != defaultWrapUpReserve {
		t.Errorf("default wrap-up reserve = %d, want %d", reserve, defaultWrapUpReserve)
	}

	// Explicit value: the override wins.
	t.Setenv(envKey, "7")
	reserve = wrapUpReserve()
	if reserve != 7 {
		t.Errorf("wrap-up reserve override = %d, want 7", reserve)
	}
}