Gadfly: agentic adversarial PR reviewer (initial extraction)

Standalone, Docker-packaged extraction of the agentic PR reviewer that runs in Gitea Actions: reads the checked-out repo with read-only tools (read_file/grep/ find_files/get_diff), verifies findings before reporting, two-pass review + adversarial recheck, posts one labeled comment per model. Advisory only. - cmd/gadfly: reviewer binary (majordomo + Ollama Cloud), zero deps beyond stdlib + majordomo - entrypoint.sh: container brains — trigger gating, PR clone, model loop (logic out of YAML) - Dockerfile: multi-stage; build-time module token never reaches the final image - .gitea/workflows/build-image.yml: tag v* → build & push image - examples/: ~15-line consumer stub - system prompt genericized + hardened to re-derive constants/formulas (semantic bugs) Vibe-coded with Claude Code; see README disclosure. Advisory, never blocks merge. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-25 18:42:20 -04:00
commit c0d0152a34
18 changed files with 1879 additions and 0 deletions
@@ -0,0 +1,287 @@
+// Command gadfly is the agentic backend for the PR adversarial-review
+// workflow (.gitea/workflows/pr-adversarial-review.yml). Unlike the old
+// one-shot chat call, it runs a tool-using agent (majordomo + Ollama Cloud)
+// over the PR's CHECKED-OUT repository: the model can read_file / list_dir /
+// grep / find_files / get_diff to VERIFY a finding before reporting it, which
+// kills the "diff-only" false positives (claiming a missing import or a
+// non-existent method it simply couldn't see).
+//
+// It is a pure producer of review text: it reads the diff + the repo and
+// prints the review markdown to stdout. All Gitea I/O (fetching the diff,
+// upserting the comment) stays in run.sh, so this binary needs no repo write
+// access and is straightforward to unit-test.
+//
+// Two passes (unless the draft is a clean "no material issues" pass): a
+// REVIEW pass produces a draft, then an adversarial RECHECK pass independently
+// re-verifies every finding against the actual files with the same tools and
+// drops the ones it cannot confirm, recomputing the verdict. This catches the
+// "confident but wrong" findings that survive a single pass — e.g. claiming an
+// env var is unset when a wrapper script sets it (see recheck.go).
+//
+// Inputs (env):
+//
+//	OLLAMA_API_KEY          Ollama Cloud bearer key (required).
+//	GADFLY_MODEL          model id, e.g. "qwen3-coder:480b-cloud" (required).
+//	GADFLY_REPO_DIR       path to the checked-out repo (required; the FS sandbox root).
+//	GADFLY_DIFF_FILE      path to a file holding the full unified diff (required).
+//	GADFLY_SYSTEM_FILE    path to the reviewer system prompt (required).
+//	GADFLY_TITLE          PR title (optional).
+//	GADFLY_BODY           PR description (optional).
+//	GADFLY_MAX_STEPS      review-pass step cap (optional, default 24).
+//	GADFLY_WRAPUP_RESERVE steps before the cap at which the agent is told to
+//	                        stop investigating and write its answer (optional,
+//	                        default 4). Plus a tool-free finalization fallback
+//	                        guarantees a step-exhausted pass still emits output.
+//	GADFLY_RECHECK        set to 0/false to skip the recheck pass (optional, default on).
+//	GADFLY_RECHECK_MAX_STEPS recheck-pass step cap (optional, default 16).
+//	GADFLY_TIMEOUT_SECS   overall deadline in seconds, shared by both passes (optional, default 300).
+//	GADFLY_MAX_DIFF_CHARS diff chars embedded in the prompt (optional, default 60000;
+//	                        the full diff is always available via the get_diff tool).
+//
+// On success it prints the review to stdout and exits 0. On a usage/config or
+// model error it prints a diagnostic to stderr and exits non-zero; run.sh then
+// posts a "reviewer failed" notice (advisory — never fails the CI job).
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
+)
+
+const (
+	defaultMaxSteps     = 24
+	defaultTimeoutSecs  = 300
+	defaultMaxDiffChars = 60000
+	// defaultWrapUpReserve is how many steps before the cap the agent is told
+	// to stop investigating and write its final answer. Reserving a margin is
+	// what keeps a thorough reviewer from spending its whole budget on tool
+	// calls and then hard-failing with "max steps reached without a final
+	// answer" — it always has a few steps left to wrap up.
+	defaultWrapUpReserve = 4
+)
+
+// wrapUpInstruction is steered into a running agent once it comes within the
+// wrap-up reserve of its step cap: a forceful nudge to stop calling tools and
+// emit the final answer using only what it has already gathered.
+const wrapUpInstruction = "⚠️ You are almost out of your investigation budget — only a few tool steps remain. " +
+	"STOP calling tools now and write your FINAL answer immediately, using only what you have already verified. " +
+	"Do not begin any new investigation. If a finding could not be confirmed, drop it or mark it explicitly as unverified. " +
+	"Output the review in the required format right now."
+
+// finalizeInstruction is the user message sent on the tool-free fallback pass
+// when the agent exhausted its budget (or tripped a loop guard) without ever
+// producing a final answer. It forces the model to synthesize whatever it has.
+const finalizeInstruction = "You have run out of investigation steps. Do NOT call any tools. " +
+	"Based solely on what you have already gathered above, write your final answer now in the required format. " +
+	"If you could not confirm some findings, omit them or mark them as unverified, but produce the answer."
+
+func main() {
+	if err := run(); err != nil {
+		fmt.Fprintln(os.Stderr, "gadfly:", err)
+		os.Exit(1)
+	}
+}
+
+func run() error {
+	apiKey := os.Getenv("OLLAMA_API_KEY")
+	if apiKey == "" {
+		return errors.New("OLLAMA_API_KEY is required")
+	}
+	model := os.Getenv("GADFLY_MODEL")
+	repoDir := os.Getenv("GADFLY_REPO_DIR")
+	diffFile := os.Getenv("GADFLY_DIFF_FILE")
+	systemFile := os.Getenv("GADFLY_SYSTEM_FILE")
+	if model == "" || repoDir == "" || diffFile == "" || systemFile == "" {
+		return errors.New("GADFLY_MODEL, GADFLY_REPO_DIR, GADFLY_DIFF_FILE and GADFLY_SYSTEM_FILE are all required")
+	}
+
+	diffBytes, err := os.ReadFile(diffFile)
+	if err != nil {
+		return fmt.Errorf("read diff file: %w", err)
+	}
+	diff := string(diffBytes)
+	if strings.TrimSpace(diff) == "" {
+		return errors.New("empty diff; nothing to review")
+	}
+
+	systemBytes, err := os.ReadFile(systemFile)
+	if err != nil {
+		return fmt.Errorf("read system prompt: %w", err)
+	}
+
+	fsTools, err := newRepoFS(repoDir, diff)
+	if err != nil {
+		return err
+	}
+
+	mdl, err := ollama.Cloud(ollama.WithToken(apiKey)).Model(model)
+	if err != nil {
+		return fmt.Errorf("build model %q: %w", model, err)
+	}
+
+	timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	// Pass 1 — review: produce the draft.
+	draft, err := runAgent(ctx, mdl, fsTools, string(systemBytes), buildTask(diff),
+		envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
+	if err != nil {
+		return fmt.Errorf("review pass: %w", err)
+	}
+
+	// Pass 2 — recheck: adversarially re-verify the draft's findings and drop
+	// the unconfirmed ones. Skipped for a clean draft (nothing to verify) or
+	// when disabled. A recheck failure is non-fatal — we emit the unverified
+	// draft rather than losing the review entirely.
+	final := draft
+	if shouldRecheck(draft) {
+		rechecked, rerr := runAgent(ctx, mdl, fsTools, recheckSystemPrompt, buildRecheckTask(draft, diff),
+			envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps))
+		if rerr != nil {
+			fmt.Fprintln(os.Stderr, "gadfly: recheck pass failed; emitting unverified draft:", rerr)
+		} else {
+			final = rechecked
+		}
+	}
+
+	fmt.Println(final)
+	return nil
+}
+
+// runAgent runs one agent pass (its own fresh toolbox over the sandbox) and
+// returns the final answer. An empty answer is an error — the caller decides
+// whether that is fatal (review pass) or recoverable (recheck pass). A
+// non-empty answer that ended on a budget/guard error is still returned: the
+// model wrote its output, then ran out of steps.
+//
+// Two mechanisms keep a step-hungry model from hard-failing with no output:
+//  1. A wrap-up steer: once the run comes within wrapUpReserve steps of the
+//     cap, a forceful "stop calling tools, write your final answer" message is
+//     injected so the model spends its remaining steps finalizing.
+//  2. A finalization fallback: if the loop still ends empty (the model ignored
+//     the nudge, or a loop guard tripped), one tool-free model call forces a
+//     final answer out of the transcript already gathered.
+func runAgent(ctx context.Context, mdl llm.Model, fsTools *repoFS, system, task string, maxSteps int) (string, error) {
+	box, err := fsTools.toolbox()
+	if err != nil {
+		return "", err
+	}
+	loop := agent.New(mdl, system,
+		agent.WithToolbox(box),
+		agent.WithMaxSteps(maxSteps),
+		// Guard rails: stop the model from spinning on failing or identical
+		// tool calls instead of writing its answer.
+		agent.WithToolErrorLimits(4, 4),
+	)
+
+	wrapUpAt := maxSteps - wrapUpReserve()
+	if wrapUpAt < 1 {
+		wrapUpAt = 1
+	}
+	var completed int // steps finished so far (updated after each step)
+	nudged := false
+
+	res, runErr := loop.Run(ctx, task,
+		agent.OnStep(func(s agent.Step) { completed = s.Index + 1 }),
+		agent.WithSteer(func() []llm.Message {
+			if !nudged && completed >= wrapUpAt {
+				nudged = true
+				return []llm.Message{llm.UserText(wrapUpInstruction)}
+			}
+			return nil
+		}),
+	)
+
+	out := ""
+	if res != nil {
+		out = strings.TrimSpace(res.Output)
+	}
+	if out != "" {
+		return out, nil
+	}
+
+	// No final answer. If we still have budget on the clock and a transcript to
+	// work from, force a tool-free finalization rather than losing the pass.
+	if res != nil && len(res.Messages) > 0 && ctx.Err() == nil {
+		if forced := forceFinalAnswer(ctx, mdl, system, res.Messages); forced != "" {
+			return forced, nil
+		}
+	}
+
+	if runErr != nil {
+		return "", runErr
+	}
+	return "", errors.New("agent produced no output")
+}
+
+// forceFinalAnswer makes one tool-free model call to squeeze a final answer out
+// of an agent that exhausted its step budget without producing one. Tools are
+// forbidden (ToolChoice "none") so the model must synthesize from the transcript
+// instead of investigating further. Best-effort: any error or empty reply
+// returns "" and the caller falls back to its normal empty-output handling.
+func forceFinalAnswer(ctx context.Context, mdl llm.Model, system string, transcript []llm.Message) string {
+	msgs := append(append([]llm.Message(nil), transcript...), llm.UserText(finalizeInstruction))
+	resp, err := mdl.Generate(ctx, llm.Request{
+		System:     system,
+		Messages:   msgs,
+		ToolChoice: "none",
+	})
+	if err != nil || resp == nil {
+		return ""
+	}
+	return strings.TrimSpace(resp.Text())
+}
+
+// wrapUpReserve is how many steps before the cap the wrap-up nudge fires,
+// overridable via GADFLY_WRAPUP_RESERVE.
+func wrapUpReserve() int {
+	return envInt("GADFLY_WRAPUP_RESERVE", defaultWrapUpReserve)
+}
+
+// buildTask assembles the user message: PR metadata plus the unified diff,
+// truncated for the prompt (the full diff stays available via get_diff).
+func buildTask(diff string) string {
+	title := os.Getenv("GADFLY_TITLE")
+	body := os.Getenv("GADFLY_BODY")
+
+	maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)
+	truncNote := ""
+	if maxDiff > 0 && len(diff) > maxDiff {
+		diff = diff[:maxDiff]
+		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; call get_diff for the full text.]", maxDiff)
+	}
+
+	var b strings.Builder
+	if title != "" {
+		fmt.Fprintf(&b, "PR title: %s\n\n", title)
+	}
+	if strings.TrimSpace(body) != "" {
+		fmt.Fprintf(&b, "PR description:\n%s\n\n", body)
+	}
+	b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your tools (read_file, grep, find_files) to verify it against the actual checked-out code — do not rely on the diff alone.\n\n")
+	fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
+	return b.String()
+}
+
+// envInt reads an integer env var, falling back to def when unset or unparseable.
+func envInt(name string, def int) int {
+	v := strings.TrimSpace(os.Getenv(name))
+	if v == "" {
+		return def
+	}
+	n, err := strconv.Atoi(v)
+	if err != nil || n <= 0 {
+		return def
+	}
+	return n
+}
@@ -0,0 +1,97 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"strings"
+)
+
+// defaultRecheckMaxSteps bounds the verification pass. It is smaller than the
+// review pass: re-checking a handful of existing findings needs fewer steps
+// than discovering them.
+const defaultRecheckMaxSteps = 16
+
+// recheckSystemPrompt drives the second, adversarial verification pass. The
+// model is given a DRAFT review and must independently confirm each finding
+// against the real code before letting it survive — the antidote to a
+// single-pass reviewer that reads a couple of files, mis-connects them, and
+// posts a confident but wrong "blocking" verdict.
+const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review of the
+"mort" project (a large Go Discord bot). You are given a DRAFT review produced
+by another model. Your job is NOT to write a new review — it is to confirm or
+reject each finding in the draft against the ACTUAL code, then output the
+corrected review.
+
+You have the same read-only repository tools as the original reviewer:
+- read_file(path[, start_line, limit]), list_dir([path]), grep(pattern[, path,
+  max_results]), find_files(name[, max_results]), get_diff().
+
+For EVERY finding in the draft:
+1. Independently reproduce the reasoning by reading the actual files with your
+   tools — do not trust the draft's claim, and do not trust the diff hunk alone.
+2. KEEP the finding only if you can positively confirm it against the code.
+3. DROP the finding if you cannot confirm it, or if the code contradicts it.
+
+Watch especially for findings that ignore the "glue" around a change — the most
+common false positive. Before keeping a claim that something is "missing",
+"undefined", "never set", "not exported", or "won't compile", GREP THE WHOLE
+REPO for it: the thing is very often satisfied in a place the original reviewer
+didn't look — a shell script or Makefile that sets an env var, a CI YAML, an
+adjacent file, generated code, or a wrapper that maps one name to another. A
+finding that an env var X is unset is wrong if any script invokes the program
+with "X=... prog". Check before you keep.
+
+Output rules:
+- Output the corrected review in the SAME format as the draft: a one-line
+  VERDICT ("No material issues found", "Minor issues", or "Blocking issues
+  found"), then the surviving findings as bullets with path:line and impact.
+- Recompute the VERDICT from what SURVIVES. If every finding was dropped, the
+  verdict is "No material issues found".
+- Do NOT invent new findings; this is a verification gate, not a fresh review.
+- Do NOT include meta-commentary about the verification process or which
+  findings you dropped — output only the final, corrected review markdown.
+- When done investigating, STOP calling tools and reply with the review.`
+
+// recheckEnabled reports whether the verification pass should run. On unless
+// GADFLY_RECHECK is explicitly a falsey value.
+func recheckEnabled() bool {
+	switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_RECHECK"))) {
+	case "0", "false", "no", "off":
+		return false
+	default:
+		return true
+	}
+}
+
+// shouldRecheck decides whether to run the verification pass for a given draft.
+// A clean "no material issues" draft has nothing to verify, so it is skipped
+// even when rechecking is enabled — saving a whole model pass on clean PRs.
+func shouldRecheck(draft string) bool {
+	if !recheckEnabled() {
+		return false
+	}
+	if strings.Contains(strings.ToLower(draft), "no material issues") {
+		return false
+	}
+	return true
+}
+
+// buildRecheckTask is the verification pass's user message: the draft review to
+// scrutinize, with the full diff available via get_diff (and embedded here,
+// truncated, to save a tool call).
+func buildRecheckTask(draft, diff string) string {
+	maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)
+	truncNote := ""
+	if maxDiff > 0 && len(diff) > maxDiff {
+		diff = diff[:maxDiff]
+		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff for the full text.]", maxDiff)
+	}
+
+	var b strings.Builder
+	b.WriteString("Verify the following DRAFT review against the actual code, drop every finding you cannot confirm, and output the corrected review.\n\n")
+	b.WriteString("## Draft review\n\n")
+	b.WriteString(draft)
+	b.WriteString("\n\n## PR diff under review\n\n")
+	fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
+	return b.String()
+}
@@ -0,0 +1,101 @@
+package main
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+)
+
+func TestShouldRecheck(t *testing.T) {
+	t.Setenv("GADFLY_RECHECK", "") // default on
+
+	if shouldRecheck("VERDICT: Blocking issues found\n- something is wrong") != true {
+		t.Error("a draft with findings should be rechecked")
+	}
+	if shouldRecheck("No material issues found.") != false {
+		t.Error("a clean draft should skip recheck")
+	}
+	if shouldRecheck("### review\n\nNo material issues found.\n") != false {
+		t.Error("clean draft detection should be case/whitespace tolerant")
+	}
+
+	// Explicit disable wins even when there are findings.
+	t.Setenv("GADFLY_RECHECK", "0")
+	if shouldRecheck("Blocking issues found\n- x") != false {
+		t.Error("GADFLY_RECHECK=0 must disable recheck")
+	}
+	t.Setenv("GADFLY_RECHECK", "false")
+	if shouldRecheck("Blocking issues found\n- x") != false {
+		t.Error("GADFLY_RECHECK=false must disable recheck")
+	}
+}
+
+func TestRecheckEnabled(t *testing.T) {
+	for _, v := range []string{"", "1", "true", "yes", "anything"} {
+		t.Setenv("GADFLY_RECHECK", v)
+		if !recheckEnabled() {
+			t.Errorf("GADFLY_RECHECK=%q should be enabled", v)
+		}
+	}
+	for _, v := range []string{"0", "false", "no", "off", "OFF", " False "} {
+		t.Setenv("GADFLY_RECHECK", v)
+		if recheckEnabled() {
+			t.Errorf("GADFLY_RECHECK=%q should be disabled", v)
+		}
+	}
+}
+
+func TestBuildRecheckTask(t *testing.T) {
+	t.Setenv("GADFLY_MAX_DIFF_CHARS", "")
+	draft := "VERDICT: Blocking issues found\n- foo.go:1 broken"
+	out := buildRecheckTask(draft, "diff --git a/x b/x\n+y\n")
+	if !strings.Contains(out, draft) {
+		t.Error("recheck task must include the draft review")
+	}
+	if !strings.Contains(out, "Verify") || !strings.Contains(out, "drop every finding you cannot confirm") {
+		t.Errorf("recheck task missing the verify instruction:\n%s", out)
+	}
+	if !strings.Contains(out, "diff --git") {
+		t.Error("recheck task should include the diff")
+	}
+}
+
+// fakeModel builds a fake majordomo model that always replies with the given
+// text (no tool calls), so the agent loop ends on its first step.
+func fakeModel(t *testing.T, reply string) llm.Model {
+	t.Helper()
+	p := fake.New("fake", fake.WithDefault(func(string, llm.Request) fake.Step {
+		return fake.Reply(reply)
+	}))
+	m, err := p.Model("mock")
+	if err != nil {
+		t.Fatal(err)
+	}
+	return m
+}
+
+func TestRunAgent_ReturnsOutput(t *testing.T) {
+	fs, err := newRepoFS(t.TempDir(), "diff")
+	if err != nil {
+		t.Fatal(err)
+	}
+	mdl := fakeModel(t, "  corrected review: No material issues found.  ")
+	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4)
+	if err != nil {
+		t.Fatalf("runAgent: %v", err)
+	}
+	if out != "corrected review: No material issues found." {
+		t.Errorf("runAgent should return trimmed model output, got %q", out)
+	}
+}
+
+func TestRunAgent_EmptyIsError(t *testing.T) {
+	fs, _ := newRepoFS(t.TempDir(), "diff")
+	mdl := fakeModel(t, "   ")
+	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
+		t.Error("runAgent should error on empty model output")
+	}
+}
@@ -0,0 +1,388 @@
+package main
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+// Tool output bounds. The reviewer is a chat agent with a finite context, so
+// every tool caps how much it can pull in one call — a runaway read_file or
+// grep would blow the window and stall the loop.
+const (
+	maxFileBytes   = 64 * 1024 // per read_file call
+	maxReadLines   = 800       // per read_file call
+	maxGrepResults = 200       // per grep call
+	maxFindResults = 200       // per find_files call
+	maxLineLen     = 400       // truncate any single returned line to this
+)
+
+// skipDirs are never descended into by grep / find_files — noise and bulk that
+// a code reviewer never needs and that would swamp the results.
+var skipDirs = map[string]bool{
+	".git":         true,
+	"node_modules": true,
+	"vendor":       true,
+}
+
+// repoFS is a read-only, sandboxed view of the checked-out repository. Every
+// path argument from the model is resolved against root and rejected if it
+// escapes (symlink or `..` traversal), so a hostile diff can never make the
+// reviewer read outside the checkout.
+type repoFS struct {
+	root string // absolute, symlink-resolved repo root
+	diff string // the full PR unified diff (served by get_diff)
+}
+
+// newRepoFS resolves root to an absolute, symlink-free path.
+func newRepoFS(root, diff string) (*repoFS, error) {
+	abs, err := filepath.Abs(root)
+	if err != nil {
+		return nil, fmt.Errorf("resolve repo dir: %w", err)
+	}
+	// EvalSymlinks so prefix containment checks survive a symlinked root
+	// (e.g. macOS /tmp -> /private/tmp).
+	if resolved, err := filepath.EvalSymlinks(abs); err == nil {
+		abs = resolved
+	}
+	info, err := os.Stat(abs)
+	if err != nil {
+		return nil, fmt.Errorf("repo dir %q: %w", root, err)
+	}
+	if !info.IsDir() {
+		return nil, fmt.Errorf("repo dir %q is not a directory", root)
+	}
+	return &repoFS{root: abs, diff: diff}, nil
+}
+
+// resolve maps a model-supplied relative path to an absolute path inside the
+// sandbox, rejecting anything that escapes root. An empty path means root.
+func (r *repoFS) resolve(rel string) (string, error) {
+	rel = strings.TrimSpace(rel)
+	rel = strings.TrimPrefix(rel, "./")
+	if rel == "" || rel == "." {
+		return r.root, nil
+	}
+	if filepath.IsAbs(rel) {
+		// Allow an absolute path only if it already points inside the sandbox.
+		clean := filepath.Clean(rel)
+		if err := r.contains(clean); err != nil {
+			return "", err
+		}
+		return clean, nil
+	}
+	joined := filepath.Clean(filepath.Join(r.root, rel))
+	if err := r.contains(joined); err != nil {
+		return "", err
+	}
+	return joined, nil
+}
+
+// contains verifies abs is root or lives beneath it.
+func (r *repoFS) contains(abs string) error {
+	if abs == r.root {
+		return nil
+	}
+	if !strings.HasPrefix(abs, r.root+string(os.PathSeparator)) {
+		return fmt.Errorf("path escapes the repository sandbox")
+	}
+	return nil
+}
+
+// toolbox builds the read-only review toolbox over this sandbox.
+func (r *repoFS) toolbox() (*llm.Toolbox, error) {
+	box := llm.NewToolbox("gadfly")
+	tools := []llm.Tool{
+		r.readFileTool(),
+		r.listDirTool(),
+		r.grepTool(),
+		r.findFilesTool(),
+		r.getDiffTool(),
+	}
+	for _, t := range tools {
+		if err := box.Add(t); err != nil {
+			return nil, fmt.Errorf("add tool %q: %w", t.Name, err)
+		}
+	}
+	return box, nil
+}
+
+type readFileArgs struct {
+	Path      string `json:"path" description:"Repository-relative path of the file to read, e.g. pkg/logic/agentexec/pipeline.go"`
+	StartLine int    `json:"start_line,omitempty" description:"Optional 1-based line to start from (default 1)."`
+	Limit     int    `json:"limit,omitempty" description:"Optional max number of lines to return (default/maximum 800)."`
+}
+
+func (r *repoFS) readFileTool() llm.Tool {
+	return llm.DefineTool[readFileArgs](
+		"read_file",
+		"Read a file from the repository at its current checked-out state, with line numbers. Use this to verify the surrounding code, imports, and symbols a diff hunk touches before reporting an issue.",
+		func(_ context.Context, args readFileArgs) (any, error) {
+			abs, err := r.resolve(args.Path)
+			if err != nil {
+				return nil, err
+			}
+			info, err := os.Stat(abs)
+			if err != nil {
+				return nil, fmt.Errorf("stat %q: %w", args.Path, err)
+			}
+			if info.IsDir() {
+				return nil, fmt.Errorf("%q is a directory; use list_dir", args.Path)
+			}
+			f, err := os.Open(abs)
+			if err != nil {
+				return nil, fmt.Errorf("open %q: %w", args.Path, err)
+			}
+			defer f.Close()
+
+			start := args.StartLine
+			if start < 1 {
+				start = 1
+			}
+			limit := args.Limit
+			if limit <= 0 || limit > maxReadLines {
+				limit = maxReadLines
+			}
+
+			var b strings.Builder
+			sc := bufio.NewScanner(f)
+			sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
+			lineNo := 0
+			emitted := 0
+			for sc.Scan() {
+				lineNo++
+				if lineNo < start {
+					continue
+				}
+				if emitted >= limit || b.Len() >= maxFileBytes {
+					fmt.Fprintf(&b, "... (truncated at line %d; call read_file again with start_line=%d for more)\n", lineNo, lineNo)
+					break
+				}
+				line := sc.Text()
+				if len(line) > maxLineLen {
+					line = line[:maxLineLen] + "…"
+				}
+				fmt.Fprintf(&b, "%d\t%s\n", lineNo, line)
+				emitted++
+			}
+			if err := sc.Err(); err != nil {
+				return nil, fmt.Errorf("read %q: %w", args.Path, err)
+			}
+			if emitted == 0 {
+				return fmt.Sprintf("(%s has no lines at/after %d; file has %d lines)", args.Path, start, lineNo), nil
+			}
+			return b.String(), nil
+		},
+	)
+}
+
+type listDirArgs struct {
+	Path string `json:"path,omitempty" description:"Optional repository-relative directory (default: repo root)."`
+}
+
+func (r *repoFS) listDirTool() llm.Tool {
+	return llm.DefineTool[listDirArgs](
+		"list_dir",
+		"List the entries of a directory in the repository (directories marked with a trailing /). Use it to discover where code lives before reading.",
+		func(_ context.Context, args listDirArgs) (any, error) {
+			abs, err := r.resolve(args.Path)
+			if err != nil {
+				return nil, err
+			}
+			entries, err := os.ReadDir(abs)
+			if err != nil {
+				return nil, fmt.Errorf("list %q: %w", args.Path, err)
+			}
+			names := make([]string, 0, len(entries))
+			for _, e := range entries {
+				name := e.Name()
+				if e.IsDir() {
+					name += "/"
+				}
+				names = append(names, name)
+			}
+			sort.Strings(names)
+			if len(names) == 0 {
+				return "(empty directory)", nil
+			}
+			return strings.Join(names, "\n"), nil
+		},
+	)
+}
+
+type grepArgs struct {
+	Pattern    string `json:"pattern" description:"A Go (RE2) regular expression to search for."`
+	Path       string `json:"path,omitempty" description:"Optional repository-relative file or subdirectory to scope the search (default: whole repo)."`
+	MaxResults int    `json:"max_results,omitempty" description:"Optional cap on matching lines returned (default/maximum 200)."`
+}
+
+func (r *repoFS) grepTool() llm.Tool {
+	return llm.DefineTool[grepArgs](
+		"grep",
+		"Search the repository's text files for a regular expression and return matching `path:line: text`. Use it to check whether a symbol, import, or call exists elsewhere before claiming a cross-file problem.",
+		func(_ context.Context, args grepArgs) (any, error) {
+			if strings.TrimSpace(args.Pattern) == "" {
+				return nil, fmt.Errorf("pattern is required")
+			}
+			re, err := regexp.Compile(args.Pattern)
+			if err != nil {
+				return nil, fmt.Errorf("invalid regexp: %w", err)
+			}
+			base, err := r.resolve(args.Path)
+			if err != nil {
+				return nil, err
+			}
+			limit := args.MaxResults
+			if limit <= 0 || limit > maxGrepResults {
+				limit = maxGrepResults
+			}
+
+			var out []string
+			truncated := false
+			walkErr := filepath.WalkDir(base, func(path string, d os.DirEntry, err error) error {
+				if err != nil {
+					return nil // skip unreadable entries
+				}
+				if d.IsDir() {
+					if skipDirs[d.Name()] && path != base {
+						return filepath.SkipDir
+					}
+					return nil
+				}
+				if len(out) >= limit {
+					truncated = true
+					return filepath.SkipAll
+				}
+				matchesInFile(path, r.root, re, limit, &out)
+				return nil
+			})
+			if walkErr != nil {
+				return nil, fmt.Errorf("search: %w", walkErr)
+			}
+			if len(out) > limit {
+				out = out[:limit]
+				truncated = true
+			}
+			if len(out) == 0 {
+				return "(no matches)", nil
+			}
+			res := strings.Join(out, "\n")
+			if truncated {
+				res += fmt.Sprintf("\n... (truncated at %d matches; narrow the pattern or path)", limit)
+			}
+			return res, nil
+		},
+	)
+}
+
+// matchesInFile appends "relpath:line: text" for each regexp match in a single
+// text file, stopping once the global cap is reached. Binary files (NUL in the
+// first chunk) and oversized files are skipped.
+func matchesInFile(path, root string, re *regexp.Regexp, limit int, out *[]string) {
+	f, err := os.Open(path)
+	if err != nil {
+		return
+	}
+	defer f.Close()
+
+	rel, relErr := filepath.Rel(root, path)
+	if relErr != nil {
+		rel = path
+	}
+	sc := bufio.NewScanner(f)
+	sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
+	lineNo := 0
+	for sc.Scan() {
+		if len(*out) >= limit {
+			return
+		}
+		lineNo++
+		line := sc.Text()
+		if lineNo == 1 && strings.IndexByte(line, 0) >= 0 {
+			return // looks binary
+		}
+		if re.MatchString(line) {
+			trimmed := strings.TrimSpace(line)
+			if len(trimmed) > maxLineLen {
+				trimmed = trimmed[:maxLineLen] + "…"
+			}
+			*out = append(*out, fmt.Sprintf("%s:%d: %s", rel, lineNo, trimmed))
+		}
+	}
+}
+
+type findFilesArgs struct {
+	Name       string `json:"name" description:"Case-insensitive substring of the file path to match, e.g. \"pipeline.go\" or \"agentexec/\"."`
+	MaxResults int    `json:"max_results,omitempty" description:"Optional cap on paths returned (default/maximum 200)."`
+}
+
+func (r *repoFS) findFilesTool() llm.Tool {
+	return llm.DefineTool[findFilesArgs](
+		"find_files",
+		"Find files whose repository-relative path contains a case-insensitive substring. Use it to locate a file by name when you don't know its directory.",
+		func(_ context.Context, args findFilesArgs) (any, error) {
+			needle := strings.ToLower(strings.TrimSpace(args.Name))
+			if needle == "" {
+				return nil, fmt.Errorf("name is required")
+			}
+			limit := args.MaxResults
+			if limit <= 0 || limit > maxFindResults {
+				limit = maxFindResults
+			}
+			var out []string
+			truncated := false
+			_ = filepath.WalkDir(r.root, func(path string, d os.DirEntry, err error) error {
+				if err != nil {
+					return nil
+				}
+				if d.IsDir() {
+					if skipDirs[d.Name()] && path != r.root {
+						return filepath.SkipDir
+					}
+					return nil
+				}
+				if len(out) >= limit {
+					truncated = true
+					return filepath.SkipAll
+				}
+				rel, relErr := filepath.Rel(r.root, path)
+				if relErr != nil {
+					return nil
+				}
+				if strings.Contains(strings.ToLower(rel), needle) {
+					out = append(out, rel)
+				}
+				return nil
+			})
+			sort.Strings(out)
+			if len(out) == 0 {
+				return "(no files matched)", nil
+			}
+			res := strings.Join(out, "\n")
+			if truncated {
+				res += fmt.Sprintf("\n... (truncated at %d files; narrow the name)", limit)
+			}
+			return res, nil
+		},
+	)
+}
+
+func (r *repoFS) getDiffTool() llm.Tool {
+	return llm.DefineTool[struct{}](
+		"get_diff",
+		"Return the complete unified diff under review. The diff is also included (possibly truncated) in the task message; call this to get the full, untruncated text.",
+		func(_ context.Context, _ struct{}) (any, error) {
+			if strings.TrimSpace(r.diff) == "" {
+				return "(empty diff)", nil
+			}
+			return r.diff, nil
+		},
+	)
+}
@@ -0,0 +1,243 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// buildFixtureRepo lays down a small repo tree for the toolbox tests and
+// returns its root.
+func buildFixtureRepo(t *testing.T) string {
+	t.Helper()
+	root := t.TempDir()
+	write := func(rel, content string) {
+		p := filepath.Join(root, rel)
+		if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil {
+			t.Fatal(err)
+		}
+		if err := os.WriteFile(p, []byte(content), 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+	write("pkg/foo/foo.go", "package foo\n\nfunc Hello() string {\n\treturn \"hi\"\n}\n")
+	write("pkg/foo/bar.go", "package foo\n\n// TODO: refactor\nvar Answer = 42\n")
+	write("README.md", "# Fixture\n\nHello world.\n")
+	write(".git/config", "[core]\n\tbare = false\n") // must be skipped by grep/find
+	write("secret.txt", "this file lives at the repo root\n")
+	return root
+}
+
+// call invokes a tool from the sandbox's toolbox by name with JSON args and
+// returns the result string (or the error).
+func call(t *testing.T, fs *repoFS, name string, args map[string]any) (string, error) {
+	t.Helper()
+	box, err := fs.toolbox()
+	if err != nil {
+		t.Fatalf("toolbox: %v", err)
+	}
+	tool, ok := box.Get(name)
+	if !ok {
+		t.Fatalf("tool %q not in toolbox", name)
+	}
+	raw, err := json.Marshal(args)
+	if err != nil {
+		t.Fatal(err)
+	}
+	out, herr := tool.Handler(context.Background(), raw)
+	if herr != nil {
+		return "", herr
+	}
+	s, _ := out.(string)
+	return s, nil
+}
+
+func TestRepoFS_ResolveSandbox(t *testing.T) {
+	root := buildFixtureRepo(t)
+	fs, err := newRepoFS(root, "")
+	if err != nil {
+		t.Fatalf("newRepoFS: %v", err)
+	}
+
+	// In-bounds paths resolve.
+	if _, err := fs.resolve("pkg/foo/foo.go"); err != nil {
+		t.Errorf("in-bounds path rejected: %v", err)
+	}
+	if got, err := fs.resolve(""); err != nil || got != fs.root {
+		t.Errorf("empty path should be root: got %q err %v", got, err)
+	}
+
+	// Escapes are rejected.
+	for _, bad := range []string{"../outside", "../../etc/passwd", "pkg/../../escape", "/etc/passwd"} {
+		if _, err := fs.resolve(bad); err == nil {
+			t.Errorf("path %q escaped the sandbox but was allowed", bad)
+		}
+	}
+}
+
+func TestReadFileTool(t *testing.T) {
+	root := buildFixtureRepo(t)
+	fs, _ := newRepoFS(root, "")
+
+	out, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go"})
+	if err != nil {
+		t.Fatalf("read_file: %v", err)
+	}
+	if !strings.Contains(out, "func Hello()") {
+		t.Errorf("expected file body, got:\n%s", out)
+	}
+	if !strings.Contains(out, "1\t") {
+		t.Errorf("expected line numbers, got:\n%s", out)
+	}
+
+	// Line slicing.
+	out, err = call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go", "start_line": 3, "limit": 1})
+	if err != nil {
+		t.Fatalf("read_file slice: %v", err)
+	}
+	if !strings.Contains(out, "func Hello()") || strings.Contains(out, "package foo") {
+		t.Errorf("slice should start at line 3 only, got:\n%s", out)
+	}
+
+	// Reading a directory is an error directing to list_dir.
+	if _, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo"}); err == nil {
+		t.Error("reading a directory should error")
+	}
+
+	// Escape is rejected.
+	if _, err := call(t, fs, "read_file", map[string]any{"path": "../escape"}); err == nil {
+		t.Error("read_file should reject sandbox escape")
+	}
+}
+
+func TestListDirTool(t *testing.T) {
+	root := buildFixtureRepo(t)
+	fs, _ := newRepoFS(root, "")
+
+	out, err := call(t, fs, "list_dir", map[string]any{"path": "pkg/foo"})
+	if err != nil {
+		t.Fatalf("list_dir: %v", err)
+	}
+	for _, want := range []string{"foo.go", "bar.go"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("list_dir missing %q in:\n%s", want, out)
+		}
+	}
+
+	// Root listing marks directories with a trailing slash.
+	out, _ = call(t, fs, "list_dir", map[string]any{})
+	if !strings.Contains(out, "pkg/") {
+		t.Errorf("expected pkg/ (dir with trailing slash) in root listing:\n%s", out)
+	}
+}
+
+func TestGrepTool(t *testing.T) {
+	root := buildFixtureRepo(t)
+	fs, _ := newRepoFS(root, "")
+
+	out, err := call(t, fs, "grep", map[string]any{"pattern": "func Hello"})
+	if err != nil {
+		t.Fatalf("grep: %v", err)
+	}
+	if !strings.Contains(out, "pkg/foo/foo.go:") {
+		t.Errorf("grep should locate the func, got:\n%s", out)
+	}
+
+	// .git is skipped.
+	out, _ = call(t, fs, "grep", map[string]any{"pattern": "bare = false"})
+	if strings.Contains(out, ".git/") {
+		t.Errorf("grep must not descend into .git, got:\n%s", out)
+	}
+
+	// No matches is a clean message, not an error.
+	out, err = call(t, fs, "grep", map[string]any{"pattern": "zzz_no_such_token_zzz"})
+	if err != nil || !strings.Contains(out, "no matches") {
+		t.Errorf("expected clean no-match, got %q err %v", out, err)
+	}
+
+	// Invalid regexp surfaces as an error.
+	if _, err := call(t, fs, "grep", map[string]any{"pattern": "([unterminated"}); err == nil {
+		t.Error("invalid regexp should error")
+	}
+
+	// Scoped grep honors the path.
+	out, _ = call(t, fs, "grep", map[string]any{"pattern": "Answer", "path": "pkg/foo/bar.go"})
+	if !strings.Contains(out, "bar.go:") {
+		t.Errorf("scoped grep missed the match:\n%s", out)
+	}
+}
+
+func TestFindFilesTool(t *testing.T) {
+	root := buildFixtureRepo(t)
+	fs, _ := newRepoFS(root, "")
+
+	out, err := call(t, fs, "find_files", map[string]any{"name": "foo.go"})
+	if err != nil {
+		t.Fatalf("find_files: %v", err)
+	}
+	if !strings.Contains(out, "pkg/foo/foo.go") {
+		t.Errorf("find_files missed foo.go:\n%s", out)
+	}
+
+	// Case-insensitive substring on the path.
+	out, _ = call(t, fs, "find_files", map[string]any{"name": "PKG/FOO"})
+	if !strings.Contains(out, "pkg/foo/") {
+		t.Errorf("find_files should be case-insensitive on the path:\n%s", out)
+	}
+
+	// .git entries are not surfaced.
+	out, _ = call(t, fs, "find_files", map[string]any{"name": "config"})
+	if strings.Contains(out, ".git/") {
+		t.Errorf("find_files must skip .git, got:\n%s", out)
+	}
+}
+
+func TestGetDiffTool(t *testing.T) {
+	root := buildFixtureRepo(t)
+	const diff = "diff --git a/x b/x\n+added line\n"
+	fs, _ := newRepoFS(root, diff)
+
+	out, err := call(t, fs, "get_diff", map[string]any{})
+	if err != nil {
+		t.Fatalf("get_diff: %v", err)
+	}
+	if out != diff {
+		t.Errorf("get_diff returned %q, want %q", out, diff)
+	}
+}
+
+func TestNewRepoFS_BadRoot(t *testing.T) {
+	// A file (not a directory) is rejected.
+	f := filepath.Join(t.TempDir(), "afile")
+	if err := os.WriteFile(f, []byte("x"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := newRepoFS(f, ""); err == nil {
+		t.Error("newRepoFS should reject a non-directory root")
+	}
+	if _, err := newRepoFS(filepath.Join(t.TempDir(), "missing"), ""); err == nil {
+		t.Error("newRepoFS should reject a missing root")
+	}
+}
+
+// Ensure the toolbox exposes exactly the expected tools (guards against an
+// accidental rename breaking the system prompt's tool references).
+func TestToolbox_Names(t *testing.T) {
+	fs, _ := newRepoFS(t.TempDir(), "")
+	box, err := fs.toolbox()
+	if err != nil {
+		t.Fatalf("toolbox: %v", err)
+	}
+	got := map[string]bool{}
+	for _, tl := range box.Tools() {
+		got[tl.Name] = true
+	}
+	for _, want := range []string{"read_file", "list_dir", "grep", "find_files", "get_diff"} {
+		if !got[want] {
+			t.Errorf("toolbox missing tool %q", want)
+		}
+	}
+}
@@ -0,0 +1,143 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"testing"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+)
+
+// spinToolCall is a response that asks for the get_diff tool (which succeeds and
+// ignores extra args), used to burn agent steps without producing a final
+// answer. The args vary by n so successive calls are not byte-identical — that
+// dodges the agent's same-call loop guard, exactly as a real reviewer making
+// distinct tool calls would.
+func spinToolCall(n int) fake.Step {
+	return fake.ReplyWith(llm.Response{
+		ToolCalls: []llm.ToolCall{{
+			ID:        "call",
+			Name:      "get_diff",
+			Arguments: json.RawMessage(fmt.Sprintf(`{"_n":%d}`, n)),
+		}},
+		FinishReason: llm.FinishToolCalls,
+		Usage:        llm.Usage{InputTokens: 1, OutputTokens: 1},
+	})
+}
+
+// lastUserText returns the text of the final message in the request, which is
+// what a fresh Generate call is reacting to.
+func lastUserText(req llm.Request) string {
+	if len(req.Messages) == 0 {
+		return ""
+	}
+	return req.Messages[len(req.Messages)-1].Text()
+}
+
+// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
+// it is nudged to wrap up should still finish inside its budget — the steer
+// message arrives a few steps before the cap and the model writes its answer.
+func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
+	t.Setenv("GADFLY_WRAPUP_RESERVE", "4")
+
+	final := "VERDICT: No material issues found."
+	nudgeSeen := false
+	n := 0
+	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
+		if strings.Contains(lastUserText(req), "almost out of your investigation budget") {
+			nudgeSeen = true
+			return fake.Reply(final)
+		}
+		n++
+		return spinToolCall(n)
+	}))
+	mdl, err := p.Model("mock")
+	if err != nil {
+		t.Fatal(err)
+	}
+	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
+
+	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
+	if err != nil {
+		t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
+	}
+	if out != final {
+		t.Errorf("expected final review %q, got %q", final, out)
+	}
+	if !nudgeSeen {
+		t.Error("the wrap-up nudge was never delivered to the model")
+	}
+}
+
+// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
+// spins on tools until the cap should NOT hard-fail — the tool-free finalization
+// pass forces a final answer out of the transcript.
+func TestRunAgent_FinalizationFallback(t *testing.T) {
+	t.Setenv("GADFLY_WRAPUP_RESERVE", "2")
+
+	final := "VERDICT: Minor issues\n- something"
+	forcedCalled := false
+	n := 0
+	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
+		// Only the tool-free finalization pass forbids tools — reply there.
+		if req.ToolChoice == "none" {
+			forcedCalled = true
+			return fake.Reply(final)
+		}
+		// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
+		n++
+		return spinToolCall(n)
+	}))
+	mdl, err := p.Model("mock")
+	if err != nil {
+		t.Fatal(err)
+	}
+	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
+
+	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
+	if err != nil {
+		t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
+	}
+	if !forcedCalled {
+		t.Error("finalization fallback was never invoked")
+	}
+	if out != final {
+		t.Errorf("expected forced final answer %q, got %q", final, out)
+	}
+}
+
+// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
+// yields nothing, runAgent surfaces an error rather than a phantom success.
+func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
+	n := 0
+	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
+		if req.ToolChoice == "none" {
+			return fake.Reply("   ") // finalization produces only whitespace
+		}
+		n++
+		return spinToolCall(n)
+	}))
+	mdl, err := p.Model("mock")
+	if err != nil {
+		t.Fatal(err)
+	}
+	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
+
+	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
+		t.Error("runAgent should error when the finalization fallback also yields no output")
+	}
+}
+
+func TestWrapUpReserve(t *testing.T) {
+	t.Setenv("GADFLY_WRAPUP_RESERVE", "")
+	if got := wrapUpReserve(); got != defaultWrapUpReserve {
+		t.Errorf("default wrap-up reserve = %d, want %d", got, defaultWrapUpReserve)
+	}
+	t.Setenv("GADFLY_WRAPUP_RESERVE", "7")
+	if got := wrapUpReserve(); got != 7 {
+		t.Errorf("wrap-up reserve override = %d, want 7", got)
+	}
+}