feat: claude-code reviewer engine (#2)

Phase 1: a second review engine alongside the majordomo agent loop. For each lens, shell out to the Claude Code CLI (`claude -p --output-format json`) inside the checked-out repo so it verifies findings with its own read tools, then reuse gadfly's verdict-parse + recheck + consolidate + emit pipeline. Select via GADFLY_MODELS `claude-code`/`claude-code/<model>`; auth via CLAUDE_CODE_OAUTH_TOKEN (no --bare) else ANTHROPIC_API_KEY; read-only by default; GADFLY_CLAUDE_* knobs. Dockerfile bundles Node + @anthropic-ai/claude-code. Also bumped the dogfood pin to the status-board image (PR #2 was the first dogfood with the live board + full fleet). Folded in the swarm's own review findings: minimal subprocess env (no GITEA_TOKEN leak to the CLI), runPass robustness (ctx/empty-result/runErr), process-group cleanup on timeout, rune-safe error truncation, and engine-neutral prompts (also de-mort-ified the recheck prompt). 66 findings graded via the gadfly MCP. gofmt clean, go vet quiet, go build + go test -race green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Co-authored-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com> Co-committed-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
2026-06-27 20:40:41 +00:00
parent c3d09d3bd4
commit 86f12c126f
13 changed files with 635 additions and 44 deletions
@@ -0,0 +1,227 @@
+package main
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"syscall"
+	"unicode/utf8"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+// reviewEngine runs a single agent pass against the checked-out repo and returns
+// the model's text answer. Both review passes — the draft review and the
+// adversarial recheck — go through this one primitive, which keeps the rest of
+// the pipeline (specialist composition, recheck orchestration, consolidation,
+// emit) engine-agnostic. Two implementations:
+//
+//   - majordomoEngine: the original path — a majordomo tool-using agent loop
+//     (read_file/grep/… over a sandboxed repoFS).
+//   - claudeCodeEngine: shells out to the `claude` CLI in print mode, which
+//     brings its OWN repo tools; gadfly just feeds it the prompt and reads back
+//     the final text.
+//
+// maxSteps is the tool-step budget for engines that have one (majordomo); the
+// claude-code engine manages its own loop and ignores it.
+type reviewEngine interface {
+	// runPass executes one pass with the given system prompt and task prompt
+	// and returns the engine's final text answer, or an error if the pass
+	// failed.
+	runPass(ctx context.Context, system, task string, maxSteps int) (string, error)
+}
+
+// majordomoEngine drives the in-process majordomo agent over the repo sandbox.
+// Both fields are handed to runAgent unchanged on every pass.
+type majordomoEngine struct {
+	mdl     llm.Model // model the agent loop runs against
+	fsTools *repoFS   // repository tools made available to the agent
+}
+
+// runPass satisfies reviewEngine by delegating the pass to runAgent with this
+// engine's model and repo tools, forwarding the prompts and the step budget.
+func (e *majordomoEngine) runPass(ctx context.Context, system, task string, maxSteps int) (string, error) {
+	answer, err := runAgent(ctx, e.mdl, e.fsTools, system, task, maxSteps)
+	return answer, err
+}
+
+// claudeCodeEngine reviews by shelling out to the `claude` CLI (Claude Code) in
+// non-interactive print mode. Claude Code reads the checked-out tree with its
+// own read tools (so it verifies findings against real code, like the agentic
+// majordomo path), and we parse its final answer out of `--output-format json`.
+//
+// Auth is inherited from the environment: the default backend is a Pro/Max
+// subscription via CLAUDE_CODE_OAUTH_TOKEN (no `--bare`). See README.
+//
+// Empty model / permissionMode / allowedTools values cause the corresponding
+// CLI flag to be omitted from the argv entirely.
+type claudeCodeEngine struct {
+	bin            string   // CLI binary (GADFLY_CLAUDE_BIN, default "claude")
+	model          string   // --model value ("" = CLI default)
+	repoDir        string   // cwd for the CLI, so its tools read the checked-out tree
+	permissionMode string   // --permission-mode (default "plan": read-only, no edits)
+	allowedTools   string   // --allowedTools value, passed verbatim ("" = omit)
+	extraArgs      []string // appended verbatim (GADFLY_CLAUDE_EXTRA_ARGS)
+}
+
+// isClaudeCodeSpec reports whether a GADFLY_MODEL spec selects the claude-code
+// engine. Surrounding whitespace is ignored; the spec matches when it is exactly
+// "claude-code" or starts with "claude-code/" (the "claude-code/<model>" form).
+func isClaudeCodeSpec(model string) bool {
+	const id = "claude-code"
+	spec := strings.TrimSpace(model)
+	if spec == id {
+		return true
+	}
+	return strings.HasPrefix(spec, id+"/")
+}
+
+// newClaudeCodeEngine assembles a claudeCodeEngine from the GADFLY_MODEL spec
+// and the optional GADFLY_CLAUDE_* environment overrides.
+//
+// Model resolution: a non-blank GADFLY_CLAUDE_MODEL wins; otherwise the text
+// after the first "/" in the spec is used (e.g. "claude-code/sonnet" →
+// "sonnet"); a bare "claude-code" leaves the model empty so no --model flag is
+// passed. The CLI's presence is not checked here — a missing binary surfaces
+// as a normal pass error (advisory, never fatal).
+func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine {
+	eng := &claudeCodeEngine{
+		bin:            envOr("GADFLY_CLAUDE_BIN", "claude"),
+		repoDir:        repoDir,
+		permissionMode: envOr("GADFLY_CLAUDE_PERMISSION_MODE", "plan"),
+		allowedTools:   strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_ALLOWED_TOOLS")),
+		extraArgs:      strings.Fields(os.Getenv("GADFLY_CLAUDE_EXTRA_ARGS")),
+	}
+
+	// The explicit env override takes precedence over the spec suffix.
+	eng.model = strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL"))
+	if eng.model != "" {
+		return eng
+	}
+	if _, suffix, found := strings.Cut(strings.TrimSpace(spec), "/"); found {
+		eng.model = strings.TrimSpace(suffix)
+	}
+	return eng
+}
+
+// args builds the `claude` argv for one pass. It is pure (no I/O), so it can be
+// unit-tested without invoking the CLI. The task is the -p prompt and the
+// system prompt is layered on top of Claude Code's own via
+// --append-system-prompt. Optional flags are emitted only when their value is
+// non-empty, in the fixed order --model, --permission-mode, --allowedTools,
+// followed by the extra args verbatim.
+func (e *claudeCodeEngine) args(system, task string) []string {
+	argv := []string{
+		"-p", task,
+		"--output-format", "json",
+		"--append-system-prompt", system,
+	}
+	optional := [][2]string{
+		{"--model", e.model},
+		{"--permission-mode", e.permissionMode},
+		{"--allowedTools", e.allowedTools},
+	}
+	for _, opt := range optional {
+		if opt[1] != "" {
+			argv = append(argv, opt[0], opt[1])
+		}
+	}
+	return append(argv, e.extraArgs...)
+}
+
+// claudeResult is the subset of `claude --output-format json` we read.
+type claudeResult struct {
+	Result  string `json:"result"`   // final answer text; used as error detail when IsError is set
+	IsError bool   `json:"is_error"` // true when the CLI reports the run itself as failed
+	Subtype string `json:"subtype"`  // CLI-provided result subtype, echoed in error messages
+}
+
+// runPass runs one `claude` print-mode pass in the repo directory and returns
+// the CLI's final answer text. The step budget parameter is ignored: the CLI
+// manages its own loop.
+//
+// Outcomes, in order of precedence:
+//   - ctx cancelled or timed out: an error wrapping ctx.Err() (so callers can
+//     errors.Is it against context.DeadlineExceeded / context.Canceled), never
+//     partial output.
+//   - clean exit with a parseable JSON envelope: the trimmed result, or an
+//     error when the envelope flags is_error or the result is empty.
+//   - non-zero exit / start failure: an error carrying the CLI's structured
+//     message when present, else truncated stderr, else truncated stdout.
+//   - clean exit with non-JSON stdout: the raw trimmed text.
+func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ int) (string, error) {
+	cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...)
+	cmd.Dir = e.repoDir
+	cmd.Env = claudeEnv() // minimal env — don't hand GITEA_TOKEN et al. to the CLI
+	// Put the CLI and the Node children it spawns in their own process group and
+	// kill the WHOLE group on context cancel, so a timed-out lens can't leave
+	// orphaned claude/node processes behind in the container.
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			// Best effort: the group may already be gone, and there is nothing
+			// useful to do with a kill failure here.
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	runErr := cmd.Run()
+
+	// A cancelled/timed-out run must surface as an error, never as whatever
+	// partial bytes the CLI flushed before it was killed. Wrap with %w so the
+	// context sentinel stays matchable further up the stack.
+	if err := ctx.Err(); err != nil {
+		return "", fmt.Errorf("claude -p: %w", err)
+	}
+
+	var res claudeResult
+	parsed := json.Unmarshal(bytes.TrimSpace(stdout.Bytes()), &res) == nil
+
+	// Clean exit: trust the parsed JSON answer, and ONLY it — never fall back to
+	// the raw JSON envelope when the result is empty.
+	if runErr == nil && parsed {
+		if res.IsError {
+			return "", fmt.Errorf("claude reported error (%s): %s", res.Subtype, truncateForErr(res.Result))
+		}
+		if out := strings.TrimSpace(res.Result); out != "" {
+			return out, nil
+		}
+		return "", fmt.Errorf("claude -p returned an empty result")
+	}
+
+	if runErr != nil {
+		// Prefer the CLI's own structured error message when it gave one.
+		if parsed && res.IsError && strings.TrimSpace(res.Result) != "" {
+			return "", fmt.Errorf("claude reported error (%s): %s", res.Subtype, truncateForErr(res.Result))
+		}
+		detail := truncateForErr(stderr.String())
+		if detail == "" {
+			detail = truncateForErr(stdout.String())
+		}
+		if detail != "" {
+			return "", fmt.Errorf("claude -p failed: %v: %s", runErr, detail)
+		}
+		return "", fmt.Errorf("claude -p failed: %v", runErr)
+	}
+
+	// Clean exit but stdout wasn't the expected JSON envelope: degrade to the raw
+	// text so a CLI format change still yields a review instead of nothing.
+	if raw := strings.TrimSpace(stdout.String()); raw != "" {
+		return raw, nil
+	}
+	return "", fmt.Errorf("claude -p produced no parseable output")
+}
+
+// claudeEnv builds a minimal environment for the `claude` subprocess: only what
+// the CLI needs (PATH/HOME, its auth tokens, locale, Node/XDG/GADFLY_CLAUDE_*
+// knobs), deliberately dropping the rest of the runner's secrets — GITEA_TOKEN,
+// GADFLY_FINDINGS_TOKEN, provider keys — so they never reach the third-party
+// CLI. Defense in depth: the parent already holds them, but the CLI has no need.
+//
+// The result is always non-nil, even when no variable passes the filter:
+// exec.Cmd treats a nil Env as "inherit the parent's environment", which would
+// silently hand the subprocess every secret this function exists to withhold.
+func claudeEnv() []string {
+	keep := func(k string) bool {
+		switch k {
+		case "PATH", "HOME", "USER", "LOGNAME", "TMPDIR", "LANG", "TERM", "SHELL":
+			return true
+		}
+		return strings.HasPrefix(k, "LC_") ||
+			strings.HasPrefix(k, "CLAUDE_") ||
+			strings.HasPrefix(k, "ANTHROPIC_") ||
+			strings.HasPrefix(k, "GADFLY_CLAUDE_") ||
+			strings.HasPrefix(k, "NODE_") ||
+			strings.HasPrefix(k, "XDG_")
+	}
+	// Deliberately an empty, non-nil slice (see the doc comment above).
+	env := []string{}
+	for _, kv := range os.Environ() {
+		if k, _, ok := strings.Cut(kv, "="); ok && keep(k) {
+			env = append(env, kv)
+		}
+	}
+	return env
+}
+
+// truncateForErr trims surrounding whitespace from s and caps it at 800 bytes so
+// a stderr dump can't bloat the comment. When it has to cut, it backs up to the
+// nearest rune boundary (so the output is never invalid UTF-8) and appends "…".
+func truncateForErr(s string) string {
+	const limit = 800
+	trimmed := strings.TrimSpace(s)
+	if len(trimmed) <= limit {
+		return trimmed
+	}
+	end := limit
+	for ; end > 0; end-- {
+		if utf8.RuneStart(trimmed[end]) {
+			break
+		}
+	}
+	return trimmed[:end] + "…"
+}
+
+// envOr looks up the environment variable name and returns its whitespace-
+// trimmed value, or def when the variable is unset or blank.
+func envOr(name, def string) string {
+	val := strings.TrimSpace(os.Getenv(name))
+	if val == "" {
+		return def
+	}
+	return val
+}
@@ -0,0 +1,219 @@
+package main
+
+import (
+	"context"
+	"os"
+	"slices"
+	"strings"
+	"testing"
+	"unicode/utf8"
+)
+
+// TestIsClaudeCodeSpec checks which GADFLY_MODEL specs select the claude-code
+// engine: the bare id and any "claude-code/<model>" form (after trimming), and
+// nothing else.
+func TestIsClaudeCodeSpec(t *testing.T) {
+	table := []struct {
+		spec string
+		want bool
+	}{
+		{"claude-code", true},
+		{"claude-code/sonnet", true},
+		{"claude-code/opus", true},
+		{"claude-code/claude-opus-4-8", true},
+		{"  claude-code  ", true}, // trimmed
+		{"qwen3-coder:480b-cloud", false},
+		{"claude-code-extra", false}, // not the bare id, not a "/" form
+		{"sonnet", false},
+		{"", false},
+	}
+	for _, tc := range table {
+		got := isClaudeCodeSpec(tc.spec)
+		if got == tc.want {
+			continue
+		}
+		t.Errorf("isClaudeCodeSpec(%q) = %v, want %v", tc.spec, got, tc.want)
+	}
+}
+
+// TestNewClaudeCodeEngineModel covers model resolution: the spec's "/<model>"
+// suffix, the bare spec (no model), and the GADFLY_CLAUDE_MODEL override.
+func TestNewClaudeCodeEngineModel(t *testing.T) {
+	steps := []struct {
+		envModel string // GADFLY_CLAUDE_MODEL for this step
+		spec     string
+		model    string // expected engine model
+		want     string // wording used in the failure message
+	}{
+		// model derived from the spec's "/<model>" suffix
+		{"", "claude-code/sonnet", "sonnet", "sonnet"},
+		// bare spec → CLI default (no --model)
+		{"", "claude-code", "", "empty for bare spec"},
+		// GADFLY_CLAUDE_MODEL overrides the spec suffix
+		{"opus", "claude-code/sonnet", "opus", "opus (env override)"},
+	}
+	for _, st := range steps {
+		t.Setenv("GADFLY_CLAUDE_MODEL", st.envModel)
+		if got := newClaudeCodeEngine(st.spec, "/repo").model; got != st.model {
+			t.Errorf("model = %q, want %s", got, st.want)
+		}
+	}
+}
+
+// TestClaudeCodeEngineDefaults verifies the defaults applied when every
+// GADFLY_CLAUDE_* knob is blank: binary "claude", permission mode "plan", and
+// the repo directory passed through untouched.
+func TestClaudeCodeEngineDefaults(t *testing.T) {
+	for _, knob := range []string{
+		"GADFLY_CLAUDE_BIN",
+		"GADFLY_CLAUDE_PERMISSION_MODE",
+		"GADFLY_CLAUDE_ALLOWED_TOOLS",
+		"GADFLY_CLAUDE_EXTRA_ARGS",
+	} {
+		t.Setenv(knob, "")
+	}
+
+	eng := newClaudeCodeEngine("claude-code", "/repo")
+	checks := []struct{ field, got, want string }{
+		{"bin", eng.bin, "claude"},
+		{"permissionMode", eng.permissionMode, "plan"},
+		{"repoDir", eng.repoDir, "/repo"},
+	}
+	for _, c := range checks {
+		if c.got != c.want {
+			t.Errorf("%s = %q, want %s", c.field, c.got, c.want)
+		}
+	}
+}
+
+// argAfter returns the element that immediately follows the first occurrence
+// of flag in args, or "" when flag is absent or is the last element.
+func argAfter(args []string, flag string) string {
+	pos := slices.Index(args, flag)
+	if pos < 0 || pos+1 >= len(args) {
+		return ""
+	}
+	return args[pos+1]
+}
+
+// TestClaudeCodeArgs builds an engine with a model suffix, allowed tools and
+// extra args, then checks every flag/value pair in the resulting argv.
+func TestClaudeCodeArgs(t *testing.T) {
+	t.Setenv("GADFLY_CLAUDE_MODEL", "")
+	t.Setenv("GADFLY_CLAUDE_PERMISSION_MODE", "")
+	t.Setenv("GADFLY_CLAUDE_ALLOWED_TOOLS", "Read,Grep,Glob")
+	t.Setenv("GADFLY_CLAUDE_EXTRA_ARGS", "--max-turns 30")
+
+	argv := newClaudeCodeEngine("claude-code/sonnet", "/repo").args("SYS-PROMPT", "TASK-PROMPT")
+
+	// task is the -p value; json output; system appended; model + policy present.
+	expected := []struct{ flag, value string }{
+		{"-p", "TASK-PROMPT"},
+		{"--output-format", "json"},
+		{"--append-system-prompt", "SYS-PROMPT"},
+		{"--model", "sonnet"},
+		{"--permission-mode", "plan"},
+		{"--allowedTools", "Read,Grep,Glob"},
+	}
+	for _, exp := range expected {
+		if got := argAfter(argv, exp.flag); got != exp.value {
+			t.Errorf("%s = %q, want %s", exp.flag, got, exp.value)
+		}
+	}
+
+	// extra args appended verbatim (split on whitespace)
+	joined := strings.Join(argv, " ")
+	if !strings.Contains(joined, "--max-turns 30") {
+		t.Errorf("extra args not appended: %v", argv)
+	}
+}
+
+// TestClaudeCodeArgsBareModelOmitsFlag checks that optional flags with no value
+// are left out of the argv rather than passed with an empty argument.
+func TestClaudeCodeArgsBareModelOmitsFlag(t *testing.T) {
+	t.Setenv("GADFLY_CLAUDE_MODEL", "")
+	t.Setenv("GADFLY_CLAUDE_ALLOWED_TOOLS", "") // omit when blank
+	t.Setenv("GADFLY_CLAUDE_EXTRA_ARGS", "")
+
+	argv := newClaudeCodeEngine("claude-code", "/repo").args("s", "t")
+	absent := []struct{ flag, when string }{
+		{"--model", "for a bare claude-code spec"},
+		{"--allowedTools", "when blank"},
+	}
+	for _, a := range absent {
+		if !slices.Contains(argv, a.flag) {
+			continue
+		}
+		t.Errorf("%s should be omitted %s: %v", a.flag, a.when, argv)
+	}
+}
+
+// TestClaudeEnvFilters seeds the environment with runner secrets and CLI
+// settings, then checks that claudeEnv keeps the latter and drops the former.
+func TestClaudeEnvFilters(t *testing.T) {
+	seed := [][2]string{
+		{"GITEA_TOKEN", "secret-gitea"},
+		{"OLLAMA_API_KEY", "secret-ollama"},
+		{"GADFLY_API_KEY", "secret-gadfly"},
+		{"GADFLY_FINDINGS_TOKEN", "secret-findings"},
+		{"CLAUDE_CODE_OAUTH_TOKEN", "keep-claude"},
+		{"ANTHROPIC_API_KEY", "keep-anthropic"},
+		{"GADFLY_CLAUDE_MODEL", "keep-knob"},
+	}
+	for _, kv := range seed {
+		t.Setenv(kv[0], kv[1])
+	}
+
+	// Index the filtered environment by variable name.
+	present := make(map[string]bool)
+	for _, kv := range claudeEnv() {
+		if name, _, ok := strings.Cut(kv, "="); ok {
+			present[name] = true
+		}
+	}
+
+	// kept: the CLI's auth + its own knobs + PATH
+	for _, name := range []string{"CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY", "GADFLY_CLAUDE_MODEL", "PATH"} {
+		if !present[name] {
+			t.Errorf("claudeEnv dropped %s, but it should be kept", name)
+		}
+	}
+	// dropped: the runner's secrets the CLI doesn't need
+	for _, name := range []string{"GITEA_TOKEN", "OLLAMA_API_KEY", "GADFLY_API_KEY", "GADFLY_FINDINGS_TOKEN"} {
+		if present[name] {
+			t.Errorf("claudeEnv leaked %s into the subprocess env", name)
+		}
+	}
+}
+
+// TestTruncateForErrRuneSafe feeds truncateForErr a multibyte string well over
+// the 800-byte cap and checks the cut stays valid UTF-8 and ends in an
+// ellipsis; it also checks that short input is only trimmed.
+func TestTruncateForErrRuneSafe(t *testing.T) {
+	// 900 runes of 3 bytes each: the byte cap cannot fall on every rune
+	// boundary, so a naive byte slice would split a rune.
+	long := strings.Repeat("€", 900)
+	cut := truncateForErr(long)
+	switch {
+	case !utf8.ValidString(cut):
+		t.Fatalf("truncateForErr produced invalid UTF-8")
+	case !strings.HasSuffix(cut, "…"):
+		t.Fatalf("truncateForErr should append an ellipsis when truncating")
+	}
+
+	// short strings pass through untouched
+	if short := truncateForErr("  hi  "); short != "hi" {
+		t.Fatalf("truncateForErr should trim and pass short strings through")
+	}
+}
+
+// stubClaude writes an executable /bin/sh stub into a fresh temp dir. The stub
+// prints body verbatim (no trailing newline) and exits with code. The returned
+// engine uses the stub as its binary and the temp dir as its repo directory.
+func stubClaude(t *testing.T, body string, code int) *claudeCodeEngine {
+	t.Helper()
+	workDir := t.TempDir()
+	stubPath := workDir + "/claude-stub.sh"
+	lines := []string{
+		"#!/bin/sh",
+		"printf '%s' " + shSingleQuote(body),
+		"exit " + itoa(code),
+		"", // trailing newline after the exit line
+	}
+	content := []byte(strings.Join(lines, "\n"))
+	if err := os.WriteFile(stubPath, content, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	return &claudeCodeEngine{bin: stubPath, repoDir: workDir}
+}
+
+// shSingleQuote wraps s in single quotes for a POSIX shell, replacing each
+// embedded single quote with the close-escape-reopen sequence '\''.
+func shSingleQuote(s string) string {
+	escaped := strings.ReplaceAll(s, "'", `'\''`)
+	return "'" + escaped + "'"
+}
+
+// itoa renders a single-digit exit code (0–9) as its decimal character; larger
+// values are not supported.
+func itoa(i int) string {
+	digit := rune('0' + i)
+	return string(digit)
+}
+
+// TestRunPassCleanResult: exit 0 with a well-formed envelope yields exactly the
+// "result" text and no error.
+func TestRunPassCleanResult(t *testing.T) {
+	eng := stubClaude(t, `{"result":"REVIEW TEXT","is_error":false}`, 0)
+	got, err := eng.runPass(context.Background(), "sys", "task", 0)
+	if err == nil && got == "REVIEW TEXT" {
+		return
+	}
+	t.Fatalf("clean result: got (%q, %v), want (REVIEW TEXT, nil)", got, err)
+}
+
+// TestRunPassEmptyResultIsError: the JSON parses and the CLI exits 0, but the
+// result is empty — that must be an error, and the raw JSON envelope must NOT
+// be returned in its place.
+func TestRunPassEmptyResultIsError(t *testing.T) {
+	eng := stubClaude(t, `{"result":"","is_error":false}`, 0)
+	got, err := eng.runPass(context.Background(), "sys", "task", 0)
+	switch {
+	case err == nil:
+		t.Fatalf("empty result should be an error, got out=%q", got)
+	case strings.Contains(got, "{"):
+		t.Fatalf("empty result must not leak raw JSON, got %q", got)
+	}
+}
+
+// TestRunPassIsErrorFlag: an envelope with is_error set must surface as a
+// "claude reported error" failure even though the process exited 0.
+func TestRunPassIsErrorFlag(t *testing.T) {
+	eng := stubClaude(t, `{"result":"boom","is_error":true,"subtype":"error_max_turns"}`, 0)
+	_, err := eng.runPass(context.Background(), "sys", "task", 0)
+	if err != nil && strings.Contains(err.Error(), "claude reported error") {
+		return
+	}
+	t.Fatalf("is_error should surface as an error, got %v", err)
+}
+
+// TestRunPassNonZeroNoJSON: a non-zero exit with plain-text output must produce
+// a "claude -p failed" error.
+func TestRunPassNonZeroNoJSON(t *testing.T) {
+	eng := stubClaude(t, "fatal: auth failed", 1)
+	_, err := eng.runPass(context.Background(), "sys", "task", 0)
+	if err != nil && strings.Contains(err.Error(), "claude -p failed") {
+		return
+	}
+	t.Fatalf("non-zero exit should error with detail, got %v", err)
+}
@@ -104,7 +104,7 @@ func TestRunSpecialists_FansOut(t *testing.T) {
 	}
 	specs := threeLenses()

-	results := runSpecialists(mdl, fs, "sys", specs, "task", "diff")
+	results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff")

 	if got := peak(); got != 3 {
 		t.Errorf("peak concurrent lenses = %d, want 3", got)
@@ -124,7 +124,7 @@ func TestRunSpecialists_SequentialByDefault(t *testing.T) {
 	}
 	specs := threeLenses()

-	results := runSpecialists(mdl, fs, "sys", specs, "task", "diff")
+	results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff")

 	if got := peak(); got != 1 {
 		t.Errorf("peak concurrent lenses = %d, want 1 (sequential by default)", got)
@@ -146,7 +146,7 @@ func TestRunSpecialists_PerProviderFanOut(t *testing.T) {
 	}
 	specs := threeLenses()

-	results := runSpecialists(mdl, fs, "sys", specs, "task", "diff")
+	results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff")

 	if got := peak(); got != 3 {
 		t.Errorf("peak concurrent lenses = %d, want 3 (m1 per-provider override)", got)
@@ -149,17 +149,27 @@ func run() error {
 		return err
 	}

-	mdl, err := resolveModel()
-	if err != nil {
-		return fmt.Errorf("resolve model: %w", err)
-	}
-
-	// Optional cheap worker for delegate_investigation. Non-fatal: a bad worker
-	// spec just disables delegation rather than sinking the review.
-	if worker, werr := resolveWorkerModel(); werr != nil {
-		fmt.Fprintln(os.Stderr, "gadfly: worker model disabled:", werr)
-	} else if worker != nil {
-		fsTools.worker = worker
+	// Resolve the review engine. The claude-code engine shells out to the
+	// `claude` CLI (its own repo tools); every other spec is a majordomo model.
+	// auto-selection and the delegate worker are majordomo-only — with
+	// claude-code they're skipped (Claude Code does its own legwork).
+	ccSpec := isClaudeCodeSpec(os.Getenv("GADFLY_MODEL"))
+	var eng reviewEngine
+	if ccSpec {
+		eng = newClaudeCodeEngine(os.Getenv("GADFLY_MODEL"), fsTools.root)
+	} else {
+		mdl, merr := resolveModel()
+		if merr != nil {
+			return fmt.Errorf("resolve model: %w", merr)
+		}
+		// Optional cheap worker for delegate_investigation. Non-fatal: a bad
+		// worker spec just disables delegation rather than sinking the review.
+		if worker, werr := resolveWorkerModel(); werr != nil {
+			fmt.Fprintln(os.Stderr, "gadfly: worker model disabled:", werr)
+		} else if worker != nil {
+			fsTools.worker = worker
+		}
+		eng = &majordomoEngine{mdl: mdl, fsTools: fsTools}
 	}

 	specialists, registry, auto, serrs := resolveSpecialists(repoDir)
@@ -168,20 +178,26 @@ func run() error {
 	}

 	// Dynamic selection: a (cheap) model picks the lenses this diff needs.
+	// Majordomo-only — the selector is an llm.Model.
 	if auto {
-		selector, serr := resolveSelectorModel(mdl)
-		if serr != nil {
-			return fmt.Errorf("resolve selector model: %w", serr)
-		}
-		selCtx, cancel := context.WithTimeout(context.Background(), autoSelectTimeout)
-		picked, aerr := autoSelectSpecialists(selCtx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry)
-		cancel()
-		if aerr != nil {
-			fmt.Fprintln(os.Stderr, "gadfly: auto-select failed; falling back to the default suite:", aerr)
+		if ccSpec {
+			fmt.Fprintln(os.Stderr, "gadfly: auto-select is not supported with the claude-code engine; using the default suite")
 			specialists = suiteFromRegistry(registry, defaultSuite)
 		} else {
-			specialists = picked
-			fmt.Fprintln(os.Stderr, "gadfly: auto-selected specialists:", specialistNamesOf(specialists))
+			selector, serr := resolveSelectorModel(eng.(*majordomoEngine).mdl)
+			if serr != nil {
+				return fmt.Errorf("resolve selector model: %w", serr)
+			}
+			selCtx, cancel := context.WithTimeout(context.Background(), autoSelectTimeout)
+			picked, aerr := autoSelectSpecialists(selCtx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry)
+			cancel()
+			if aerr != nil {
+				fmt.Fprintln(os.Stderr, "gadfly: auto-select failed; falling back to the default suite:", aerr)
+				specialists = suiteFromRegistry(registry, defaultSuite)
+			} else {
+				specialists = picked
+				fmt.Fprintln(os.Stderr, "gadfly: auto-selected specialists:", specialistNamesOf(specialists))
+			}
 		}
 	}

@@ -191,7 +207,7 @@ func run() error {

 	base := string(systemBytes)
 	task := buildTask(diff)
-	results := runSpecialists(mdl, fsTools, base, specialists, task, diff)
+	results := runSpecialists(eng, base, specialists, task, diff)

 	fmt.Println(renderConsolidated(results))

@@ -215,7 +231,7 @@ func run() error {
 // per-provider model concurrency, so total concurrent backend requests ≈
 // (models at once) × (lenses at once). To fan lenses out without oversubscribing
 // the backend, run models one at a time (provider lane cap 1) and raise this.
-func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []Specialist, task, diff string) []specialistResult {
+func runSpecialists(eng reviewEngine, base string, specialists []Specialist, task, diff string) []specialistResult {
 	results := make([]specialistResult, len(specialists))

 	// Optional live status board: publishes this model's per-lens progress to a
@@ -244,7 +260,7 @@ func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []S
 				}
 			}()
 			sw.set(sp.Name, lensRunning, "", false)
-			out, errored := reviewWithSpecialist(mdl, fsTools, base, sp, task, diff)
+			out, errored := reviewWithSpecialist(eng, base, sp, task, diff)
 			v := parseVerdict(out)
 			results[i] = specialistResult{spec: sp, out: out, verdict: v, errored: errored}
 			sw.set(sp.Name, lensFinished, v.label(), errored)
@@ -290,12 +306,12 @@ func providerOverride(envName, provider string) (int, bool) {
 // specialist's composed prompt, then the shared adversarial recheck pass. The
 // returned bool is true when the review pass failed (rendered as an inline
 // notice — advisory; one lens failing never sinks the others or the job).
-func reviewWithSpecialist(mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) (string, bool) {
+func reviewWithSpecialist(eng reviewEngine, base string, sp Specialist, task, diff string) (string, bool) {
 	timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()

-	draft, err := runAgent(ctx, mdl, fsTools, composeSpecialistPrompt(base, sp), task,
+	draft, err := eng.runPass(ctx, composeSpecialistPrompt(base, sp), task,
 		envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "gadfly: specialist %q review pass failed: %v\n", sp.Name, err)
@@ -304,7 +320,7 @@ func reviewWithSpecialist(mdl llm.Model, fsTools *repoFS, base string, sp Specia

 	final := draft
 	if shouldRecheck(draft) {
-		rechecked, rerr := runAgent(ctx, mdl, fsTools, recheckSystemPrompt, buildRecheckTask(draft, diff),
+		rechecked, rerr := eng.runPass(ctx, recheckSystemPrompt, buildRecheckTask(draft, diff),
 			envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps))
 		if rerr != nil {
 			fmt.Fprintf(os.Stderr, "gadfly: specialist %q recheck failed; emitting unverified draft: %v\n", sp.Name, rerr)
@@ -415,7 +431,7 @@ func buildTask(diff string) string {
 	truncNote := ""
 	if maxDiff > 0 && len(diff) > maxDiff {
 		diff = diff[:maxDiff]
-		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; call get_diff for the full text.]", maxDiff)
+		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; read the changed files (or call get_diff, if available) for the full text.]", maxDiff)
 	}

 	var b strings.Builder
@@ -425,7 +441,7 @@ func buildTask(diff string) string {
 	if strings.TrimSpace(body) != "" {
 		fmt.Fprintf(&b, "PR description:\n%s\n\n", body)
 	}
-	b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your tools (read_file, grep, find_files) to verify it against the actual checked-out code — do not rely on the diff alone.\n\n")
+	b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your repository read tools to verify it against the actual checked-out code — do not rely on the diff alone.\n\n")
 	fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote)
 	return b.String()
 }
@@ -16,15 +16,14 @@ const defaultRecheckMaxSteps = 16
 // against the real code before letting it survive — the antidote to a
 // single-pass reviewer that reads a couple of files, mis-connects them, and
 // posts a confident but wrong "blocking" verdict.
-const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review of the
-"mort" project (a large Go Discord bot). You are given a DRAFT review produced
-by another model. Your job is NOT to write a new review — it is to confirm or
-reject each finding in the draft against the ACTUAL code, then output the
-corrected review.
+const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review. You are
+given a DRAFT review produced by another model. Your job is NOT to write a new
+review — it is to confirm or reject each finding in the draft against the ACTUAL
+code, then output the corrected review.

-You have the same read-only repository tools as the original reviewer:
- read_file(path[, start_line, limit]), list_dir([path]), grep(pattern[, path,
-  max_results]), find_files(name[, max_results]), get_diff().
+You have read-only access to the checked-out repository — use your tools to read
+files and search the code to independently verify each finding against the real
+source.

 For EVERY finding in the draft:
 1. Independently reproduce the reasoning by reading the actual files with your
@@ -84,7 +83,7 @@ func buildRecheckTask(draft, diff string) string {
 	truncNote := ""
 	if maxDiff > 0 && len(diff) > maxDiff {
 		diff = diff[:maxDiff]
-		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff for the full text.]", maxDiff)
+		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; read the changed files (or call get_diff, if available) for the full text.]", maxDiff)
 	}

 	var b strings.Builder