feat: re-platform agentic review onto executus + large-PR cost controls

Fixes the large-PR token burn: a ~250K-token diff was re-sent every agent step across models × lenses × passes, draining a metered usage block in minutes. Small PRs are untouched (every mitigation is size-gated / no-op under threshold). - Re-platform the in-process review path onto executus run.Executor: context compaction (executus/compact, threshold from the model's real context window via executus/model), run-bounding, a per-PR budget gate (Ports.Budget), and the wrap-up nudge re-expressed as a run.Critic. Lens fan-out now uses executus/fanout. gadfly keeps its own model.go, so GADFLY_ENDPOINT_<NAME> aliases and the claude-code engine are unaffected. No majordomo bump; the binary stays static (executus core is majordomo+stdlib only). - Paginate get_diff (per-file `path` + start_line/limit) instead of dumping the whole diff; trim the recheck diff embed (60k -> 20k chars). - entrypoint.sh: downshift the fleet above GADFLY_HUGE_DIFF_BYTES (one cheap model, fewer lenses/steps, no recheck) + a swarm-wide GADFLY_PR_BUDGET_SECS wall-clock backstop (adds procps for pkill). All advisory; CI never fails. - README + CLAUDE.md + tests updated. Note: run.Result exposes no transcript, so the old transcript-based forced- finalization fallback is dropped; the wrap-up critic nudge is the remaining "always emit something" mechanism. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 10:43:34 -04:00
parent 5007597cf9
commit b860119332
19 changed files with 935 additions and 224 deletions
@@ -33,14 +33,18 @@ type reviewEngine interface {
 	runPass(ctx context.Context, system, task string, maxSteps int) (string, error)
 }

-// majordomoEngine drives the in-process majordomo agent over the repo sandbox.
+// majordomoEngine drives the in-process review path over the repo sandbox. It no
+// longer calls majordomo's agent loop directly: each pass runs through an
+// executus run.Executor (see executus.go), which adds context compaction, run
+// bounding, the per-PR budget gate, and the wrap-up critic. mdl is retained only
+// so auto-select can fall back to the review model as its selector.
 type majordomoEngine struct {
-	mdl     llm.Model
-	fsTools *repoFS
+	rex *reviewExecutor
+	mdl llm.Model
 }

 func (e *majordomoEngine) runPass(ctx context.Context, system, task string, maxSteps int) (string, error) {
-	return runAgent(ctx, e.mdl, e.fsTools, system, task, maxSteps)
+	return e.rex.run(ctx, system, task, maxSteps)
 }

 // claudeCodeEngine reviews by shelling out to the `claude` CLI (Claude Code) in
@@ -0,0 +1,364 @@
+package main
+
+// executus.go wires gadfly's agentic review path onto the executus run kernel
+// (gitea.stevedudenhoeffer.com/steve/executus), layered above majordomo. The
+// majordomoEngine no longer drives majordomo's agent loop directly; it builds a
+// run.Executor that gives gadfly, for free:
+//
+//   - context compaction (executus/compact): once the transcript a step would
+//     SEND crosses a token threshold derived from the model's real context
+//     window, the runaway middle is folded into a one-paragraph summary by a
+//     cheap summarizer model — so a big diff + accumulating read_file/grep
+//     results can't balloon every re-sent step (the large-PR burn).
+//   - run bounding + a per-PR spend budget (executus/run Ports.Budget): a hard
+//     token/seconds ceiling so a pathological PR can't drain the usage block.
+//   - the wrap-up nudge, re-expressed as an executus Critic (Ports.Critic): the
+//     steer that tells a step-hungry model to stop investigating and write its
+//     answer is now the critic seam, not a bespoke RunOption.
+//
+// Everything degrades to today's behavior when unconfigured: nil summarizer or a
+// 0 context window disables compaction; nil budget disables the ceiling; the
+// claude-code engine shells out and is unaffected by any of this.
+//
+// gadfly keeps its own model.go resolution (so GADFLY_ENDPOINT_<NAME> http
+// aliases, failover chains, and the claude-code engine all survive) — the
+// run.Executor is handed gadfly's already-resolved model via a trivial resolver,
+// not routed through executus's tier table.
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/compact"
+	"gitea.stevedudenhoeffer.com/steve/executus/model"
+	exrun "gitea.stevedudenhoeffer.com/steve/executus/run"
+	exectool "gitea.stevedudenhoeffer.com/steve/executus/tool"
+)
+
+const (
+	// defaultCompactRatio is the fraction of the model's context window at which
+	// compaction fires. It is deliberately LOWER than executus's own 0.7 default:
+	// on the large-PR burn the per-step transcript is the embedded diff (~17K) plus
+	// accumulating read_file results, which rarely reaches 0.7×262K≈183K — so a
+	// 0.7 threshold never bites. ~0.45×262K≈118K folds the runaway middle while a
+	// transcript is still well under the cap. Override with GADFLY_COMPACT_RATIO.
+	defaultCompactRatio = 0.45
+	// defaultCompactKeepRecent / defaultCompactSummaryWords mirror executus's own
+	// compactor defaults; surfaced as gadfly env knobs for tuning.
+	defaultCompactKeepRecent   = 8
+	defaultCompactSummaryWords = 200
+)
+
+// runSeq mints a unique-per-process RunID suffix for each executor run so audit
+// and the run kernel can tell one pass from another within a binary process.
+var runSeq atomic.Uint64
+
+// wrappedTool adapts an already-built majordomo llm.Tool (gadfly's sandboxed
+// read_file/grep/get_diff/… closures over the repoFS) to executus's tool.Tool
+// interface so the run kernel can build a toolbox from them by name. gadfly's
+// tools need no caller/channel identity, so BuildLLM ignores the Invocation and
+// returns the pre-built tool; Permission is the zero value (private, ungated).
+type wrappedTool struct{ t llm.Tool }
+
+func (w wrappedTool) Name() string                            { return w.t.Name }
+func (w wrappedTool) Description() string                     { return w.t.Description }
+func (w wrappedTool) Permission() exectool.Permission         { return exectool.Permission{} }
+func (w wrappedTool) BuildLLM(_ exectool.Invocation) llm.Tool { return w.t }
+
+// gadflyToolRegistry registers the repo's read-only tools (plus the optional
+// delegate_investigation worker tool) in a fresh executus tool.Registry and
+// returns it along with the tool names for RunnableAgent.LowLevelTools.
+func gadflyToolRegistry(fs *repoFS) (exectool.Registry, []string, error) {
+	reg := exectool.NewRegistry()
+	tools := fs.fsTools()
+	if fs.worker != nil {
+		tools = append(tools, fs.delegateTool())
+	}
+	names := make([]string, 0, len(tools))
+	for _, t := range tools {
+		if err := reg.Register(wrappedTool{t: t}); err != nil {
+			return nil, nil, fmt.Errorf("register tool %q: %w", t.Name, err)
+		}
+		names = append(names, t.Name)
+	}
+	return reg, names, nil
+}
+
+// gadflyBudget is gadfly's per-PR spend ceiling, satisfying run.Ports.Budget.
+// It gates a run BEFORE it makes any model call (Check) once the process has
+// spent its token or wall-clock allowance on this PR. Tokens are fed in
+// out-of-band via addUsage (the Budget interface's Commit only carries seconds);
+// the engine calls addUsage after each pass with run.Result.Usage. A nil
+// *gadflyBudget is never installed — caps of 0 mean "unlimited", so the port is
+// only wired when at least one cap is set.
+type gadflyBudget struct {
+	mu         sync.Mutex
+	maxTokens  int64
+	maxSeconds float64
+	tokens     int64
+	seconds    float64
+}
+
+// newPRBudget builds the per-PR budget from env, or nil when neither cap is set
+// (the default — the swarm-wide ceiling lives in entrypoint.sh; this is the
+// per-process belt to its suspenders).
+func newPRBudget() *gadflyBudget {
+	toks := envInt("GADFLY_PR_TOKEN_BUDGET", 0)
+	secs := envInt("GADFLY_PR_TIME_BUDGET_SECS", 0)
+	if toks <= 0 && secs <= 0 {
+		return nil
+	}
+	return &gadflyBudget{maxTokens: int64(toks), maxSeconds: float64(secs)}
+}
+
+func (b *gadflyBudget) Check(_ context.Context, _ string) error {
+	if b == nil {
+		return nil
+	}
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	if b.maxTokens > 0 && b.tokens >= b.maxTokens {
+		return fmt.Errorf("gadfly: per-PR token budget exhausted (%d/%d)", b.tokens, b.maxTokens)
+	}
+	if b.maxSeconds > 0 && b.seconds >= b.maxSeconds {
+		return fmt.Errorf("gadfly: per-PR time budget exhausted (%.0f/%.0fs)", b.seconds, b.maxSeconds)
+	}
+	return nil
+}
+
+func (b *gadflyBudget) Commit(_ context.Context, _ string, runtimeSeconds float64) {
+	if b == nil {
+		return
+	}
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.seconds += runtimeSeconds
+}
+
+// addUsage records a finished pass's token spend toward the budget. Safe on nil.
+func (b *gadflyBudget) addUsage(u llm.Usage) {
+	if b == nil {
+		return
+	}
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.tokens += int64(u.InputTokens) + int64(u.OutputTokens)
+}
+
+// wrapUpCritic re-expresses gadfly's wrap-up nudge as an executus run.Critic:
+// once a run comes within wrapUpReserve steps of its cap, Steer() injects the
+// "stop calling tools and write your final answer" message so a thorough model
+// spends its last steps finalizing instead of hard-failing empty. It sets no
+// hard deadline (Deadline()==zero) and never raises the step ceiling
+// (MaxSteps()==0, defer to the run's MaxIterations) — it is purely the nudge.
+type wrapUpCritic struct{ reserve int }
+
+func (c *wrapUpCritic) Monitor(_ context.Context, info exrun.RunInfo, _ time.Duration) exrun.CriticHandle {
+	return &wrapUpHandle{maxSteps: info.MaxIterations, reserve: c.reserve}
+}
+
+type wrapUpHandle struct {
+	mu       sync.Mutex
+	maxSteps int
+	reserve  int
+	done     int // steps completed so far
+	nudged   bool
+}
+
+func (h *wrapUpHandle) RecordStep(iter int, _ *llm.Response) {
+	h.mu.Lock()
+	h.done = iter + 1
+	h.mu.Unlock()
+}
+func (h *wrapUpHandle) RecordToolStart(string, string) {}
+func (h *wrapUpHandle) Steer() []llm.Message {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	at := h.maxSteps - h.reserve
+	if at < 1 {
+		at = 1
+	}
+	if !h.nudged && h.maxSteps > 0 && h.done >= at {
+		h.nudged = true
+		return []llm.Message{llm.UserText(wrapUpInstruction)}
+	}
+	return nil
+}
+func (h *wrapUpHandle) Deadline() time.Time { return time.Time{} }
+func (h *wrapUpHandle) MaxSteps() int       { return 0 }
+func (h *wrapUpHandle) KillCause() error    { return nil }
+func (h *wrapUpHandle) Stop()               {}
+
+// reviewExecutor bundles a run.Executor with the per-run wiring the engine needs
+// for each pass (the tool names to expose, the model spec to report as the tier,
+// the per-PR caller id, and the budget to feed token usage into).
+type reviewExecutor struct {
+	ex        *exrun.Executor
+	toolNames []string
+	modelSpec string
+	callerID  string
+	budget    *gadflyBudget
+}
+
+// newReviewExecutor builds the run.Executor for the in-process majordomo review
+// path. mdl is gadfly's already-resolved review model; summarizer is the cheap
+// model the compactor uses (nil disables compaction). Compaction also needs the
+// model's context window (resolved once here, not per pass); a 0 window likewise
+// disables it. The budget (may be nil) becomes the run.Ports.Budget gate.
+func newReviewExecutor(fs *repoFS, mdl, summarizer llm.Model, modelSpec string, budget *gadflyBudget) (*reviewExecutor, error) {
+	reg, names, err := gadflyToolRegistry(fs)
+	if err != nil {
+		return nil, err
+	}
+
+	// gadfly resolves exactly one model per process; the run kernel's resolver
+	// just hands that model back regardless of the tier string it is asked for.
+	modelsResolver := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
+		return ctx, mdl, nil
+	}
+
+	var compactor compact.CompactorFactory
+	var ctxTokens func(string) int
+	if summarizer != nil && compactionEnabled() {
+		if window := resolveContextTokens(modelSpec); window > 0 {
+			sumResolver := func(ctx context.Context, _ string) (context.Context, llm.Model, error) {
+				return ctx, summarizer, nil
+			}
+			compactor = compact.NewCompactor(compact.CompactorConfig{
+				Models:         sumResolver,
+				KeepRecent:     envInt("GADFLY_COMPACT_KEEP_RECENT", defaultCompactKeepRecent),
+				SummaryWordCap: envInt("GADFLY_COMPACT_SUMMARY_WORDS", defaultCompactSummaryWords),
+			})
+			ctxTokens = func(string) int { return window } // memoized: one window per process
+		}
+	}
+
+	var ports exrun.Ports
+	ports.Critic = &wrapUpCritic{reserve: wrapUpReserve()}
+	if budget != nil {
+		ports.Budget = budget
+	}
+
+	cfg := exrun.Config{
+		Registry:      reg,
+		Models:        modelsResolver,
+		Compactor:     compactor,
+		ContextTokens: ctxTokens,
+		Defaults: exrun.Defaults{
+			MaxIterations:            envInt("GADFLY_MAX_STEPS", defaultMaxSteps),
+			MaxRuntime:               time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second,
+			MaxConsecutiveToolErrors: 4,
+			MaxSameToolCallRepeats:   4,
+			CompactionThresholdRatio: compactRatio(),
+			FallbackTier:             modelSpec,
+		},
+		Ports: ports,
+	}
+	return &reviewExecutor{
+		ex:        exrun.New(cfg),
+		toolNames: names,
+		modelSpec: modelSpec,
+		callerID:  prCallerID(),
+		budget:    budget,
+	}, nil
+}
+
+// run executes one agent pass (review or recheck) through the run kernel and
+// returns the model's final text. An empty answer with no error is reported as
+// an error so the caller (reviewWithSpecialist) renders the advisory "reviewer
+// failed to complete" notice rather than a blank section.
+func (r *reviewExecutor) run(ctx context.Context, system, task string, maxSteps int) (string, error) {
+	res := r.ex.Run(ctx, exrun.RunnableAgent{
+		Name:          "gadfly-review",
+		SystemPrompt:  system,
+		ModelTier:     r.modelSpec,
+		MaxIterations: maxSteps,
+		MaxRuntime:    time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second,
+		LowLevelTools: r.toolNames,
+		Critic:        exrun.CriticConfig{Enabled: true},
+	}, exectool.Invocation{
+		RunID:    fmt.Sprintf("gadfly-%d", runSeq.Add(1)),
+		CallerID: r.callerID,
+	}, task)
+
+	// Feed token spend toward the per-PR budget out-of-band (Commit carries only
+	// seconds; the executor already called it). Safe on a nil budget.
+	r.budget.addUsage(res.Usage)
+
+	if res.Err != nil {
+		return "", res.Err
+	}
+	if out := strings.TrimSpace(res.Output); out != "" {
+		return out, nil
+	}
+	return "", errors.New("agent produced no output")
+}
+
+// prCallerID is the budget/audit caller key: the repo + PR, so a budget keyed on
+// it is naturally per-PR. Falls back to "local" for an out-of-CI run.
+func prCallerID() string {
+	repo := strings.TrimSpace(os.Getenv("GADFLY_REPO"))
+	pr := strings.TrimSpace(os.Getenv("GADFLY_PR"))
+	if repo == "" && pr == "" {
+		return "local"
+	}
+	return repo + "#" + pr
+}
+
+// compactionEnabled reports whether context compaction should be wired. On
+// unless GADFLY_COMPACT is explicitly falsey.
+func compactionEnabled() bool {
+	switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_COMPACT"))) {
+	case "0", "false", "no", "off":
+		return false
+	default:
+		return true
+	}
+}
+
+// compactRatio is the compaction threshold as a fraction of the model context
+// window (GADFLY_COMPACT_RATIO), clamped to (0,1]; default defaultCompactRatio.
+func compactRatio() float64 {
+	v := strings.TrimSpace(os.Getenv("GADFLY_COMPACT_RATIO"))
+	if v == "" {
+		return defaultCompactRatio
+	}
+	f, err := strconv.ParseFloat(v, 64)
+	if err != nil || f <= 0 || f > 1 {
+		return defaultCompactRatio
+	}
+	return f
+}
+
+// resolveContextTokens returns the review model's context window in tokens, used
+// to set the compaction threshold. GADFLY_MODEL_CONTEXT_TOKENS overrides it
+// (needed for custom/self-hosted endpoints executus can't introspect); otherwise
+// it asks executus/model, which knows the static catalog and can fetch an
+// Ollama Cloud model's limit via /api/show (one call, at executor-build time).
+// Returns 0 — disabling compaction — for an unknown model, mirroring executus's
+// "unknown ⇒ don't budget" contract.
+func resolveContextTokens(modelSpec string) int {
+	if v := envInt("GADFLY_MODEL_CONTEXT_TOKENS", 0); v > 0 {
+		return v
+	}
+	key := strings.TrimSpace(os.Getenv("GADFLY_API_KEY"))
+	if key == "" {
+		key = strings.TrimSpace(os.Getenv("OLLAMA_API_KEY"))
+	}
+	cache := model.NewCloudOllamaLimitCache("", key, nil)
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+	if n, ok := model.MaxContextTokensResolving(ctx, modelSpec, cache); ok {
+		return n
+	}
+	return 0
+}
@@ -0,0 +1,140 @@
+package main
+
+import (
+	"context"
+	"testing"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+func TestGadflyBudget(t *testing.T) {
+	ctx := context.Background()
+
+	// A nil budget never blocks and never panics.
+	var nilB *gadflyBudget
+	if err := nilB.Check(ctx, "pr"); err != nil {
+		t.Errorf("nil budget Check should be nil, got %v", err)
+	}
+	nilB.Commit(ctx, "pr", 10)
+	nilB.addUsage(llm.Usage{InputTokens: 5})
+
+	// Token ceiling: passes until usage crosses it.
+	b := &gadflyBudget{maxTokens: 100}
+	if err := b.Check(ctx, "pr"); err != nil {
+		t.Fatalf("fresh budget should pass, got %v", err)
+	}
+	b.addUsage(llm.Usage{InputTokens: 60, OutputTokens: 50}) // 110 >= 100
+	if err := b.Check(ctx, "pr"); err == nil {
+		t.Error("budget over the token cap should reject the next run")
+	}
+
+	// Seconds ceiling, accumulated via Commit.
+	s := &gadflyBudget{maxSeconds: 30}
+	s.Commit(ctx, "pr", 31)
+	if err := s.Check(ctx, "pr"); err == nil {
+		t.Error("budget over the time cap should reject the next run")
+	}
+}
+
+func TestNewPRBudget(t *testing.T) {
+	t.Setenv("GADFLY_PR_TOKEN_BUDGET", "")
+	t.Setenv("GADFLY_PR_TIME_BUDGET_SECS", "")
+	if newPRBudget() != nil {
+		t.Error("no caps set should yield a nil (disabled) budget")
+	}
+	t.Setenv("GADFLY_PR_TOKEN_BUDGET", "1000")
+	if b := newPRBudget(); b == nil || b.maxTokens != 1000 {
+		t.Errorf("token cap should build a budget with maxTokens=1000, got %+v", b)
+	}
+}
+
+func TestCompactRatio(t *testing.T) {
+	t.Setenv("GADFLY_COMPACT_RATIO", "")
+	if got := compactRatio(); got != defaultCompactRatio {
+		t.Errorf("default ratio = %v, want %v", got, defaultCompactRatio)
+	}
+	t.Setenv("GADFLY_COMPACT_RATIO", "0.6")
+	if got := compactRatio(); got != 0.6 {
+		t.Errorf("ratio override = %v, want 0.6", got)
+	}
+	for _, bad := range []string{"0", "-1", "2", "nope"} {
+		t.Setenv("GADFLY_COMPACT_RATIO", bad)
+		if got := compactRatio(); got != defaultCompactRatio {
+			t.Errorf("invalid ratio %q should fall back to the default, got %v", bad, got)
+		}
+	}
+}
+
+func TestResolveContextTokensOverride(t *testing.T) {
+	// The explicit override short-circuits any model introspection (no network).
+	t.Setenv("GADFLY_MODEL_CONTEXT_TOKENS", "123456")
+	if got := resolveContextTokens("anything"); got != 123456 {
+		t.Errorf("explicit context-token override = %d, want 123456", got)
+	}
+}
+
+func TestPRCallerID(t *testing.T) {
+	t.Setenv("GADFLY_REPO", "")
+	t.Setenv("GADFLY_PR", "")
+	if got := prCallerID(); got != "local" {
+		t.Errorf("no repo/PR should be %q, got %q", "local", got)
+	}
+	t.Setenv("GADFLY_REPO", "steve/mort")
+	t.Setenv("GADFLY_PR", "1367")
+	if got := prCallerID(); got != "steve/mort#1367" {
+		t.Errorf("callerID = %q, want steve/mort#1367", got)
+	}
+}
+
+func TestCompactionEnabled(t *testing.T) {
+	for _, v := range []string{"", "1", "true", "yes"} {
+		t.Setenv("GADFLY_COMPACT", v)
+		if !compactionEnabled() {
+			t.Errorf("GADFLY_COMPACT=%q should be enabled", v)
+		}
+	}
+	for _, v := range []string{"0", "false", "no", "off"} {
+		t.Setenv("GADFLY_COMPACT", v)
+		if compactionEnabled() {
+			t.Errorf("GADFLY_COMPACT=%q should be disabled", v)
+		}
+	}
+}
+
+func TestGadflyToolRegistry(t *testing.T) {
+	fs, err := newRepoFS(t.TempDir(), "diff")
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, names, err := gadflyToolRegistry(fs)
+	if err != nil {
+		t.Fatalf("gadflyToolRegistry: %v", err)
+	}
+	want := map[string]bool{"read_file": true, "list_dir": true, "grep": true, "find_files": true, "get_diff": true}
+	has := func(n string) bool {
+		for _, g := range names {
+			if g == n {
+				return true
+			}
+		}
+		return false
+	}
+	for n := range want {
+		if !has(n) {
+			t.Errorf("registry missing tool %q (got %v)", n, names)
+		}
+	}
+	if has("delegate_investigation") {
+		t.Error("delegate_investigation must be absent without a worker model")
+	}
+
+	// With a worker model the delegate tool is registered too.
+	fs.worker = fakeModel(t, "x")
+	_, names, err = gadflyToolRegistry(fs)
+	if err != nil {
+		t.Fatalf("gadflyToolRegistry with worker: %v", err)
+	}
+	if !has("delegate_investigation") {
+		t.Errorf("delegate_investigation should be registered with a worker model, got %v", names)
+	}
+}
@@ -104,7 +104,7 @@ func TestRunSpecialists_FansOut(t *testing.T) {
 	}
 	specs := threeLenses()

-	results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff")
+	results := runSpecialists(testEngine(t, mdl, fs), "sys", specs, "task", "diff")

 	if got := peak(); got != 3 {
 		t.Errorf("peak concurrent lenses = %d, want 3", got)
@@ -124,7 +124,7 @@ func TestRunSpecialists_SequentialByDefault(t *testing.T) {
 	}
 	specs := threeLenses()

-	results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff")
+	results := runSpecialists(testEngine(t, mdl, fs), "sys", specs, "task", "diff")

 	if got := peak(); got != 1 {
 		t.Errorf("peak concurrent lenses = %d, want 1 (sequential by default)", got)
@@ -146,7 +146,7 @@ func TestRunSpecialists_PerProviderFanOut(t *testing.T) {
 	}
 	specs := threeLenses()

-	results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff")
+	results := runSpecialists(testEngine(t, mdl, fs), "sys", specs, "task", "diff")

 	if got := peak(); got != 3 {
 		t.Errorf("peak concurrent lenses = %d, want 3 (m1 per-provider override)", got)
@@ -70,11 +70,9 @@ import (
 	"os"
 	"strconv"
 	"strings"
-	"sync"
 	"time"

-	"gitea.stevedudenhoeffer.com/steve/majordomo/agent"
-	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/executus/fanout"
 )

 const (
@@ -107,13 +105,6 @@ const wrapUpInstruction = "⚠️ You are almost out of your investigation budge
 	"Do not begin any new investigation. If a finding could not be confirmed, drop it or mark it explicitly as unverified. " +
 	"Output the review in the required format right now."

-// finalizeInstruction is the user message sent on the tool-free fallback pass
-// when the agent exhausted its budget (or tripped a loop guard) without ever
-// producing a final answer. It forces the model to synthesize whatever it has.
-const finalizeInstruction = "You have run out of investigation steps. Do NOT call any tools. " +
-	"Based solely on what you have already gathered above, write your final answer now in the required format. " +
-	"If you could not confirm some findings, omit them or mark them as unverified, but produce the answer."
-
 func main() {
 	if err := run(); err != nil {
 		fmt.Fprintln(os.Stderr, "gadfly:", err)
@@ -177,7 +168,18 @@ func run() error {
 		} else if worker != nil {
 			fsTools.worker = worker
 		}
-		eng = &majordomoEngine{mdl: mdl, fsTools: fsTools}
+		// The context compactor needs a cheap summarizer; reuse the worker model
+		// when present, else the review model. A bad explicit GADFLY_COMPACT_MODEL
+		// just disables compaction rather than sinking the review.
+		summarizer, serr := resolveSummarizerModel(mdl, fsTools.worker)
+		if serr != nil {
+			fmt.Fprintln(os.Stderr, "gadfly: compaction summarizer disabled:", serr)
+		}
+		rex, rerr := newReviewExecutor(fsTools, mdl, summarizer, os.Getenv("GADFLY_MODEL"), newPRBudget())
+		if rerr != nil {
+			return fmt.Errorf("build review executor: %w", rerr)
+		}
+		eng = &majordomoEngine{rex: rex, mdl: mdl}
 	}

 	specialists, registry, auto, serrs := resolveSpecialists(repoDir)
@@ -231,55 +233,55 @@ func run() error {
 }

 // runSpecialists reviews the diff through each lens and returns the results in
-// the SAME order as specialists, regardless of finish order. Up to
-// GADFLY_LENS_CONCURRENCY lenses run concurrently; the default of 1 keeps the
-// suite sequential, exactly as before. Each lens already runs under its own
-// per-lens timeout (reviewWithSpecialist), so concurrency simply overlaps those
-// independent passes — and because reviewWithSpecialist builds a fresh toolbox
-// per pass and the lenses only read the immutable repoFS, they share no mutable
-// state. Results are stored by index so the consolidated comment keeps the
-// configured lens order.
+// the SAME order as specialists, regardless of finish order. It uses executus's
+// fanout primitive: up to GADFLY_LENS_CONCURRENCY lenses run concurrently (the
+// default of 1 keeps the suite sequential, exactly as before), and fanout.Run
+// returns one result per lens in input order. Each lens already runs under its
+// own per-lens timeout (reviewWithSpecialist) and the lenses only read the
+// immutable repoFS, so concurrency simply overlaps independent passes.
 //
 // Caution: this fans out WITHIN one model. It multiplies with entrypoint.sh's
 // per-provider model concurrency, so total concurrent backend requests ≈
 // (models at once) × (lenses at once). To fan lenses out without oversubscribing
 // the backend, run models one at a time (provider lane cap 1) and raise this.
 func runSpecialists(eng reviewEngine, base string, specialists []Specialist, task, diff string) []specialistResult {
-	results := make([]specialistResult, len(specialists))
-
 	// Optional live status board: publishes this model's per-lens progress to a
 	// file the entrypoint board renders. Inert (no-op) unless GADFLY_STATUS_FILE
 	// is set, so plain runs are unaffected.
 	sw := newStatusWriter(os.Getenv("GADFLY_MODEL"), modelProvider(), specialists)

-	conc := min(lensConcurrency(), len(specialists))
+	fanResults := fanout.Run(context.Background(), specialists, fanout.Options[Specialist]{
+		MaxConcurrent: lensConcurrency(),
+	}, func(_ context.Context, sp Specialist) (res specialistResult, _ error) {
+		// A panic in one lens must not crash the whole binary (which would kill
+		// every other lens's output) or leave this lens stuck at "running" on the
+		// status board. fanout does not recover fn panics, so we do it here:
+		// record the panic as an errored result and mark the lens finished.
+		defer func() {
+			if r := recover(); r != nil {
+				res = specialistResult{spec: sp, out: fmt.Sprintf("⚠️ This reviewer panicked: %v", r), verdict: verdictUnknown, errored: true}
+				sw.set(sp.Name, lensFinished, "", true)
+			}
+		}()
+		sw.set(sp.Name, lensRunning, "", false)
+		out, errored := reviewWithSpecialist(eng, base, sp, task, diff)
+		v := parseVerdict(out)
+		sw.set(sp.Name, lensFinished, v.label(), errored)
+		return specialistResult{spec: sp, out: out, verdict: v, errored: errored}, nil
+	})

-	sem := make(chan struct{}, conc)
-	var wg sync.WaitGroup
-	for i, sp := range specialists {
-		wg.Add(1)
-		sem <- struct{}{} // blocks once `conc` lenses are already in flight
-		go func(i int, sp Specialist) {
-			defer wg.Done()
-			defer func() { <-sem }()
-			// A panic in one lens must not crash the whole binary (which would
-			// kill every other lens's output) or leave this lens stuck at
-			// "running" on the status board. Recover, record it as an errored
-			// result, and mark the lens finished so the board can complete.
-			defer func() {
-				if r := recover(); r != nil {
-					results[i] = specialistResult{spec: sp, out: fmt.Sprintf("⚠️ This reviewer panicked: %v", r), verdict: verdictUnknown, errored: true}
-					sw.set(sp.Name, lensFinished, "", true)
-				}
-			}()
-			sw.set(sp.Name, lensRunning, "", false)
-			out, errored := reviewWithSpecialist(eng, base, sp, task, diff)
-			v := parseVerdict(out)
-			results[i] = specialistResult{spec: sp, out: out, verdict: v, errored: errored}
-			sw.set(sp.Name, lensFinished, v.label(), errored)
-		}(i, sp)
+	// fanout guarantees input order; its Result.Err is set only when the context
+	// is cancelled before a lens ran (reviewWithSpecialist embeds its own failures
+	// in the result), so surface that as an errored lens rather than dropping it.
+	results := make([]specialistResult, len(specialists))
+	for i, r := range fanResults {
+		if r.Err != nil {
+			results[i] = specialistResult{spec: specialists[i], out: fmt.Sprintf("⚠️ This reviewer did not run: %v", r.Err), verdict: verdictUnknown, errored: true}
+			sw.set(specialists[i].Name, lensFinished, "", true)
+			continue
+		}
+		results[i] = r.Value
 	}
-	wg.Wait()
 	return results
 }

@@ -344,90 +346,6 @@ func reviewWithSpecialist(eng reviewEngine, base string, sp Specialist, task, di
 	return final, false
 }

-// runAgent runs one agent pass (its own fresh toolbox over the sandbox) and
-// returns the final answer. An empty answer is an error — the caller decides
-// whether that is fatal (review pass) or recoverable (recheck pass). A
-// non-empty answer that ended on a budget/guard error is still returned: the
-// model wrote its output, then ran out of steps.
-//
-// Two mechanisms keep a step-hungry model from hard-failing with no output:
-//  1. A wrap-up steer: once the run comes within wrapUpReserve steps of the
-//     cap, a forceful "stop calling tools, write your final answer" message is
-//     injected so the model spends its remaining steps finalizing.
-//  2. A finalization fallback: if the loop still ends empty (the model ignored
-//     the nudge, or a loop guard tripped), one tool-free model call forces a
-//     final answer out of the transcript already gathered.
-func runAgent(ctx context.Context, mdl llm.Model, fsTools *repoFS, system, task string, maxSteps int) (string, error) {
-	box, err := fsTools.toolbox()
-	if err != nil {
-		return "", err
-	}
-	loop := agent.New(mdl, system,
-		agent.WithToolbox(box),
-		agent.WithMaxSteps(maxSteps),
-		// Guard rails: stop the model from spinning on failing or identical
-		// tool calls instead of writing its answer.
-		agent.WithToolErrorLimits(4, 4),
-	)
-
-	wrapUpAt := maxSteps - wrapUpReserve()
-	if wrapUpAt < 1 {
-		wrapUpAt = 1
-	}
-	var completed int // steps finished so far (updated after each step)
-	nudged := false
-
-	res, runErr := loop.Run(ctx, task,
-		agent.OnStep(func(s agent.Step) { completed = s.Index + 1 }),
-		agent.WithSteer(func() []llm.Message {
-			if !nudged && completed >= wrapUpAt {
-				nudged = true
-				return []llm.Message{llm.UserText(wrapUpInstruction)}
-			}
-			return nil
-		}),
-	)
-
-	out := ""
-	if res != nil {
-		out = strings.TrimSpace(res.Output)
-	}
-	if out != "" {
-		return out, nil
-	}
-
-	// No final answer. If we still have budget on the clock and a transcript to
-	// work from, force a tool-free finalization rather than losing the pass.
-	if res != nil && len(res.Messages) > 0 && ctx.Err() == nil {
-		if forced := forceFinalAnswer(ctx, mdl, system, res.Messages); forced != "" {
-			return forced, nil
-		}
-	}
-
-	if runErr != nil {
-		return "", runErr
-	}
-	return "", errors.New("agent produced no output")
-}
-
-// forceFinalAnswer makes one tool-free model call to squeeze a final answer out
-// of an agent that exhausted its step budget without producing one. Tools are
-// forbidden (ToolChoice "none") so the model must synthesize from the transcript
-// instead of investigating further. Best-effort: any error or empty reply
-// returns "" and the caller falls back to its normal empty-output handling.
-func forceFinalAnswer(ctx context.Context, mdl llm.Model, system string, transcript []llm.Message) string {
-	msgs := append(append([]llm.Message(nil), transcript...), llm.UserText(finalizeInstruction))
-	resp, err := mdl.Generate(ctx, llm.Request{
-		System:     system,
-		Messages:   msgs,
-		ToolChoice: "none",
-	})
-	if err != nil || resp == nil {
-		return ""
-	}
-	return strings.TrimSpace(resp.Text())
-}
-
 // wrapUpReserve is how many steps before the cap the wrap-up nudge fires,
 // overridable via GADFLY_WRAPUP_RESERVE.
 func wrapUpReserve() int {
@@ -444,7 +362,7 @@ func buildTask(diff string) string {
 	truncNote := ""
 	if maxDiff > 0 && len(diff) > maxDiff {
 		diff = diff[:maxDiff]
-		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; read the changed files (or call get_diff, if available) for the full text.]", maxDiff)
+		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; page the full diff with get_diff (paginated; pass a `path` to scope it to one file) or read the changed files.]", maxDiff)
 	}

 	var b strings.Builder
@@ -127,6 +127,27 @@ func resolveWorkerModel() (llm.Model, error) {
 	return majordomo.Parse(buildSpec(provider, spec))
 }

+// resolveSummarizerModel picks the model the context compactor uses to compress
+// the runaway middle of a transcript. It should be CHEAP, since it fires once per
+// compaction: GADFLY_COMPACT_MODEL if set (honoring GADFLY_PROVIDER for a bare
+// id), else the delegate worker model when one is configured (already cheap by
+// design), else the review model itself. Returns (nil, err) only on an explicit
+// bad GADFLY_COMPACT_MODEL spec — the caller logs it and simply runs without
+// compaction rather than failing the review.
+func resolveSummarizerModel(review, worker llm.Model) (llm.Model, error) {
+	if spec := strings.TrimSpace(os.Getenv("GADFLY_COMPACT_MODEL")); spec != "" {
+		provider := strings.TrimSpace(os.Getenv("GADFLY_PROVIDER"))
+		if provider == "" {
+			provider = defaultProvider
+		}
+		return majordomo.Parse(buildSpec(provider, spec))
+	}
+	if worker != nil {
+		return worker, nil
+	}
+	return review, nil
+}
+
 // buildSpec turns (provider, model) into a majordomo spec. A model id that
 // already carries a "provider/" prefix (or is a multi-element failover chain)
 // is passed through verbatim; a bare id is prefixed with the provider.
@@ -11,6 +11,13 @@ import (
 // than discovering them.
 const defaultRecheckMaxSteps = 16

+// defaultRecheckDiffChars caps the diff embedded in the recheck task. It is much
+// smaller than the review task's GADFLY_MAX_DIFF_CHARS: the recheck already has
+// the draft findings to verify and can pull the exact hunks it needs via the
+// paginated get_diff tool (optionally scoped to a path), so re-embedding the
+// whole diff on every recheck step is pure burn. Override: GADFLY_RECHECK_DIFF_CHARS.
+const defaultRecheckDiffChars = 20000
+
 // recheckSystemPrompt drives the second, adversarial verification pass. The
 // model is given a DRAFT review and must independently confirm each finding
 // against the real code before letting it survive — the antidote to a
@@ -84,11 +91,11 @@ func shouldRecheck(draft string) bool {
 // scrutinize, with the full diff available via get_diff (and embedded here,
 // truncated, to save a tool call).
 func buildRecheckTask(draft, diff string) string {
-	maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars)
+	maxDiff := envInt("GADFLY_RECHECK_DIFF_CHARS", defaultRecheckDiffChars)
 	truncNote := ""
 	if maxDiff > 0 && len(diff) > maxDiff {
 		diff = diff[:maxDiff]
-		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; read the changed files (or call get_diff, if available) for the full text.]", maxDiff)
+		truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff (paginated; pass a `path` to scope it to one file) or read the changed files for the rest.]", maxDiff)
 	}

 	var b strings.Builder
@@ -77,25 +77,46 @@ func fakeModel(t *testing.T, reply string) llm.Model {
 	return m
 }

-func TestRunAgent_ReturnsOutput(t *testing.T) {
+// newTestReviewExecutor builds a reviewExecutor over a fake model + repo for unit
+// tests: no compaction summarizer and no budget, so it exercises the bare agent
+// loop through the executus run kernel without any network call.
+func newTestReviewExecutor(t *testing.T, mdl llm.Model, fs *repoFS) *reviewExecutor {
+	t.Helper()
+	rex, err := newReviewExecutor(fs, mdl, nil, "mock", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	return rex
+}
+
+// testEngine wraps newTestReviewExecutor in a majordomoEngine for the
+// runSpecialists-level tests.
+func testEngine(t *testing.T, mdl llm.Model, fs *repoFS) *majordomoEngine {
+	t.Helper()
+	return &majordomoEngine{rex: newTestReviewExecutor(t, mdl, fs), mdl: mdl}
+}
+
+func TestReviewExecutor_ReturnsOutput(t *testing.T) {
 	fs, err := newRepoFS(t.TempDir(), "diff")
 	if err != nil {
 		t.Fatal(err)
 	}
 	mdl := fakeModel(t, "  corrected review: No material issues found.  ")
-	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4)
+	rex := newTestReviewExecutor(t, mdl, fs)
+	out, err := rex.run(context.Background(), "sys", "task", 4)
 	if err != nil {
-		t.Fatalf("runAgent: %v", err)
+		t.Fatalf("run: %v", err)
 	}
 	if out != "corrected review: No material issues found." {
-		t.Errorf("runAgent should return trimmed model output, got %q", out)
+		t.Errorf("run should return trimmed model output, got %q", out)
 	}
 }

-func TestRunAgent_EmptyIsError(t *testing.T) {
+func TestReviewExecutor_EmptyIsError(t *testing.T) {
 	fs, _ := newRepoFS(t.TempDir(), "diff")
 	mdl := fakeModel(t, "   ")
-	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
-		t.Error("runAgent should error on empty model output")
+	rex := newTestReviewExecutor(t, mdl, fs)
+	if _, err := rex.run(context.Background(), "sys", "task", 4); err == nil {
+		t.Error("run should error on empty model output")
 	}
 }
@@ -17,11 +17,13 @@ import (
 // every tool caps how much it can pull in one call — a runaway read_file or
 // grep would blow the window and stall the loop.
 const (
-	maxFileBytes   = 64 * 1024 // per read_file call
-	maxReadLines   = 800       // per read_file call
-	maxGrepResults = 200       // per grep call
-	maxFindResults = 200       // per find_files call
-	maxLineLen     = 400       // truncate any single returned line to this
+	maxFileBytes    = 64 * 1024 // per read_file call
+	maxReadLines    = 800       // per read_file call
+	maxGrepResults  = 200       // per grep call
+	maxFindResults  = 200       // per find_files call
+	maxLineLen      = 400       // truncate any single returned line to this
+	maxGetDiffLines = 800       // per get_diff call (paginated window)
+	maxGetDiffBytes = 64 * 1024 // per get_diff call
 )

 // skipDirs are never descended into by grep / find_files — noise and bulk that
@@ -396,15 +398,86 @@ func (r *repoFS) findFilesTool() llm.Tool {
 	)
 }

+type getDiffArgs struct {
+	Path      string `json:"path,omitempty" description:"Optional changed-file path (e.g. pkg/foo/bar.go); returns ONLY that file's diff hunks. Omit for the whole diff. Use this on a large PR to pull just the file a finding is about."`
+	StartLine int    `json:"start_line,omitempty" description:"Optional 1-based line to start from within the (whole or path-scoped) diff (default 1)."`
+	Limit     int    `json:"limit,omitempty" description:"Optional max number of diff lines to return (default/maximum 800)."`
+}
+
 func (r *repoFS) getDiffTool() llm.Tool {
-	return llm.DefineTool[struct{}](
+	return llm.DefineTool[getDiffArgs](
 		"get_diff",
-		"Return the complete unified diff under review. The diff is also included (possibly truncated) in the task message; call this to get the full, untruncated text.",
-		func(_ context.Context, _ struct{}) (any, error) {
-			if strings.TrimSpace(r.diff) == "" {
+		"Return the unified diff under review as a numbered, PAGINATED window (like read_file) — not the whole diff at once, so a huge PR can't blow the context window. Pass `path` to fetch just one changed file's hunks, or `start_line`/`limit` to page through. A truncated copy of the diff is also embedded in the task message.",
+		func(_ context.Context, args getDiffArgs) (any, error) {
+			diff := r.diff
+			scope := "the diff"
+			if p := strings.TrimSpace(args.Path); p != "" {
+				diff = diffForPath(r.diff, p)
+				if strings.TrimSpace(diff) == "" {
+					return fmt.Sprintf("(no diff hunks for path %q; check it against the changed-files list — the path must match a file the diff touches)", p), nil
+				}
+				scope = "the diff for " + p
+			}
+			if strings.TrimSpace(diff) == "" {
 				return "(empty diff)", nil
 			}
-			return r.diff, nil
+			return windowDiff(diff, scope, args.StartLine, args.Limit), nil
 		},
 	)
 }
+
+// windowDiff returns a numbered, paginated slice of a unified diff, mirroring
+// read_file's caps (maxGetDiffLines / maxGetDiffBytes / maxLineLen) so a single
+// get_diff call can never dump a multi-hundred-KB diff into the transcript — the
+// amplifier behind the large-PR token burn. The full diff stays reachable by
+// paging with start_line, or scoped per file via the path arg.
+func windowDiff(diff, scope string, start, limit int) string {
+	lines := strings.Split(diff, "\n")
+	total := len(lines)
+	if start < 1 {
+		start = 1
+	}
+	if limit <= 0 || limit > maxGetDiffLines {
+		limit = maxGetDiffLines
+	}
+	if start > total {
+		return fmt.Sprintf("(%s has %d lines; nothing at/after line %d)", scope, total, start)
+	}
+	var b strings.Builder
+	emitted := 0
+	i := start - 1
+	for ; i < total; i++ {
+		if emitted >= limit || b.Len() >= maxGetDiffBytes {
+			break
+		}
+		line := lines[i]
+		if len(line) > maxLineLen {
+			line = line[:maxLineLen] + "…"
+		}
+		fmt.Fprintf(&b, "%d\t%s\n", i+1, line)
+		emitted++
+	}
+	if i < total {
+		fmt.Fprintf(&b, "... (%s truncated at line %d of %d; call get_diff again with start_line=%d for more, or pass a `path` to scope to one file)\n", scope, i, total, i+1)
+	}
+	return b.String()
+}
+
+// diffForPath extracts the unified-diff section for one changed file: from the
+// `diff --git` header that names the path through to the next `diff --git`
+// header (or end). Matching is a substring on the header line, so either side
+// (a/<path> or b/<path>) hits, including a rename's two paths.
+func diffForPath(diff, path string) string {
+	var b strings.Builder
+	inSection := false
+	for _, ln := range strings.Split(diff, "\n") {
+		if strings.HasPrefix(ln, "diff --git ") {
+			inSection = strings.Contains(ln, path)
+		}
+		if inSection {
+			b.WriteString(ln)
+			b.WriteByte('\n')
+		}
+	}
+	return b.String()
+}
@@ -197,15 +197,69 @@ func TestFindFilesTool(t *testing.T) {

 func TestGetDiffTool(t *testing.T) {
 	root := buildFixtureRepo(t)
-	const diff = "diff --git a/x b/x\n+added line\n"
+	const diff = "diff --git a/x b/x\n--- a/x\n+++ b/x\n+added line\n" +
+		"diff --git a/y.go b/y.go\n--- a/y.go\n+++ b/y.go\n+y change\n"
 	fs, _ := newRepoFS(root, diff)

+	// Default: the whole diff as a NUMBERED window (paginated), not a raw dump —
+	// so a huge PR can't be poured into the transcript in one call.
+	out, err := call(t, fs, "get_diff", map[string]any{})
+	if err != nil {
+		t.Fatalf("get_diff: %v", err)
+	}
+	if !strings.Contains(out, "1\tdiff --git a/x b/x") {
+		t.Errorf("get_diff should return a numbered window, got:\n%s", out)
+	}
+	if !strings.Contains(out, "+added line") || !strings.Contains(out, "+y change") {
+		t.Errorf("the full (short) diff window should include every hunk, got:\n%s", out)
+	}
+
+	// path filter: only the named file's hunks come back.
+	out, err = call(t, fs, "get_diff", map[string]any{"path": "y.go"})
+	if err != nil {
+		t.Fatalf("get_diff path: %v", err)
+	}
+	if !strings.Contains(out, "y change") {
+		t.Errorf("get_diff path=y.go should include y's hunk, got:\n%s", out)
+	}
+	if strings.Contains(out, "added line") {
+		t.Errorf("get_diff path=y.go must NOT include x's hunk, got:\n%s", out)
+	}
+
+	// unknown path: a clear note, never an error.
+	out, err = call(t, fs, "get_diff", map[string]any{"path": "nope.txt"})
+	if err != nil {
+		t.Fatalf("get_diff unknown path: %v", err)
+	}
+	if !strings.Contains(out, "no diff hunks") {
+		t.Errorf("get_diff for an unknown path should note no hunks, got:\n%s", out)
+	}
+}
+
+// TestGetDiffTool_Paginates: a diff longer than the per-call line cap is returned
+// as a truncated window with a paging hint, and start_line pages past it — the
+// mechanism that stops get_diff from dumping a multi-hundred-KB diff at once.
+func TestGetDiffTool_Paginates(t *testing.T) {
+	diff := "diff --git a/big b/big\n" + strings.Repeat("+line\n", maxGetDiffLines+50)
+	fs, _ := newRepoFS(t.TempDir(), diff)
+
 	out, err := call(t, fs, "get_diff", map[string]any{})
 	if err != nil {
 		t.Fatalf("get_diff: %v", err)
 	}
-	if out != diff {
-		t.Errorf("get_diff returned %q, want %q", out, diff)
+	if !strings.Contains(out, "truncated at line") {
+		t.Error("a diff longer than the per-call cap should be truncated with a paging hint")
+	}
+	if strings.Contains(out, "801\t") {
+		t.Error("the first window must stop at the line cap, not reach line 801")
+	}
+
+	out, err = call(t, fs, "get_diff", map[string]any{"start_line": 805})
+	if err != nil {
+		t.Fatalf("get_diff page: %v", err)
+	}
+	if !strings.Contains(out, "805\t") {
+		t.Error("paging with start_line=805 should include line 805")
 	}
 }

@@ -37,10 +37,11 @@ func lastUserText(req llm.Request) string {
 	return req.Messages[len(req.Messages)-1].Text()
 }

-// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until
-// it is nudged to wrap up should still finish inside its budget — the steer
-// message arrives a few steps before the cap and the model writes its answer.
-func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
+// TestReviewExecutor_WrapUpNudgeProducesAnswer: a model that keeps calling tools
+// until it is nudged to wrap up should still finish inside its budget — the steer
+// message (delivered by the executus wrap-up critic a few steps before the cap)
+// arrives and the model writes its answer.
+func TestReviewExecutor_WrapUpNudgeProducesAnswer(t *testing.T) {
 	t.Setenv("GADFLY_WRAPUP_RESERVE", "4")

 	final := "VERDICT: No material issues found."
@@ -60,9 +61,10 @@ func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
 	}
 	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")

-	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12)
+	rex := newTestReviewExecutor(t, mdl, fs)
+	out, err := rex.run(context.Background(), "sys", "task", 12)
 	if err != nil {
-		t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err)
+		t.Fatalf("run should succeed via wrap-up nudge, got error: %v", err)
 	}
 	if out != final {
 		t.Errorf("expected final review %q, got %q", final, out)
@@ -72,24 +74,19 @@ func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) {
 	}
 }

-// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and
-// spins on tools until the cap should NOT hard-fail — the tool-free finalization
-// pass forces a final answer out of the transcript.
-func TestRunAgent_FinalizationFallback(t *testing.T) {
+// TestReviewExecutor_ExhaustionWithoutAnswerIsError: a model that ignores the
+// wrap-up nudge and spins on tools until the step cap produces no final answer.
+// The transcript-based forced-finalization fallback was removed in the executus
+// re-platform (run.Result does not expose the loop transcript), so the pass now
+// surfaces an error — which reviewWithSpecialist renders as an advisory "reviewer
+// failed to complete" notice rather than a phantom success.
+func TestReviewExecutor_ExhaustionWithoutAnswerIsError(t *testing.T) {
 	t.Setenv("GADFLY_WRAPUP_RESERVE", "2")

-	final := "VERDICT: Minor issues\n- something"
-	forcedCalled := false
 	n := 0
-	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
-		// Only the tool-free finalization pass forbids tools — reply there.
-		if req.ToolChoice == "none" {
-			forcedCalled = true
-			return fake.Reply(final)
-		}
-		// Otherwise keep spinning, ignoring the wrap-up nudge entirely.
+	p := fake.New("fake", fake.WithDefault(func(_ string, _ llm.Request) fake.Step {
 		n++
-		return spinToolCall(n)
+		return spinToolCall(n) // spin forever, ignoring the wrap-up nudge
 	}))
 	mdl, err := p.Model("mock")
 	if err != nil {
@@ -97,37 +94,9 @@ func TestRunAgent_FinalizationFallback(t *testing.T) {
 	}
 	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")

-	out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6)
-	if err != nil {
-		t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err)
-	}
-	if !forcedCalled {
-		t.Error("finalization fallback was never invoked")
-	}
-	if out != final {
-		t.Errorf("expected forced final answer %q, got %q", final, out)
-	}
-}
-
-// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization
-// yields nothing, runAgent surfaces an error rather than a phantom success.
-func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) {
-	n := 0
-	p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step {
-		if req.ToolChoice == "none" {
-			return fake.Reply("   ") // finalization produces only whitespace
-		}
-		n++
-		return spinToolCall(n)
-	}))
-	mdl, err := p.Model("mock")
-	if err != nil {
-		t.Fatal(err)
-	}
-	fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n")
-
-	if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil {
-		t.Error("runAgent should error when the finalization fallback also yields no output")
+	rex := newTestReviewExecutor(t, mdl, fs)
+	if _, err := rex.run(context.Background(), "sys", "task", 6); err == nil {
+		t.Error("run should error when the model exhausts its steps without an answer")
 	}
 }