2026-06-30 15:41:03 +00:00 · 2026-06-30 15:10:28 +00:00 · 2026-06-30 15:10:28 +00:00 · 2026-06-30 15:10:28 +00:00 · 2026-06-30 15:10:28 +00:00 · 2026-06-30 15:10:28 +00:00
8 changed files with 176 additions and 71 deletions
1
				
			
			
				
			
			
			
				@@ -56,6 +56,11 @@ const (
 	// compactor defaults; surfaced as gadfly env knobs for tuning.
 	defaultCompactKeepRecent   = 8
 	defaultCompactSummaryWords = 200
+	// contextTokenLookupTimeout bounds the one-shot /api/show call that resolves a
+	// cloud model's context window at executor-build time. Kept short so a slow or
+	// unreachable endpoint adds at most this to startup before degrading to
+	// no-compaction (rather than the provider cache's default 15s).
+	contextTokenLookupTimeout = 5 * time.Second
 )

 // runSeq mints a unique-per-process RunID suffix for each executor run so audit
1
				
			
			
				
			
			
			
				@@ -79,10 +84,7 @@ func (w wrappedTool) BuildLLM(_ exectool.Invocation) llm.Tool { return w.t }
 // returns it along with the tool names for RunnableAgent.LowLevelTools.
 func gadflyToolRegistry(fs *repoFS) (exectool.Registry, []string, error) {
 	reg := exectool.NewRegistry()
-	tools := fs.fsTools()
-	if fs.worker != nil {
-		tools = append(tools, fs.delegateTool())
-	}
+	tools := fs.allTools()
 	names := make([]string, 0, len(tools))
 	for _, t := range tools {
 		if err := reg.Register(wrappedTool{t: t}); err != nil {
@@ -100,6 +102,11 @@ func gadflyToolRegistry(fs *repoFS) (exectool.Registry, []string, error) {
 // the engine calls addUsage after each pass with run.Result.Usage. A nil
 // *gadflyBudget is never installed — caps of 0 mean "unlimited", so the port is
 // only wired when at least one cap is set.
+//
+// The guard is PASS-granular: Check runs before each pass, so it stops the NEXT
+// pass once the budget is spent but cannot abort a single runaway pass mid-flight.
+// The swarm-wide GADFLY_PR_BUDGET_SECS wall-clock backstop (entrypoint.sh) is what
+// bounds a mid-pass runaway.
 type gadflyBudget struct {
 	mu         sync.Mutex
 	maxTokens  int64
1
				
			
			
				
			
			
			
				@@ -254,8 +261,10 @@ func newReviewExecutor(fs *repoFS, mdl, summarizer llm.Model, modelSpec string,
 		Compactor:     compactor,
 		ContextTokens: ctxTokens,
 		Defaults: exrun.Defaults{
-			MaxIterations:            envInt("GADFLY_MAX_STEPS", defaultMaxSteps),
-			MaxRuntime:               time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second,
+			// MaxIterations/MaxRuntime are intentionally omitted: every pass sets its
+			// own per-run cap on the RunnableAgent below (the review and recheck caps
+			// differ), so a Defaults value here would always be overridden — dead. This
+			// leaves only the cross-pass guards + the compaction ratio.
 			MaxConsecutiveToolErrors: 4,
 			MaxSameToolCallRepeats:   4,
 			CompactionThresholdRatio: compactRatio(),
1
				
			
			
				
			
			
			
				@@ -282,7 +291,7 @@ func (r *reviewExecutor) run(ctx context.Context, system, task string, maxSteps
 		SystemPrompt:  system,
 		ModelTier:     r.modelSpec,
 		MaxIterations: maxSteps,
-		MaxRuntime:    time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second,
+		MaxRuntime:    reviewTimeout(),
 		LowLevelTools: r.toolNames,
 		Critic:        exrun.CriticConfig{Enabled: true},
 	}, exectool.Invocation{
1
				
			
			
				
			
			
			
				@@ -316,14 +325,7 @@ func prCallerID() string {

 // compactionEnabled reports whether context compaction should be wired. On
 // unless GADFLY_COMPACT is explicitly falsey.
-func compactionEnabled() bool {
-	switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_COMPACT"))) {
-	case "0", "false", "no", "off":
-		return false
-	default:
-		return true
-	}
-}
+func compactionEnabled() bool { return envBool("GADFLY_COMPACT", true) }

 // compactRatio is the compaction threshold as a fraction of the model context
 // window (GADFLY_COMPACT_RATIO), clamped to (0,1]; default defaultCompactRatio.
@@ -355,10 +357,15 @@ func resolveContextTokens(modelSpec string) int {
 		key = strings.TrimSpace(os.Getenv("OLLAMA_API_KEY"))
 	}
 	cache := model.NewCloudOllamaLimitCache("", key, nil)
-	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), contextTokenLookupTimeout)
 	defer cancel()
 	if n, ok := model.MaxContextTokensResolving(ctx, modelSpec, cache); ok {
 		return n
 	}
+	// Unknown model or a failed lookup (e.g. no key / unreachable endpoint): don't
+	// guess — compaction is disabled. Log it so a misconfiguration is debuggable
+	// rather than silently dropping the protection. Set GADFLY_MODEL_CONTEXT_TOKENS
+	// to force a window for an endpoint executus can't introspect.
+	fmt.Fprintf(os.Stderr, "gadfly: no context window resolved for %q; compaction disabled (set GADFLY_MODEL_CONTEXT_TOKENS to enable it)\n", modelSpec)
 	return 0
 }
@@ -39,10 +39,9 @@
 //	GADFLY_TITLE          PR title (optional).
 //	GADFLY_BODY           PR description (optional).
 //	GADFLY_MAX_STEPS      review-pass step cap (optional, default 24).
-//	GADFLY_WRAPUP_RESERVE steps before the cap at which the agent is told to
-//	                        stop investigating and write its answer (optional,
-//	                        default 4). Plus a tool-free finalization fallback
-//	                        guarantees a step-exhausted pass still emits output.
+//	GADFLY_WRAPUP_RESERVE steps before the cap at which the wrap-up critic nudges
+//	                        the agent to stop investigating and write its answer
+//	                        (optional, default 4).
 //	GADFLY_RECHECK        set to 0/false to skip the recheck pass (optional, default on).
 //	GADFLY_RECHECK_MAX_STEPS recheck-pass step cap (optional, default 16).
 //	GADFLY_TIMEOUT_SECS   overall deadline in seconds, shared by both passes (optional, default 300).
@@ -55,8 +54,8 @@
 //	                        lanes as GADFLY_PROVIDER_CONCURRENCY (e.g.
 //	                        "ollama-cloud=3,m1=1"). Wins over GADFLY_LENS_CONCURRENCY
 //	                        for the model's provider; falls back to it otherwise.
-//	GADFLY_MAX_DIFF_CHARS diff chars embedded in the prompt (optional, default 60000;
-//	                        the full diff is always available via the get_diff tool).
+//	GADFLY_MAX_DIFF_CHARS diff chars embedded in the review prompt (optional, default 60000;
+//	                        the full diff is reachable via the paginated get_diff tool).
 //
 // On success it prints the review to stdout and exits 0. On a usage/config or
 // model error it prints a diagnostic to stderr and exits non-zero; run.sh then
@@ -322,8 +321,7 @@ func providerOverride(envName, provider string) (int, bool) {
 // returned bool is true when the review pass failed (rendered as an inline
 // notice — advisory; one lens failing never sinks the others or the job).
 func reviewWithSpecialist(eng reviewEngine, base string, sp Specialist, task, diff string) (string, bool) {
-	timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	ctx, cancel := context.WithTimeout(context.Background(), reviewTimeout())
 	defer cancel()

 	draft, err := eng.runPass(ctx, composeSpecialistPrompt(base, sp), task,
@@ -389,3 +387,23 @@ func envInt(name string, def int) int {
 	}
 	return n
 }
+
+// envBool reads a boolean-ish env var: def when unset, false for an explicit
+// falsey value (0/false/no/off), true otherwise. The shared spelling for
+// gadfly's "on unless disabled" opt-out flags (GADFLY_RECHECK, GADFLY_COMPACT).
+func envBool(name string, def bool) bool {
+	switch strings.ToLower(strings.TrimSpace(os.Getenv(name))) {
+	case "":
+		return def
+	case "0", "false", "no", "off":
+		return false
+	default:
+		return true
+	}
+}
+
+// reviewTimeout is the per-specialist-lens deadline (GADFLY_TIMEOUT_SECS), shared
+// across a lens's review+recheck passes and applied as each pass's run cap.
+func reviewTimeout() time.Duration {
+	return time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
+}
@@ -2,7 +2,6 @@ package main

 import (
 	"fmt"
-	"os"
 	"strings"
 )

@@ -65,14 +64,7 @@ Output rules:

 // recheckEnabled reports whether the verification pass should run. On unless
 // GADFLY_RECHECK is explicitly a falsey value.
-func recheckEnabled() bool {
-	switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_RECHECK"))) {
-	case "0", "false", "no", "off":
-		return false
-	default:
-		return true
-	}
-}
+func recheckEnabled() bool { return envBool("GADFLY_RECHECK", true) }

 // shouldRecheck decides whether to run the verification pass for a given draft.
 // A clean "no material issues" draft has nothing to verify, so it is skipped
@@ -49,7 +49,7 @@ func TestRecheckEnabled(t *testing.T) {
 }

 func TestBuildRecheckTask(t *testing.T) {
-	t.Setenv("GADFLY_MAX_DIFF_CHARS", "")
+	t.Setenv("GADFLY_RECHECK_DIFF_CHARS", "")
 	draft := "VERDICT: Blocking issues found\n- foo.go:1 broken"
 	out := buildRecheckTask(draft, "diff --git a/x b/x\n+y\n")
 	if !strings.Contains(out, draft) {
@@ -9,6 +9,7 @@ import (
 	"regexp"
 	"sort"
 	"strings"
+	"sync"

 	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
 )
@@ -42,6 +43,12 @@ type repoFS struct {
 	root   string    // absolute, symlink-resolved repo root
 	diff   string    // the full PR unified diff (served by get_diff)
 	worker llm.Model // optional cheap model for delegate_investigation; nil = no delegation
+
+	// diffLines caches the split diff so paging through get_diff doesn't re-split
+	// the whole (possibly large) diff on every call. The diff is immutable, so the
+	// cache is computed once and safe to share across concurrent lenses.
+	diffOnce  sync.Once
+	diffLines []string
 }

 // newRepoFS resolves root to an absolute, symlink-free path.
@@ -110,15 +117,24 @@ func (r *repoFS) fsTools() []llm.Tool {
 	}
 }

-// toolbox builds the reviewer's toolbox: the read-only repo tools, plus the
-// delegate_investigation tool when a worker model is configured.
-func (r *repoFS) toolbox() (*llm.Toolbox, error) {
-	box := llm.NewToolbox("gadfly")
+// allTools is the single source of truth for the reviewer's tool set: the
+// read-only repo tools, plus delegate_investigation when a worker model is
+// configured. Both the executus registry (gadflyToolRegistry, the production
+// path) and toolbox() build from this list.
+func (r *repoFS) allTools() []llm.Tool {
 	tools := r.fsTools()
 	if r.worker != nil {
 		tools = append(tools, r.delegateTool())
 	}
-	for _, t := range tools {
+	return tools
+}
+
+// toolbox builds a majordomo toolbox from allTools(). The production review path
+// now goes through executus's tool.Registry (see executus.go); this remains for
+// the toolbox-level tests (the `call` helper).
+func (r *repoFS) toolbox() (*llm.Toolbox, error) {
+	box := llm.NewToolbox("gadfly")
+	for _, t := range r.allTools() {
 		if err := box.Add(t); err != nil {
 			return nil, fmt.Errorf("add tool %q: %w", t.Name, err)
 		}
@@ -409,30 +425,49 @@ func (r *repoFS) getDiffTool() llm.Tool {
 		"get_diff",
 		"Return the unified diff under review as a numbered, PAGINATED window (like read_file) — not the whole diff at once, so a huge PR can't blow the context window. Pass `path` to fetch just one changed file's hunks, or `start_line`/`limit` to page through. A truncated copy of the diff is also embedded in the task message.",
 		func(_ context.Context, args getDiffArgs) (any, error) {
-			diff := r.diff
 			scope := "the diff"
+			var lines []string
 			if p := strings.TrimSpace(args.Path); p != "" {
-				diff = diffForPath(r.diff, p)
-				if strings.TrimSpace(diff) == "" {
+				lines = diffLinesForPath(r.diff, p)
+				if len(lines) == 0 {
 					return fmt.Sprintf("(no diff hunks for path %q; check it against the changed-files list — the path must match a file the diff touches)", p), nil
 				}
 				scope = "the diff for " + p
+			} else {
+				lines = r.diffAllLines()
+				if len(lines) == 0 {
+					return "(empty diff)", nil
+				}
 			}
-			if strings.TrimSpace(diff) == "" {
-				return "(empty diff)", nil
-			}
-			return windowDiff(diff, scope, args.StartLine, args.Limit), nil
+			return windowDiff(lines, scope, args.StartLine, args.Limit), nil
 		},
 	)
 }

-// windowDiff returns a numbered, paginated slice of a unified diff, mirroring
-// read_file's caps (maxGetDiffLines / maxGetDiffBytes / maxLineLen) so a single
-// get_diff call can never dump a multi-hundred-KB diff into the transcript — the
-// amplifier behind the large-PR token burn. The full diff stays reachable by
-// paging with start_line, or scoped per file via the path arg.
-func windowDiff(diff, scope string, start, limit int) string {
+// diffAllLines returns the whole diff split into lines, cached so paging never
+// re-splits the (possibly large) diff.
+func (r *repoFS) diffAllLines() []string {
+	r.diffOnce.Do(func() { r.diffLines = splitDiffLines(r.diff) })
+	return r.diffLines
+}
+
+// splitDiffLines splits a unified diff into lines, dropping the single trailing
+// empty element a trailing newline produces — otherwise windowDiff would emit a
+// blank final line and over-count the total by one.
+func splitDiffLines(diff string) []string {
 	lines := strings.Split(diff, "\n")
+	if n := len(lines); n > 0 && lines[n-1] == "" {
+		lines = lines[:n-1]
+	}
+	return lines
+}
+
+// windowDiff returns a numbered, paginated slice of pre-split diff lines,
+// mirroring read_file's caps (maxGetDiffLines / maxGetDiffBytes / maxLineLen) so
+// a single get_diff call can never dump a multi-hundred-KB diff into the
+// transcript — the amplifier behind the large-PR token burn. The full diff stays
+// reachable by paging with start_line, or scoped per file via the path arg.
+func windowDiff(lines []string, scope string, start, limit int) string {
 	total := len(lines)
 	if start < 1 {
 		start = 1
@@ -458,26 +493,45 @@ func windowDiff(diff, scope string, start, limit int) string {
 		emitted++
 	}
 	if i < total {
-		fmt.Fprintf(&b, "... (%s truncated at line %d of %d; call get_diff again with start_line=%d for more, or pass a `path` to scope to one file)\n", scope, i, total, i+1)
+		// i (0-based) is the first line NOT emitted; line i was the last shown.
+		fmt.Fprintf(&b, "... (%s truncated after line %d of %d; call get_diff again with start_line=%d for the rest, or pass a `path` to scope to one file)\n", scope, i, total, i+1)
 	}
 	return b.String()
 }

-// diffForPath extracts the unified-diff section for one changed file: from the
-// `diff --git` header that names the path through to the next `diff --git`
-// header (or end). Matching is a substring on the header line, so either side
-// (a/<path> or b/<path>) hits, including a rename's two paths.
-func diffForPath(diff, path string) string {
-	var b strings.Builder
+// diffLinesForPath returns the unified-diff lines for one changed file: from the
+// `diff --git` header that names path through to the next header (or end). The
+// header names two path tokens (a/<old> b/<new>); a match is on a WHOLE token,
+// so path "foo.go" does not pull in "barfoo.go" — and a trailing "/" scopes to a
+// directory (e.g. "pkg/foo/" matches pkg/foo/bar.go).
+func diffLinesForPath(diff, path string) []string {
+	want := strings.TrimPrefix(strings.TrimPrefix(strings.TrimSpace(path), "a/"), "b/")
+	var out []string
 	inSection := false
-	for _, ln := range strings.Split(diff, "\n") {
+	for _, ln := range splitDiffLines(diff) {
 		if strings.HasPrefix(ln, "diff --git ") {
-			inSection = strings.Contains(ln, path)
+			inSection = diffHeaderNames(ln, want)
 		}
 		if inSection {
-			b.WriteString(ln)
-			b.WriteByte('\n')
+			out = append(out, ln)
 		}
 	}
-	return b.String()
+	return out
+}
+
+// diffHeaderNames reports whether a `diff --git a/X b/Y` header names want as one
+// of its (a/b-stripped) path tokens — exact whole-token match, or a directory
+// prefix when want ends with "/".
+func diffHeaderNames(header, want string) bool {
+	fields := strings.Fields(header)
+	if len(fields) < 3 {
+		return false
+	}
+	for _, f := range fields[2:] {
+		p := strings.TrimPrefix(strings.TrimPrefix(f, "a/"), "b/")
+		if p == want || (strings.HasSuffix(want, "/") && strings.HasPrefix(p, want)) {
+			return true
+		}
+	}
+	return false
 }
@@ -247,7 +247,7 @@ func TestGetDiffTool_Paginates(t *testing.T) {
 	if err != nil {
 		t.Fatalf("get_diff: %v", err)
 	}
-	if !strings.Contains(out, "truncated at line") {
+	if !strings.Contains(out, "truncated after line") {
 		t.Error("a diff longer than the per-call cap should be truncated with a paging hint")
 	}
 	if strings.Contains(out, "801\t") {
@@ -263,6 +263,30 @@ func TestGetDiffTool_Paginates(t *testing.T) {
 	}
 }

+// TestDiffLinesForPath_Anchored: get_diff path= matches on a WHOLE path token,
+// so "foo.go" never pulls in "barfoo.go" (the unanchored-substring weakness the
+// swarm flagged), while a trailing "/" still scopes to a directory.
+func TestDiffLinesForPath_Anchored(t *testing.T) {
+	diff := "diff --git a/foo.go b/foo.go\n+foo change\n" +
+		"diff --git a/barfoo.go b/barfoo.go\n+barfoo change\n" +
+		"diff --git a/pkg/x.go b/pkg/x.go\n+x change\n"
+
+	joined := strings.Join(diffLinesForPath(diff, "foo.go"), "\n")
+	if !strings.Contains(joined, "foo change") {
+		t.Errorf("foo.go should match its own hunk:\n%s", joined)
+	}
+	if strings.Contains(joined, "barfoo change") {
+		t.Errorf("foo.go must NOT match barfoo.go (unanchored substring regression):\n%s", joined)
+	}
+
+	if !strings.Contains(strings.Join(diffLinesForPath(diff, "pkg/"), "\n"), "x change") {
+		t.Error("a trailing-slash path should scope to the directory (pkg/ -> pkg/x.go)")
+	}
+	if len(diffLinesForPath(diff, "nope.go")) != 0 {
+		t.Error("an unknown path should yield no lines")
+	}
+}
+
 func TestNewRepoFS_BadRoot(t *testing.T) {
 	// A file (not a directory) is rejected.
 	f := filepath.Join(t.TempDir(), "afile")
@@ -205,7 +205,9 @@ if [ "$HUGE_DIFF_BYTES" -gt 0 ] 2>/dev/null; then
    export GADFLY_MAX_STEPS="${GADFLY_HUGE_DIFF_MAX_STEPS:-12}"
    export GADFLY_RECHECK_MAX_STEPS="${GADFLY_HUGE_DIFF_RECHECK_MAX_STEPS:-8}"
    export GADFLY_RECHECK="${GADFLY_HUGE_DIFF_RECHECK:-0}"               # skip recheck on huge PRs
-    export MAX_DIFF_CHARS="${GADFLY_HUGE_DIFF_MAX_DIFF_CHARS:-20000}"    # run.sh -> GADFLY_MAX_DIFF_CHARS
+    # The Go-visible name directly (run.sh prefers GADFLY_MAX_DIFF_CHARS over its
+    # own MAX_DIFF_CHARS), so the cap is honored without relying on run.sh's alias.
+    export GADFLY_MAX_DIFF_CHARS="${GADFLY_HUGE_DIFF_MAX_DIFF_CHARS:-20000}"
    # Surfaced on each posted comment so the shallower review is self-explaining.
    export GADFLY_NOTICE="⚠️ Large PR (${PR_DIFF_BYTES} bytes): Gadfly downshifted to a focused, single-model review to stay within budget — coverage is intentionally shallower. Consider splitting the PR for a deeper review."
  fi
@@ -333,7 +335,7 @@ fi
 # findings were gathered are still posted and the job never fails (advisory).
 # GADFLY_PR_BUDGET_SECS=0 (default) disables it.
 KILLER_PID=""
-rm -f "${WORKDIR}/.budget_killed" 2>/dev/null || true
+rm -f "${WORKDIR}/.budget_killed" "${WORKDIR}/.disarmed" 2>/dev/null || true
 if [ "${GADFLY_PR_BUDGET_SECS:-0}" -gt 0 ] 2>/dev/null; then
  (
    sleep "${GADFLY_PR_BUDGET_SECS}"
@@ -342,7 +344,10 @@ if [ "${GADFLY_PR_BUDGET_SECS:-0}" -gt 0 ] 2>/dev/null; then
    pkill -TERM -f '/usr/local/bin/gadfly' 2>/dev/null || true
    pkill -TERM -f "${SCRIPTS_DIR}/run.sh" 2>/dev/null || true
    sleep 5
-    pkill -KILL -f '/usr/local/bin/gadfly' 2>/dev/null || true
+    # Guard the delayed SIGKILL on the disarm marker: once the lanes finished and
+    # the watchdog was disarmed, the consolidation gadfly pass runs next, and a
+    # name-based KILL here must NOT catch it.
+    [ -f "${WORKDIR}/.disarmed" ] || pkill -KILL -f '/usr/local/bin/gadfly' 2>/dev/null || true
  ) &
  KILLER_PID=$!
  log "PR budget watchdog armed (${GADFLY_PR_BUDGET_SECS}s, pid ${KILLER_PID})"
1
				
			
			
				
			
			
			
				@@ -360,8 +365,13 @@ done
 [ "${#LANE_PIDS[@]}" -gt 0 ] && wait "${LANE_PIDS[@]}"

 # Reviews finished (or the watchdog killed them): disarm the watchdog so its
-# delayed SIGKILL can't catch the consolidation pass that runs next.
-if [ -n "$KILLER_PID" ]; then kill "$KILLER_PID" 2>/dev/null || true; fi
+# delayed SIGKILL can't catch the consolidation pass that runs next. Drop the
+# disarm marker FIRST so even a racing watchdog that already reached its KILL line
+# skips it (the kill below also tears the watchdog subshell down during its sleep).
+if [ -n "$KILLER_PID" ]; then
+  : > "${WORKDIR}/.disarmed"
+  kill "$KILLER_PID" 2>/dev/null || true
+fi

 # If the backstop fired, note it on the consensus comment (per-model comments
 # were already posted during the run; a killed model surfaces as a failed lane).
@@ -177,7 +177,7 @@ case "$PROVIDER" in
          GADFLY_SYSTEM_FILE="${SCRIPT_DIR}/system-prompt.txt" \
          GADFLY_TITLE="$TITLE" \
          GADFLY_BODY="$BODY" \
-          GADFLY_MAX_DIFF_CHARS="$MAX_DIFF_CHARS" \
+          GADFLY_MAX_DIFF_CHARS="${GADFLY_MAX_DIFF_CHARS:-$MAX_DIFF_CHARS}" \
          GADFLY_STATUS_FILE="${GADFLY_STATUS_FILE:-}" \
          GADFLY_FINDINGS_OUT="${GADFLY_FINDINGS_OUT:-}" \
          "$BIN" 2>"$ERR_FILE"