diff --git a/cmd/gadfly/executus.go b/cmd/gadfly/executus.go index 41078e5..3f0b04e 100644 --- a/cmd/gadfly/executus.go +++ b/cmd/gadfly/executus.go @@ -56,6 +56,11 @@ const ( // compactor defaults; surfaced as gadfly env knobs for tuning. defaultCompactKeepRecent = 8 defaultCompactSummaryWords = 200 + // contextTokenLookupTimeout bounds the one-shot /api/show call that resolves a + // cloud model's context window at executor-build time. Kept short so a slow or + // unreachable endpoint adds at most this to startup before degrading to + // no-compaction (rather than the provider cache's default 15s). + contextTokenLookupTimeout = 5 * time.Second ) // runSeq mints a unique-per-process RunID suffix for each executor run so audit @@ -79,10 +84,7 @@ func (w wrappedTool) BuildLLM(_ exectool.Invocation) llm.Tool { return w.t } // returns it along with the tool names for RunnableAgent.LowLevelTools. func gadflyToolRegistry(fs *repoFS) (exectool.Registry, []string, error) { reg := exectool.NewRegistry() - tools := fs.fsTools() - if fs.worker != nil { - tools = append(tools, fs.delegateTool()) - } + tools := fs.allTools() names := make([]string, 0, len(tools)) for _, t := range tools { if err := reg.Register(wrappedTool{t: t}); err != nil { @@ -100,6 +102,11 @@ func gadflyToolRegistry(fs *repoFS) (exectool.Registry, []string, error) { // the engine calls addUsage after each pass with run.Result.Usage. A nil // *gadflyBudget is never installed — caps of 0 mean "unlimited", so the port is // only wired when at least one cap is set. +// +// The guard is PASS-granular: Check runs before each pass, so it stops the NEXT +// pass once the budget is spent but cannot abort a single runaway pass mid-flight. +// The swarm-wide GADFLY_PR_BUDGET_SECS wall-clock backstop (entrypoint.sh) is what +// bounds a mid-pass runaway. type gadflyBudget struct { mu sync.Mutex maxTokens int64 @@ -254,8 +261,10 @@ func newReviewExecutor(fs *repoFS, mdl, summarizer llm.Model, modelSpec string, Compactor: compactor, ContextTokens: ctxTokens, Defaults: exrun.Defaults{ - MaxIterations: envInt("GADFLY_MAX_STEPS", defaultMaxSteps), - MaxRuntime: time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second, + // MaxIterations/MaxRuntime are intentionally omitted: every pass sets its + // own per-run cap on the RunnableAgent below (the review and recheck caps + // differ), so a Defaults value here would always be overridden — dead. This + // leaves only the cross-pass guards + the compaction ratio. MaxConsecutiveToolErrors: 4, MaxSameToolCallRepeats: 4, CompactionThresholdRatio: compactRatio(), @@ -282,7 +291,7 @@ func (r *reviewExecutor) run(ctx context.Context, system, task string, maxSteps SystemPrompt: system, ModelTier: r.modelSpec, MaxIterations: maxSteps, - MaxRuntime: time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second, + MaxRuntime: reviewTimeout(), LowLevelTools: r.toolNames, Critic: exrun.CriticConfig{Enabled: true}, }, exectool.Invocation{ @@ -316,14 +325,7 @@ func prCallerID() string { // compactionEnabled reports whether context compaction should be wired. On // unless GADFLY_COMPACT is explicitly falsey. -func compactionEnabled() bool { - switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_COMPACT"))) { - case "0", "false", "no", "off": - return false - default: - return true - } -} +func compactionEnabled() bool { return envBool("GADFLY_COMPACT", true) } // compactRatio is the compaction threshold as a fraction of the model context // window (GADFLY_COMPACT_RATIO), clamped to (0,1]; default defaultCompactRatio. @@ -355,10 +357,15 @@ func resolveContextTokens(modelSpec string) int { key = strings.TrimSpace(os.Getenv("OLLAMA_API_KEY")) } cache := model.NewCloudOllamaLimitCache("", key, nil) - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), contextTokenLookupTimeout) defer cancel() if n, ok := model.MaxContextTokensResolving(ctx, modelSpec, cache); ok { return n } + // Unknown model or a failed lookup (e.g. no key / unreachable endpoint): don't + // guess — compaction is disabled. Log it so a misconfiguration is debuggable + // rather than silently dropping the protection. Set GADFLY_MODEL_CONTEXT_TOKENS + // to force a window for an endpoint executus can't introspect. + fmt.Fprintf(os.Stderr, "gadfly: no context window resolved for %q; compaction disabled (set GADFLY_MODEL_CONTEXT_TOKENS to enable it)\n", modelSpec) return 0 } diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index 85b5b69..18e6dc5 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -39,10 +39,9 @@ // GADFLY_TITLE PR title (optional). // GADFLY_BODY PR description (optional). // GADFLY_MAX_STEPS review-pass step cap (optional, default 24). -// GADFLY_WRAPUP_RESERVE steps before the cap at which the agent is told to -// stop investigating and write its answer (optional, -// default 4). Plus a tool-free finalization fallback -// guarantees a step-exhausted pass still emits output. +// GADFLY_WRAPUP_RESERVE steps before the cap at which the wrap-up critic nudges +// the agent to stop investigating and write its answer +// (optional, default 4). // GADFLY_RECHECK set to 0/false to skip the recheck pass (optional, default on). // GADFLY_RECHECK_MAX_STEPS recheck-pass step cap (optional, default 16). // GADFLY_TIMEOUT_SECS overall deadline in seconds, shared by both passes (optional, default 300). @@ -55,8 +54,8 @@ // lanes as GADFLY_PROVIDER_CONCURRENCY (e.g. // "ollama-cloud=3,m1=1"). Wins over GADFLY_LENS_CONCURRENCY // for the model's provider; falls back to it otherwise. -// GADFLY_MAX_DIFF_CHARS diff chars embedded in the prompt (optional, default 60000; -// the full diff is always available via the get_diff tool). +// GADFLY_MAX_DIFF_CHARS diff chars embedded in the review prompt (optional, default 60000; +// the full diff is reachable via the paginated get_diff tool). // // On success it prints the review to stdout and exits 0. On a usage/config or // model error it prints a diagnostic to stderr and exits non-zero; run.sh then @@ -322,8 +321,7 @@ func providerOverride(envName, provider string) (int, bool) { // returned bool is true when the review pass failed (rendered as an inline // notice — advisory; one lens failing never sinks the others or the job). func reviewWithSpecialist(eng reviewEngine, base string, sp Specialist, task, diff string) (string, bool) { - timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) + ctx, cancel := context.WithTimeout(context.Background(), reviewTimeout()) defer cancel() draft, err := eng.runPass(ctx, composeSpecialistPrompt(base, sp), task, @@ -389,3 +387,23 @@ func envInt(name string, def int) int { } return n } + +// envBool reads a boolean-ish env var: def when unset, false for an explicit +// falsey value (0/false/no/off), true otherwise. The shared spelling for +// gadfly's "on unless disabled" opt-out flags (GADFLY_RECHECK, GADFLY_COMPACT). +func envBool(name string, def bool) bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv(name))) { + case "": + return def + case "0", "false", "no", "off": + return false + default: + return true + } +} + +// reviewTimeout is the per-specialist-lens deadline (GADFLY_TIMEOUT_SECS), shared +// across a lens's review+recheck passes and applied as each pass's run cap. +func reviewTimeout() time.Duration { + return time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second +} diff --git a/cmd/gadfly/recheck.go b/cmd/gadfly/recheck.go index e00bfa7..0e14539 100644 --- a/cmd/gadfly/recheck.go +++ b/cmd/gadfly/recheck.go @@ -2,7 +2,6 @@ package main import ( "fmt" - "os" "strings" ) @@ -65,14 +64,7 @@ Output rules: // recheckEnabled reports whether the verification pass should run. On unless // GADFLY_RECHECK is explicitly a falsey value. -func recheckEnabled() bool { - switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_RECHECK"))) { - case "0", "false", "no", "off": - return false - default: - return true - } -} +func recheckEnabled() bool { return envBool("GADFLY_RECHECK", true) } // shouldRecheck decides whether to run the verification pass for a given draft. // A clean "no material issues" draft has nothing to verify, so it is skipped diff --git a/cmd/gadfly/recheck_test.go b/cmd/gadfly/recheck_test.go index 75e8217..0ffe267 100644 --- a/cmd/gadfly/recheck_test.go +++ b/cmd/gadfly/recheck_test.go @@ -49,7 +49,7 @@ func TestRecheckEnabled(t *testing.T) { } func TestBuildRecheckTask(t *testing.T) { - t.Setenv("GADFLY_MAX_DIFF_CHARS", "") + t.Setenv("GADFLY_RECHECK_DIFF_CHARS", "") draft := "VERDICT: Blocking issues found\n- foo.go:1 broken" out := buildRecheckTask(draft, "diff --git a/x b/x\n+y\n") if !strings.Contains(out, draft) { diff --git a/cmd/gadfly/tools.go b/cmd/gadfly/tools.go index ece4183..3c1b21b 100644 --- a/cmd/gadfly/tools.go +++ b/cmd/gadfly/tools.go @@ -9,6 +9,7 @@ import ( "regexp" "sort" "strings" + "sync" llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" ) @@ -42,6 +43,12 @@ type repoFS struct { root string // absolute, symlink-resolved repo root diff string // the full PR unified diff (served by get_diff) worker llm.Model // optional cheap model for delegate_investigation; nil = no delegation + + // diffLines caches the split diff so paging through get_diff doesn't re-split + // the whole (possibly large) diff on every call. The diff is immutable, so the + // cache is computed once and safe to share across concurrent lenses. + diffOnce sync.Once + diffLines []string } // newRepoFS resolves root to an absolute, symlink-free path. @@ -110,15 +117,24 @@ func (r *repoFS) fsTools() []llm.Tool { } } -// toolbox builds the reviewer's toolbox: the read-only repo tools, plus the -// delegate_investigation tool when a worker model is configured. -func (r *repoFS) toolbox() (*llm.Toolbox, error) { - box := llm.NewToolbox("gadfly") +// allTools is the single source of truth for the reviewer's tool set: the +// read-only repo tools, plus delegate_investigation when a worker model is +// configured. Both the executus registry (gadflyToolRegistry, the production +// path) and toolbox() build from this list. +func (r *repoFS) allTools() []llm.Tool { tools := r.fsTools() if r.worker != nil { tools = append(tools, r.delegateTool()) } - for _, t := range tools { + return tools +} + +// toolbox builds a majordomo toolbox from allTools(). The production review path +// now goes through executus's tool.Registry (see executus.go); this remains for +// the toolbox-level tests (the `call` helper). +func (r *repoFS) toolbox() (*llm.Toolbox, error) { + box := llm.NewToolbox("gadfly") + for _, t := range r.allTools() { if err := box.Add(t); err != nil { return nil, fmt.Errorf("add tool %q: %w", t.Name, err) } @@ -409,30 +425,49 @@ func (r *repoFS) getDiffTool() llm.Tool { "get_diff", "Return the unified diff under review as a numbered, PAGINATED window (like read_file) — not the whole diff at once, so a huge PR can't blow the context window. Pass `path` to fetch just one changed file's hunks, or `start_line`/`limit` to page through. A truncated copy of the diff is also embedded in the task message.", func(_ context.Context, args getDiffArgs) (any, error) { - diff := r.diff scope := "the diff" + var lines []string if p := strings.TrimSpace(args.Path); p != "" { - diff = diffForPath(r.diff, p) - if strings.TrimSpace(diff) == "" { + lines = diffLinesForPath(r.diff, p) + if len(lines) == 0 { return fmt.Sprintf("(no diff hunks for path %q; check it against the changed-files list — the path must match a file the diff touches)", p), nil } scope = "the diff for " + p + } else { + lines = r.diffAllLines() + if len(lines) == 0 { + return "(empty diff)", nil + } } - if strings.TrimSpace(diff) == "" { - return "(empty diff)", nil - } - return windowDiff(diff, scope, args.StartLine, args.Limit), nil + return windowDiff(lines, scope, args.StartLine, args.Limit), nil }, ) } -// windowDiff returns a numbered, paginated slice of a unified diff, mirroring -// read_file's caps (maxGetDiffLines / maxGetDiffBytes / maxLineLen) so a single -// get_diff call can never dump a multi-hundred-KB diff into the transcript — the -// amplifier behind the large-PR token burn. The full diff stays reachable by -// paging with start_line, or scoped per file via the path arg. -func windowDiff(diff, scope string, start, limit int) string { +// diffAllLines returns the whole diff split into lines, cached so paging never +// re-splits the (possibly large) diff. +func (r *repoFS) diffAllLines() []string { + r.diffOnce.Do(func() { r.diffLines = splitDiffLines(r.diff) }) + return r.diffLines +} + +// splitDiffLines splits a unified diff into lines, dropping the single trailing +// empty element a trailing newline produces — otherwise windowDiff would emit a +// blank final line and over-count the total by one. +func splitDiffLines(diff string) []string { lines := strings.Split(diff, "\n") + if n := len(lines); n > 0 && lines[n-1] == "" { + lines = lines[:n-1] + } + return lines +} + +// windowDiff returns a numbered, paginated slice of pre-split diff lines, +// mirroring read_file's caps (maxGetDiffLines / maxGetDiffBytes / maxLineLen) so +// a single get_diff call can never dump a multi-hundred-KB diff into the +// transcript — the amplifier behind the large-PR token burn. The full diff stays +// reachable by paging with start_line, or scoped per file via the path arg. +func windowDiff(lines []string, scope string, start, limit int) string { total := len(lines) if start < 1 { start = 1 @@ -458,26 +493,45 @@ func windowDiff(diff, scope string, start, limit int) string { emitted++ } if i < total { - fmt.Fprintf(&b, "... (%s truncated at line %d of %d; call get_diff again with start_line=%d for more, or pass a `path` to scope to one file)\n", scope, i, total, i+1) + // i (0-based) is the first line NOT emitted; line i was the last shown. + fmt.Fprintf(&b, "... (%s truncated after line %d of %d; call get_diff again with start_line=%d for the rest, or pass a `path` to scope to one file)\n", scope, i, total, i+1) } return b.String() } -// diffForPath extracts the unified-diff section for one changed file: from the -// `diff --git` header that names the path through to the next `diff --git` -// header (or end). Matching is a substring on the header line, so either side -// (a/ or b/) hits, including a rename's two paths. -func diffForPath(diff, path string) string { - var b strings.Builder +// diffLinesForPath returns the unified-diff lines for one changed file: from the +// `diff --git` header that names path through to the next header (or end). The +// header names two path tokens (a/ b/); a match is on a WHOLE token, +// so path "foo.go" does not pull in "barfoo.go" — and a trailing "/" scopes to a +// directory (e.g. "pkg/foo/" matches pkg/foo/bar.go). +func diffLinesForPath(diff, path string) []string { + want := strings.TrimPrefix(strings.TrimPrefix(strings.TrimSpace(path), "a/"), "b/") + var out []string inSection := false - for _, ln := range strings.Split(diff, "\n") { + for _, ln := range splitDiffLines(diff) { if strings.HasPrefix(ln, "diff --git ") { - inSection = strings.Contains(ln, path) + inSection = diffHeaderNames(ln, want) } if inSection { - b.WriteString(ln) - b.WriteByte('\n') + out = append(out, ln) } } - return b.String() + return out +} + +// diffHeaderNames reports whether a `diff --git a/X b/Y` header names want as one +// of its (a/b-stripped) path tokens — exact whole-token match, or a directory +// prefix when want ends with "/". +func diffHeaderNames(header, want string) bool { + fields := strings.Fields(header) + if len(fields) < 3 { + return false + } + for _, f := range fields[2:] { + p := strings.TrimPrefix(strings.TrimPrefix(f, "a/"), "b/") + if p == want || (strings.HasSuffix(want, "/") && strings.HasPrefix(p, want)) { + return true + } + } + return false } diff --git a/cmd/gadfly/tools_test.go b/cmd/gadfly/tools_test.go index 10dc644..4704162 100644 --- a/cmd/gadfly/tools_test.go +++ b/cmd/gadfly/tools_test.go @@ -247,7 +247,7 @@ func TestGetDiffTool_Paginates(t *testing.T) { if err != nil { t.Fatalf("get_diff: %v", err) } - if !strings.Contains(out, "truncated at line") { + if !strings.Contains(out, "truncated after line") { t.Error("a diff longer than the per-call cap should be truncated with a paging hint") } if strings.Contains(out, "801\t") { @@ -263,6 +263,30 @@ func TestGetDiffTool_Paginates(t *testing.T) { } } +// TestDiffLinesForPath_Anchored: get_diff path= matches on a WHOLE path token, +// so "foo.go" never pulls in "barfoo.go" (the unanchored-substring weakness the +// swarm flagged), while a trailing "/" still scopes to a directory. +func TestDiffLinesForPath_Anchored(t *testing.T) { + diff := "diff --git a/foo.go b/foo.go\n+foo change\n" + + "diff --git a/barfoo.go b/barfoo.go\n+barfoo change\n" + + "diff --git a/pkg/x.go b/pkg/x.go\n+x change\n" + + joined := strings.Join(diffLinesForPath(diff, "foo.go"), "\n") + if !strings.Contains(joined, "foo change") { + t.Errorf("foo.go should match its own hunk:\n%s", joined) + } + if strings.Contains(joined, "barfoo change") { + t.Errorf("foo.go must NOT match barfoo.go (unanchored substring regression):\n%s", joined) + } + + if !strings.Contains(strings.Join(diffLinesForPath(diff, "pkg/"), "\n"), "x change") { + t.Error("a trailing-slash path should scope to the directory (pkg/ -> pkg/x.go)") + } + if len(diffLinesForPath(diff, "nope.go")) != 0 { + t.Error("an unknown path should yield no lines") + } +} + func TestNewRepoFS_BadRoot(t *testing.T) { // A file (not a directory) is rejected. f := filepath.Join(t.TempDir(), "afile") diff --git a/entrypoint.sh b/entrypoint.sh index 35c2453..7927022 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -205,7 +205,9 @@ if [ "$HUGE_DIFF_BYTES" -gt 0 ] 2>/dev/null; then export GADFLY_MAX_STEPS="${GADFLY_HUGE_DIFF_MAX_STEPS:-12}" export GADFLY_RECHECK_MAX_STEPS="${GADFLY_HUGE_DIFF_RECHECK_MAX_STEPS:-8}" export GADFLY_RECHECK="${GADFLY_HUGE_DIFF_RECHECK:-0}" # skip recheck on huge PRs - export MAX_DIFF_CHARS="${GADFLY_HUGE_DIFF_MAX_DIFF_CHARS:-20000}" # run.sh -> GADFLY_MAX_DIFF_CHARS + # The Go-visible name directly (run.sh prefers GADFLY_MAX_DIFF_CHARS over its + # own MAX_DIFF_CHARS), so the cap is honored without relying on run.sh's alias. + export GADFLY_MAX_DIFF_CHARS="${GADFLY_HUGE_DIFF_MAX_DIFF_CHARS:-20000}" # Surfaced on each posted comment so the shallower review is self-explaining. export GADFLY_NOTICE="⚠️ Large PR (${PR_DIFF_BYTES} bytes): Gadfly downshifted to a focused, single-model review to stay within budget — coverage is intentionally shallower. Consider splitting the PR for a deeper review." fi @@ -333,7 +335,7 @@ fi # findings were gathered are still posted and the job never fails (advisory). # GADFLY_PR_BUDGET_SECS=0 (default) disables it. KILLER_PID="" -rm -f "${WORKDIR}/.budget_killed" 2>/dev/null || true +rm -f "${WORKDIR}/.budget_killed" "${WORKDIR}/.disarmed" 2>/dev/null || true if [ "${GADFLY_PR_BUDGET_SECS:-0}" -gt 0 ] 2>/dev/null; then ( sleep "${GADFLY_PR_BUDGET_SECS}" @@ -342,7 +344,10 @@ if [ "${GADFLY_PR_BUDGET_SECS:-0}" -gt 0 ] 2>/dev/null; then pkill -TERM -f '/usr/local/bin/gadfly' 2>/dev/null || true pkill -TERM -f "${SCRIPTS_DIR}/run.sh" 2>/dev/null || true sleep 5 - pkill -KILL -f '/usr/local/bin/gadfly' 2>/dev/null || true + # Guard the delayed SIGKILL on the disarm marker: once the lanes finished and + # the watchdog was disarmed, the consolidation gadfly pass runs next, and a + # name-based KILL here must NOT catch it. + [ -f "${WORKDIR}/.disarmed" ] || pkill -KILL -f '/usr/local/bin/gadfly' 2>/dev/null || true ) & KILLER_PID=$! log "PR budget watchdog armed (${GADFLY_PR_BUDGET_SECS}s, pid ${KILLER_PID})" @@ -360,8 +365,13 @@ done [ "${#LANE_PIDS[@]}" -gt 0 ] && wait "${LANE_PIDS[@]}" # Reviews finished (or the watchdog killed them): disarm the watchdog so its -# delayed SIGKILL can't catch the consolidation pass that runs next. -if [ -n "$KILLER_PID" ]; then kill "$KILLER_PID" 2>/dev/null || true; fi +# delayed SIGKILL can't catch the consolidation pass that runs next. Drop the +# disarm marker FIRST so even a racing watchdog that already reached its KILL line +# skips it (the kill below also tears the watchdog subshell down during its sleep). +if [ -n "$KILLER_PID" ]; then + : > "${WORKDIR}/.disarmed" + kill "$KILLER_PID" 2>/dev/null || true +fi # If the backstop fired, note it on the consensus comment (per-model comments # were already posted during the run; a killed model surfaces as a failed lane). diff --git a/scripts/run.sh b/scripts/run.sh index 6e0a686..1731e6a 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -177,7 +177,7 @@ case "$PROVIDER" in GADFLY_SYSTEM_FILE="${SCRIPT_DIR}/system-prompt.txt" \ GADFLY_TITLE="$TITLE" \ GADFLY_BODY="$BODY" \ - GADFLY_MAX_DIFF_CHARS="$MAX_DIFF_CHARS" \ + GADFLY_MAX_DIFF_CHARS="${GADFLY_MAX_DIFF_CHARS:-$MAX_DIFF_CHARS}" \ GADFLY_STATUS_FILE="${GADFLY_STATUS_FILE:-}" \ GADFLY_FINDINGS_OUT="${GADFLY_FINDINGS_OUT:-}" \ "$BIN" 2>"$ERR_FILE"