feat: re-platform agentic review onto executus + large-PR cost controls

Fixes the large-PR token burn: a ~250K-token diff was re-sent on every agent step across models × lenses × passes, draining a metered usage block in minutes. Small PRs are untouched (every mitigation is size-gated / a no-op under the threshold).

- Re-platform the in-process review path onto executus run.Executor: context compaction (executus/compact, threshold from the model's real context window via executus/model), run-bounding, a per-PR budget gate (Ports.Budget), and the wrap-up nudge re-expressed as a run.Critic. Lens fan-out now uses executus/fanout. gadfly keeps its own model.go, so GADFLY_ENDPOINT_<NAME> aliases and the claude-code engine are unaffected. No majordomo bump; the binary stays static (executus core is majordomo+stdlib only).
- Paginate get_diff (per-file `path` + start_line/limit) instead of dumping the whole diff; trim the recheck diff embed (60k -> 20k chars).
- entrypoint.sh: downshift the fleet above GADFLY_HUGE_DIFF_BYTES (one cheap model, fewer lenses/steps, no recheck) + a swarm-wide GADFLY_PR_BUDGET_SECS wall-clock backstop (adds procps for pkill). All advisory; CI never fails.
- README + CLAUDE.md + tests updated.

Note: run.Result exposes no transcript, so the old transcript-based forced-finalization fallback is dropped; the wrap-up critic nudge is the remaining "always emit something" mechanism.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 10:43:34 -04:00
parent 5007597cf9
commit b860119332
19 changed files with 935 additions and 224 deletions
@@ -183,6 +183,34 @@ export GADFLY_FINDINGS_TOKEN="${GADFLY_FINDINGS_TOKEN:-}"
 MODELS="${GADFLY_MODELS:-${OLLAMA_REVIEW_MODELS:-$DEFAULT_MODELS}}"
 DEFAULT_CONC="${GADFLY_CONCURRENCY:-1}"

+# --- huge-PR downshift ------------------------------------------------------
+# A very large diff is what burns the model budget: every review step re-sends
+# it, multiplied across models × lenses × passes × steps (one giant PR drained a
+# whole Ollama Cloud block). entrypoint is the only process spanning the fleet,
+# so the size decision lives here: measure the PR diff ONCE and, above
+# GADFLY_HUGE_DIFF_BYTES, collapse to one cheap model, a focused lens subset,
+# fewer steps, no recheck and a smaller embedded diff. Each knob has a
+# GADFLY_HUGE_DIFF_* override; a threshold of 0 (or a non-integer) disables it.
+HUGE_PR=0
+HUGE_DIFF_BYTES="${GADFLY_HUGE_DIFF_BYTES:-600000}"
+if [ "$HUGE_DIFF_BYTES" -gt 0 ] 2>/dev/null; then
+  # Best-effort size probe: a failed fetch just yields a small count, never an abort.
+  PR_DIFF_BYTES="$(API "${GITEA_API}/pulls/${PR}.diff" 2>/dev/null | wc -c | tr -d '[:space:]' || true)"
+  PR_DIFF_BYTES="${PR_DIFF_BYTES:-0}"
+  if [ "$PR_DIFF_BYTES" -gt "$HUGE_DIFF_BYTES" ] 2>/dev/null; then
+    HUGE_PR=1
+    log "huge PR: diff ${PR_DIFF_BYTES}B > ${HUGE_DIFF_BYTES}B — downshifting the fleet (advisory)"
+    MODELS="${GADFLY_HUGE_DIFF_MODELS:-${MODELS%%,*}}" # first model only by default
+    export GADFLY_SPECIALISTS="${GADFLY_HUGE_DIFF_SPECIALISTS:-security,correctness,error-handling}"
+    export GADFLY_MAX_STEPS="${GADFLY_HUGE_DIFF_MAX_STEPS:-12}"
+    export GADFLY_RECHECK_MAX_STEPS="${GADFLY_HUGE_DIFF_RECHECK_MAX_STEPS:-8}"
+    export GADFLY_RECHECK="${GADFLY_HUGE_DIFF_RECHECK:-0}"               # skip recheck on huge PRs
+    export MAX_DIFF_CHARS="${GADFLY_HUGE_DIFF_MAX_DIFF_CHARS:-20000}"    # run.sh -> GADFLY_MAX_DIFF_CHARS
+    # Surfaced on each posted comment so the shallower review is self-explaining.
+    export GADFLY_NOTICE="⚠️ Large PR (${PR_DIFF_BYTES} bytes): Gadfly downshifted to a focused, single-model review to stay within budget — coverage is intentionally shallower. Consider splitting the PR for a deeper review."
+  fi
+fi
+
 provider_of() { case "$1" in */*) echo "${1%%/*}";; *) echo "${GADFLY_PROVIDER:-ollama-cloud}";; esac; } # text before the first '/', else $GADFLY_PROVIDER (default ollama-cloud)

 # Per-model status file path for the live board. The model id can contain '/'
@@ -297,6 +325,29 @@ if [ "${GADFLY_STATUS_BOARD:-1}" != "0" ]; then
  log "status board started (pid ${BOARD_PID})"
 fi

+# --- swarm-wide hard backstop ----------------------------------------------
+# Wall-clock ceiling across the WHOLE fleet so a pathological PR can never drain
+# the usage block. On expiry the watchdog TERMs the review subtrees (binary +
+# run.sh), KILLs the binary after 5s, and drops a .budget_killed marker; partial
+# findings are still posted (advisory). GADFLY_PR_BUDGET_SECS=0 (default) = off.
+KILLER_PID=""
+rm -f "${WORKDIR}/.budget_killed" 2>/dev/null || true
+if [ "${GADFLY_PR_BUDGET_SECS:-0}" -gt 0 ] 2>/dev/null; then
+  (
+    sleep "${GADFLY_PR_BUDGET_SECS}" & nap=$!   # backgrounded + waited on, so a TERM is handled at once
+    trap 'kill "$nap" 2>/dev/null || true; exit 0' TERM   # disarmed early: take the sleep down too (no orphan)
+    wait "$nap"; trap - TERM                     # budget elapsed: restore default TERM handling
+    log "PR wall-clock budget (${GADFLY_PR_BUDGET_SECS}s) reached — stopping the review fleet (advisory; partial findings still posted)"
+    : > "${WORKDIR}/.budget_killed"
+    pkill -TERM -f '/usr/local/bin/gadfly' 2>/dev/null || true
+    pkill -TERM -f "${SCRIPTS_DIR}/run.sh" 2>/dev/null || true
+    sleep 5   # grace period before escalating to SIGKILL
+    pkill -KILL -f '/usr/local/bin/gadfly' 2>/dev/null || true
+  ) &
+  KILLER_PID=$!
+  log "PR budget watchdog armed (${GADFLY_PR_BUDGET_SECS}s, pid ${KILLER_PID})"
+fi
+
 log "providers: ${PROVIDERS:-none}"
 # Each provider lane runs in parallel; cap is enforced within each lane. Track
 # the lane PIDs so we wait ONLY for the review work — not the status board,
@@ -308,6 +359,16 @@ for p in $PROVIDERS; do
 done
 [ "${#LANE_PIDS[@]}" -gt 0 ] && wait "${LANE_PIDS[@]}"

+# The lanes are done (finished, or stopped by the watchdog). Cancel the watchdog
+# now so a pending SIGKILL cannot land on the consolidation pass that follows.
+[ -z "$KILLER_PID" ] || kill "$KILLER_PID" 2>/dev/null || true
+
+# Backstop fired: mark the consensus comment partial (per-model comments are already posted).
+if [ -f "${WORKDIR}/.budget_killed" ]; then
+  GADFLY_NOTICE="${GADFLY_NOTICE:+${GADFLY_NOTICE} }⏱️ This review was stopped early by the per-PR time budget (GADFLY_PR_BUDGET_SECS); findings are partial."
+  export GADFLY_NOTICE
+fi
+
 # Reviews are done: signal the board to render the final state once and exit.
 if [ -n "$BOARD_PID" ]; then
  touch "${STATUS_DIR}/.done" 2>/dev/null || true
@@ -331,7 +392,9 @@ if [ "$CONSOLIDATE" = "1" ]; then
  CONSENSUS="$(GADFLY_CONSOLIDATE_DIR="$FINDINGS_DIR" GADFLY_DIFF_FILE="$DIFF_FILE" \
    /usr/local/bin/gadfly 2>"${WORKDIR}/consolidate.err" || true)"
  if [ -n "$CONSENSUS" ]; then
-    BODY="$(printf '%s\n\n<sub>Automated adversarial review by Gadfly — consensus across the model swarm. Advisory only — does not block merge.</sub>' "$CONSENSUS")"
+    NOTICE_BLOCK=""
+    [ -n "${GADFLY_NOTICE:-}" ] && NOTICE_BLOCK="> ${GADFLY_NOTICE}"$'\n\n'
+    BODY="$(printf '%s%s\n\n<sub>Automated adversarial review by Gadfly — consensus across the model swarm. Advisory only — does not block merge.</sub>' "$NOTICE_BLOCK" "$CONSENSUS")"
    upsert_comment_body "<!-- gadfly-consensus -->" "$BODY"
    log "consensus comment posted"
  else