From 1cdda32dbcab237dcf94b44d1244e7a323a743a0 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sat, 27 Jun 2026 14:18:28 -0400 Subject: [PATCH 1/4] =?UTF-8?q?feat:=20live=20status-board=20comment=20?= =?UTF-8?q?=E2=80=94=20per-model/per-lens=20review=20progress?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 of the gadfly-games build. With several models × several lenses reviewing a PR, all you'd see mid-run is a row of "⏳ Reviewing…" placeholders. Add ONE consolidated, live-updating status-board comment that aggregates every model's per-lens progress (queued → running → finished + verdict), so progress is visible at a glance and a watcher can tell when the whole swarm is done. - cmd/gadfly: opt-in statusWriter (GADFLY_STATUS_FILE) publishes this model's lenses to a JSON file, written atomically (temp+rename) as runSpecialists transitions each lens. Inert when unset — plain runs and tests are unaffected. - scripts/status-board.sh: background renderer that polls the status dir and upserts one marker comment every GADFLY_STATUS_POLL_SECS (default 12s), caching the comment id to PATCH in place. Advisory and best-effort; the per-model findings comments are untouched. - entrypoint.sh: pre-seeds every model as queued, launches the board, waits only on the review lanes, then signals .done for a final render. Default on; disable with GADFLY_STATUS_BOARD=0. - Docs: README config table + "Live status board" section, example stub note, CLAUDE.md architecture map. gofmt clean, go vet quiet, go build + go test -race green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 1 + Dockerfile | 2 +- README.md | 27 +++++++ cmd/gadfly/main.go | 10 ++- cmd/gadfly/status.go | 131 +++++++++++++++++++++++++++++++ cmd/gadfly/status_test.go | 103 +++++++++++++++++++++++++ entrypoint.sh | 47 +++++++++++- examples/adversarial-review.yml | 4 + scripts/run.sh | 5 +- scripts/status-board.sh | 132 ++++++++++++++++++++++++++++++++ 10 files changed, 457 insertions(+), 5 deletions(-) create mode 100644 cmd/gadfly/status.go create mode 100644 cmd/gadfly/status_test.go create mode 100755 scripts/status-board.sh diff --git a/CLAUDE.md b/CLAUDE.md index ce156e0..8d49c16 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -39,6 +39,7 @@ cmd/gadfly/ the reviewer binary — pure producer of review markdown recheck.go second-pass verification prompt + verdict recompute *_test.go sandbox, recheck, wrap-up, spec/endpoint-parse, specialist-resolution tests scripts/run.sh fetch PR diff+meta, run the binary, upsert ONE labeled PR comment +scripts/status-board.sh render+upsert ONE live status-board comment (per-model/per-lens progress) scripts/system-prompt.txt the reviewer persona + verification discipline (generic, not repo-specific) entrypoint.sh container brains: trigger gating, PR clone, model loop (the logic that used to live in workflow YAML) diff --git a/Dockerfile b/Dockerfile index a76891f..b2ae511 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,5 +28,5 @@ RUN apk add --no-cache bash git curl jq ca-certificates COPY --from=build /out/gadfly /usr/local/bin/gadfly COPY scripts /app/scripts COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh /app/scripts/run.sh /usr/local/bin/gadfly +RUN chmod +x /entrypoint.sh /app/scripts/run.sh /app/scripts/status-board.sh /usr/local/bin/gadfly ENTRYPOINT ["/entrypoint.sh"] diff --git a/README.md b/README.md index b6f799b..78ffaba 100644 --- a/README.md +++ b/README.md @@ -192,6 +192,30 @@ GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3,m1=1" GADFLY_SPECIALISTS: 
"security,correctness,error-handling" ``` +### Live status board + +When several models (each with several lenses) review a PR, the individual findings land in +**one comment per model** — but while that's in flight all you'd see is a row of +`⏳ Reviewing…` placeholders. So Gadfly also upserts **one consolidated status-board comment** +that aggregates every model's per-lens progress as it happens: + +``` +## 🪰 Gadfly — live review status +1/3 reviewers finished · updated 2026-06-27 18:14:56Z + +#### `glm-5.2:cloud` · ollama-cloud — ⏳ 2/4 lenses +- ✅ security — No material issues found +- 🔄 correctness — running +- ⏸️ performance — queued +… +``` + +Each model process publishes its lenses (queued → running → finished + verdict) to a small +JSON file, and a background renderer in `entrypoint.sh` re-renders + upserts the single comment +every `GADFLY_STATUS_POLL_SECS` (default 12s) until the swarm finishes. It's advisory and +best-effort — the per-model findings comments are unaffected — and entirely separate from those. +Turn it off with `GADFLY_STATUS_BOARD=0`. + ### Triggers 1. A **new/reopened/ready** non-draft PR — automatic. @@ -217,6 +241,7 @@ fixes. This keeps usage down.) 
``` cmd/gadfly/ the agentic reviewer binary (majordomo + Ollama Cloud); zero deps beyond stdlib + majordomo scripts/run.sh fetches the PR diff, runs the reviewer, upserts one labeled comment +scripts/status-board.sh renders + upserts the single live status-board comment (per-lens progress) scripts/system-prompt.txt the reviewer persona + verification discipline entrypoint.sh the container brains: trigger gating, clone, model loop (logic lives here, not in YAML) Dockerfile multi-stage; build-time module creds (BuildKit secrets) never reach the final image @@ -252,6 +277,8 @@ The reviewer binary reads these (the stub/entrypoint set sane defaults): | `GADFLY_RECHECK` | on | set `0`/`false` to skip the recheck pass | | `GADFLY_RECHECK_MAX_STEPS` | 16 | recheck-pass step cap | | `GADFLY_MAX_DIFF_CHARS` | 60000 | diff chars embedded in the prompt (full diff via `get_diff`) | +| `GADFLY_STATUS_BOARD` | on | set `0` to disable the live status-board comment | +| `GADFLY_STATUS_POLL_SECS` | 12 | how often the status board re-renders/upserts | | `GADFLY_TRIGGER_PHRASE` | `@gadfly review` | comment phrase that re-triggers | | `GADFLY_ALLOWED_USERS` | *(collaborators)* | comma-separated allow-list for comment triggers | | `GADFLY_FINDINGS_URL` | — | gadfly-reports store base URL; set to enable findings telemetry (off when empty) | diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index 19eecb9..caf41d7 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -218,6 +218,11 @@ func run() error { func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []Specialist, task, diff string) []specialistResult { results := make([]specialistResult, len(specialists)) + // Optional live status board: publishes this model's per-lens progress to a + // file the entrypoint board renders. Inert (no-op) unless GADFLY_STATUS_FILE + // is set, so plain runs are unaffected. 
+ sw := newStatusWriter(os.Getenv("GADFLY_MODEL"), modelProvider(), specialists) + conc := min(lensConcurrency(), len(specialists)) sem := make(chan struct{}, conc) @@ -228,8 +233,11 @@ func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []S go func(i int, sp Specialist) { defer wg.Done() defer func() { <-sem }() + sw.set(sp.Name, lensRunning, "", false) out, errored := reviewWithSpecialist(mdl, fsTools, base, sp, task, diff) - results[i] = specialistResult{spec: sp, out: out, verdict: parseVerdict(out), errored: errored} + v := parseVerdict(out) + results[i] = specialistResult{spec: sp, out: out, verdict: v, errored: errored} + sw.set(sp.Name, lensFinished, v.label(), errored) }(i, sp) } wg.Wait() diff --git a/cmd/gadfly/status.go b/cmd/gadfly/status.go new file mode 100644 index 0000000..8681672 --- /dev/null +++ b/cmd/gadfly/status.go @@ -0,0 +1,131 @@ +package main + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + "sync" + "time" +) + +// Lens states for the live status board. A lens starts queued, becomes running +// when its pass begins, and ends finished (with a verdict, or errored). +const ( + lensQueued = "queued" + lensRunning = "running" + lensFinished = "finished" +) + +// lensStatus is one specialist lens's progress, as rendered by the entrypoint +// status board (scripts/status-board.sh). +type lensStatus struct { + Name string `json:"name"` + State string `json:"state"` // queued | running | finished + Verdict string `json:"verdict,omitempty"` // set when finished (the lens's label) + Errored bool `json:"errored,omitempty"` // the lens failed to complete +} + +// modelStatus is the on-disk shape one model process publishes for the live +// status board: a snapshot of this model's lenses as they progress. The board +// reads every model's file and renders a single consolidated PR comment. 
+type modelStatus struct { + Model string `json:"model"` + Provider string `json:"provider"` + Started int64 `json:"started"` // unix seconds + Updated int64 `json:"updated"` // unix seconds, bumped on every change + Done bool `json:"done"` // all lenses finished + Lenses []lensStatus `json:"lenses"` +} + +// statusWriter maintains a model's status file as its lenses progress. It is +// purely opt-in: when GADFLY_STATUS_FILE is unset the writer's path is empty and +// every method is a no-op, so a plain run (and the unit tests) never touch the +// filesystem and behave exactly as before. Writes are atomic (temp file + +// rename within the same dir) so the board never reads a half-written file even +// though lenses can finish concurrently. +type statusWriter struct { + path string + mu sync.Mutex + st modelStatus +} + +// newStatusWriter seeds a writer with every lens queued and flushes the initial +// snapshot. model/provider are echoed into the file so the board can render +// them without re-deriving from the filename (which is sanitized). The status +// file path comes from GADFLY_STATUS_FILE (set by run.sh per model); when empty +// the writer is inert. +func newStatusWriter(model, provider string, specialists []Specialist) *statusWriter { + w := &statusWriter{path: strings.TrimSpace(os.Getenv("GADFLY_STATUS_FILE"))} + w.st = modelStatus{ + Model: model, + Provider: provider, + Started: time.Now().Unix(), + } + for _, sp := range specialists { + w.st.Lenses = append(w.st.Lenses, lensStatus{Name: sp.Name, State: lensQueued}) + } + w.flush() + return w +} + +// set transitions a lens to a new state (and verdict/errored when finished), +// recomputes the overall done flag, and atomically rewrites the file. Unknown +// lens names are ignored. Safe for concurrent callers (one goroutine per lens). 
+func (w *statusWriter) set(name, state, verdict string, errored bool) { + if w == nil || w.path == "" { + return + } + w.mu.Lock() + defer w.mu.Unlock() + for i := range w.st.Lenses { + if w.st.Lenses[i].Name == name { + w.st.Lenses[i].State = state + w.st.Lenses[i].Verdict = verdict + w.st.Lenses[i].Errored = errored + break + } + } + done := true + for _, l := range w.st.Lenses { + if l.State != lensFinished { + done = false + break + } + } + w.st.Done = done + w.flush() +} + +// flush writes the current snapshot atomically. Best-effort: any error is +// swallowed (the status board is advisory and must never affect the review). +func (w *statusWriter) flush() { + if w.path == "" { + return + } + w.st.Updated = time.Now().Unix() + data, err := json.MarshalIndent(&w.st, "", " ") + if err != nil { + return + } + dir := filepath.Dir(w.path) + tmp, err := os.CreateTemp(dir, ".status-*.tmp") + if err != nil { + return + } + tmpName := tmp.Name() + if _, err := tmp.Write(data); err != nil { + tmp.Close() + os.Remove(tmpName) + return + } + if err := tmp.Close(); err != nil { + os.Remove(tmpName) + return + } + // Rename is atomic within the same filesystem, so the board reader sees + // either the old complete file or the new complete file — never a partial. + if err := os.Rename(tmpName, w.path); err != nil { + os.Remove(tmpName) + } +} diff --git a/cmd/gadfly/status_test.go b/cmd/gadfly/status_test.go new file mode 100644 index 0000000..6e556bd --- /dev/null +++ b/cmd/gadfly/status_test.go @@ -0,0 +1,103 @@ +package main + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" +) + +// readStatus loads a modelStatus written by the statusWriter. 
+func readStatus(t *testing.T, path string) modelStatus { + t.Helper() + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read status file: %v", err) + } + var st modelStatus + if err := json.Unmarshal(data, &st); err != nil { + t.Fatalf("unmarshal status: %v", err) + } + return st +} + +func TestStatusWriterLifecycle(t *testing.T) { + path := filepath.Join(t.TempDir(), "glm.json") + t.Setenv("GADFLY_STATUS_FILE", path) + + specs := []Specialist{ + {Name: "security", Title: "Security"}, + {Name: "correctness", Title: "Correctness"}, + } + w := newStatusWriter("glm-5.2:cloud", "ollama-cloud", specs) + + // Initial snapshot: both lenses queued, model not done, metadata echoed. + st := readStatus(t, path) + if st.Model != "glm-5.2:cloud" || st.Provider != "ollama-cloud" { + t.Fatalf("model/provider not echoed: %+v", st) + } + if len(st.Lenses) != 2 { + t.Fatalf("want 2 lenses, got %d", len(st.Lenses)) + } + for _, l := range st.Lenses { + if l.State != lensQueued { + t.Fatalf("lens %q want queued, got %q", l.Name, l.State) + } + } + if st.Done { + t.Fatal("model marked done while lenses still queued") + } + if st.Started == 0 { + t.Fatal("started timestamp not set") + } + + // Transition one lens through running -> finished; model not yet done. + w.set("security", lensRunning, "", false) + if got := readStatus(t, path); got.Lenses[0].State != lensRunning { + t.Fatalf("security want running, got %q", got.Lenses[0].State) + } + w.set("security", lensFinished, "No material issues found", false) + st = readStatus(t, path) + if st.Lenses[0].State != lensFinished || st.Lenses[0].Verdict != "No material issues found" { + t.Fatalf("security finish not recorded: %+v", st.Lenses[0]) + } + if st.Done { + t.Fatal("model marked done with one lens still queued") + } + + // Finish the second lens (errored) -> model done. 
+ w.set("correctness", lensFinished, "Reviewed", true) + st = readStatus(t, path) + if !st.Done { + t.Fatal("model should be done after all lenses finished") + } + if !st.Lenses[1].Errored { + t.Fatal("errored flag not recorded for correctness") + } + if st.Updated < st.Started { + t.Fatalf("updated (%d) should be >= started (%d)", st.Updated, st.Started) + } +} + +// With GADFLY_STATUS_FILE unset the writer is inert: no file, no panic. +func TestStatusWriterDisabled(t *testing.T) { + t.Setenv("GADFLY_STATUS_FILE", "") + w := newStatusWriter("m", "p", []Specialist{{Name: "security"}}) + w.set("security", lensFinished, "Minor issues", false) + // Nothing to assert beyond "did not panic / did not write" — a nil-safe set + // on the disabled writer is the contract. + if w.path != "" { + t.Fatalf("expected empty path when disabled, got %q", w.path) + } +} + +// set must ignore unknown lens names rather than panic or append. +func TestStatusWriterUnknownLens(t *testing.T) { + path := filepath.Join(t.TempDir(), "s.json") + t.Setenv("GADFLY_STATUS_FILE", path) + w := newStatusWriter("m", "p", []Specialist{{Name: "security"}}) + w.set("does-not-exist", lensRunning, "", false) + if st := readStatus(t, path); len(st.Lenses) != 1 || st.Lenses[0].State != lensQueued { + t.Fatalf("unknown lens mutated state: %+v", st.Lenses) + } +} diff --git a/entrypoint.sh b/entrypoint.sh index b25f358..f53a670 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -155,6 +155,12 @@ DEFAULT_CONC="${GADFLY_CONCURRENCY:-1}" provider_of() { case "$1" in */*) echo "${1%%/*}";; *) echo "${GADFLY_PROVIDER:-ollama-cloud}";; esac; } +# Per-model status file path for the live board. The model id can contain '/' +# and ':' (e.g. m1/qwen3:14b), so sanitize to a flat filename; the JSON inside +# carries the real model/provider, so this just needs to be unique per model. 
+STATUS_DIR="${WORKDIR}/status" +status_file_for() { echo "${STATUS_DIR}/$(echo "$1" | tr -c '[:alnum:]._-' '_').json"; } + provider_cap() { # provider -> concurrency (override map "p=N,...", else default) local p="$1" item k v IFS=',' read -ra _caps <<< "${GADFLY_PROVIDER_CONCURRENCY:-}" @@ -167,7 +173,10 @@ provider_cap() { # provider -> concurrency (override map "p=N,...", else default } review_one() { + local sf="" + [ "${GADFLY_STATUS_BOARD:-1}" != "0" ] && sf="$(status_file_for "$1")" PROVIDER=ollama MODEL="$1" GADFLY_BIN="/usr/local/bin/gadfly" GADFLY_REPO_DIR="$REPO_DIR" \ + GADFLY_STATUS_FILE="$sf" \ bash "${SCRIPTS_DIR}/run.sh" || log "model $1 failed (continuing)" } @@ -197,10 +206,44 @@ run_lane() { # $1=provider: run its models, at most `cap` at a time wait } +# --- live status board (optional, default on) ------------------------------ +# Each model process publishes per-lens progress to STATUS_DIR/<model>.json; a +# background renderer (status-board.sh) upserts ONE consolidated PR comment so +# progress across all models/lenses is visible at a glance — and a watcher can +# tell when the whole swarm is finished. Advisory/best-effort; the per-model +# findings still land in each model's own comment. Disable with +# GADFLY_STATUS_BOARD=0. +BOARD_PID="" +if [ "${GADFLY_STATUS_BOARD:-1}" != "0" ]; then + rm -rf "$STATUS_DIR"; mkdir -p "$STATUS_DIR" + # Pre-seed every model as queued so the board shows the full swarm from t=0, + # even models still waiting on their provider lane's concurrency cap. Each + # binary overwrites its own file with real per-lens detail once it starts. + for m in "${MODEL_LIST[@]}"; do + jq -n --arg model "$m" --arg provider "$(provider_of "$m")" \ + '{model:$model, provider:$provider, started:0, updated:0, done:false, lenses:[]}' \ + > "$(status_file_for "$m")" 2>/dev/null || true + done + GITEA_API="$GITEA_API" GITEA_TOKEN="$GITEA_TOKEN" PR="$PR" GADFLY_STATUS_DIR="$STATUS_DIR" \ + bash "${SCRIPTS_DIR}/status-board.sh" & + BOARD_PID=$! 
+ log "status board started (pid ${BOARD_PID})" +fi + log "providers: ${PROVIDERS:-none}" -# Each provider lane runs in parallel; cap is enforced within each lane. +# Each provider lane runs in parallel; cap is enforced within each lane. Track +# the lane PIDs so we wait ONLY for the review work — not the status board, +# which intentionally runs until we signal it below. +LANE_PIDS=() for p in $PROVIDERS; do run_lane "$p" & + LANE_PIDS+=("$!") done -wait +[ "${#LANE_PIDS[@]}" -gt 0 ] && wait "${LANE_PIDS[@]}" + +# Reviews are done: signal the board to render the final state once and exit. +if [ -n "$BOARD_PID" ]; then + touch "${STATUS_DIR}/.done" 2>/dev/null || true + wait "$BOARD_PID" 2>/dev/null || true +fi log "done" diff --git a/examples/adversarial-review.yml b/examples/adversarial-review.yml index 66a2f0a..82befb7 100644 --- a/examples/adversarial-review.yml +++ b/examples/adversarial-review.yml @@ -65,6 +65,10 @@ jobs: # GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3,m1=1" # GADFLY_LENS_CONCURRENCY: ${{ vars.GADFLY_LENS_CONCURRENCY }} # GADFLY_PROVIDER_LENS_CONCURRENCY: ${{ vars.GADFLY_PROVIDER_LENS_CONCURRENCY }} + # Live status board (optional; ON by default): one consolidated comment + # showing every model's per-lens progress as it runs. Disable with + # GADFLY_STATUS_BOARD=0; tune the refresh with GADFLY_STATUS_POLL_SECS. + # GADFLY_STATUS_BOARD: ${{ vars.GADFLY_STATUS_BOARD }} # --- Models & providers (optional; default = Ollama Cloud) ---------- # Gadfly is majordomo-powered, so it can target other backends. 
Set a # provider for bare model ids; point at a different endpoint with a diff --git a/scripts/run.sh b/scripts/run.sh index 2cbb097..0394f9e 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -24,7 +24,9 @@ # antigravity: `agy` on PATH with credentials already seeded (~/.gemini) # # Optional: -# MAX_DIFF_CHARS diff truncation cap for the prompt (default 60000) +# MAX_DIFF_CHARS diff truncation cap for the prompt (default 60000) +# GADFLY_STATUS_FILE per-model JSON path for the live status board (set by +# entrypoint.sh; empty/unset disables status publishing) # # This script is advisory: it never fails the job for review content. It exits # non-zero only on a usage/configuration error. @@ -161,6 +163,7 @@ case "$PROVIDER" in GADFLY_TITLE="$TITLE" \ GADFLY_BODY="$BODY" \ GADFLY_MAX_DIFF_CHARS="$MAX_DIFF_CHARS" \ + GADFLY_STATUS_FILE="${GADFLY_STATUS_FILE:-}" \ "$BIN" 2>"$ERR_FILE" )" rc=$? diff --git a/scripts/status-board.sh b/scripts/status-board.sh new file mode 100755 index 0000000..1da535b --- /dev/null +++ b/scripts/status-board.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +# Live status board for a Gadfly review. +# +# Each model process (the cmd/gadfly binary) publishes its per-lens progress to +# $GADFLY_STATUS_DIR/<model>.json as lenses go queued -> running -> finished. +# This script polls that directory and upserts ONE consolidated PR comment that +# aggregates every model's per-lens status, so a human (or an agent watching the +# PR) can see the whole swarm's progress at a glance and know when it's done — +# instead of staring at N separate "⏳ Reviewing…" placeholders. +# +# It is advisory and best-effort: a failed render/post is logged and retried on +# the next tick; nothing here can fail the review or block a merge. It runs in +# the background from entrypoint.sh and exits once the $GADFLY_STATUS_DIR/.done +# sentinel appears (the entrypoint touches it after all model lanes finish), +# after one final render. 
+# +# Required env: +# GITEA_API https://HOST/api/v1/repos/OWNER/REPO +# GITEA_TOKEN token with repo write access (posts the comment) +# PR pull request number +# GADFLY_STATUS_DIR directory holding the per-model .json files +# Optional: +# GADFLY_STATUS_POLL_SECS render/upsert interval (default 12) +set -uo pipefail + +: "${GITEA_API:?GITEA_API required}" +: "${GITEA_TOKEN:?GITEA_TOKEN required}" +: "${PR:?PR required}" +: "${GADFLY_STATUS_DIR:?GADFLY_STATUS_DIR required}" + +POLL="${GADFLY_STATUS_POLL_SECS:-12}" +DONE_FILE="${GADFLY_STATUS_DIR}/.done" +MARKER="" +API_TIMEOUT="--connect-timeout 20 --max-time 30" +BOARD_ID="" # cached comment id, so we PATCH in place instead of re-searching + +say() { echo "[gadfly-status-board] $*" >&2; } + +command -v jq >/dev/null 2>&1 || { say "jq not found; status board disabled"; exit 0; } + +# render_section FILE -> markdown for one model (its header + per-lens bullets). +# Reads the JSON the binary writes; tolerates a half-written/missing file by +# emitting nothing (jq exits non-zero -> caller skips it this tick). 
+render_section() { + jq -r ' + def icon(state; errored): + if state == "finished" then (if errored then "⚠️" else "✅" end) + elif state == "running" then "🔄" + else "⏸️" end; + def lensline: + "- " + icon(.state; (.errored // false)) + " **" + .name + "** — " + + ( if .state == "finished" then (if (.errored // false) then "could not complete" else (.verdict // "done") end) + elif .state == "running" then "running" + else "queued" end ); + ( [.lenses[] | select(.state == "finished")] | length ) as $fin + | ( .lenses | length ) as $tot + | ( if .done then "✅ done" + elif $tot == 0 then "⏳ waiting to start" + else "⏳ " + ($fin|tostring) + "/" + ($tot|tostring) + " lenses" end ) as $sum + | "#### `" + .model + "` · " + .provider + " — " + $sum + "\n" + + ( if $tot == 0 then "- ⏸️ _no lenses reported yet_" + else ([.lenses[] | lensline] | join("\n")) end ) + ' "$1" 2>/dev/null +} + +# render_body -> the full consolidated comment body (marker + header + sections). +render_body() { + local f sections="" total=0 done=0 ts + shopt -s nullglob + local files=("${GADFLY_STATUS_DIR}"/*.json) + shopt -u nullglob + for f in "${files[@]}"; do + local sec + sec="$(render_section "$f")" || continue + [ -z "$sec" ] && continue + total=$((total + 1)) + if [ "$(jq -r 'if .done then 1 else 0 end' "$f" 2>/dev/null)" = "1" ]; then + done=$((done + 1)) + fi + sections="${sections}${sec}"$'\n\n' + done + ts="$(date -u '+%Y-%m-%d %H:%M:%SZ')" + if [ "$total" -eq 0 ]; then + sections="_Waiting for reviewers to start…_"$'\n' + fi + printf '%s\n## 🪰 Gadfly — live review status\n\n%d/%d reviewers finished · updated %s\n\n%s\nLive status board. Findings are posted in each model'\''s own comment. Advisory only — does not block merge.' \ + "$MARKER" "$done" "$total" "$ts" "$sections" +} + +# find_existing -> id of the board comment if it already exists (paginate by +# marker). Used once, to recover the comment across container restarts. 
+find_existing() { + local page=1 cmts id + while [ "$page" -le 10 ]; do + cmts="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_API}/issues/${PR}/comments?limit=50&page=${page}" 2>/dev/null || echo '[]')" + [ "$(echo "$cmts" | jq 'length' 2>/dev/null || echo 0)" = "0" ] && break + id="$(echo "$cmts" | jq -r --arg m "$MARKER" \ + '.[] | select(.body != null and (.body | startswith($m))) | .id' 2>/dev/null | head -n1)" + [ -n "$id" ] && { echo "$id"; return; } + page=$((page + 1)) + done + echo "" +} + +# upsert BODY — PATCH the cached/known board comment, else POST a new one and +# cache its id. A failed PATCH (e.g. comment deleted) clears the cache so the +# next tick re-discovers or re-creates it. +upsert() { + local body="$1" post_body resp + post_body="$(jq -n --arg b "$body" '{body:$b}')" + [ -z "$BOARD_ID" ] && BOARD_ID="$(find_existing)" + if [ -n "$BOARD_ID" ]; then + if ! curl $API_TIMEOUT -fsS -X PATCH -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/comments/${BOARD_ID}" -d "$post_body" >/dev/null 2>&1; then + say "patch of comment ${BOARD_ID} failed; will re-discover" + BOARD_ID="" + fi + else + resp="$(curl $API_TIMEOUT -fsS -X POST -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/${PR}/comments" -d "$post_body" 2>/dev/null || echo '{}')" + BOARD_ID="$(echo "$resp" | jq -r '.id // ""' 2>/dev/null)" + fi +} + +say "starting (poll ${POLL}s, dir ${GADFLY_STATUS_DIR})" +while :; do + upsert "$(render_body)" + [ -f "$DONE_FILE" ] && break + sleep "$POLL" +done +say "done" -- 2.52.0 From 48af34f4ca6ac55ee91187037e3668ebf428fcbc Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sat, 27 Jun 2026 14:31:07 -0400 Subject: [PATCH 2/4] =?UTF-8?q?ci:=20dogfood=20the=20FULL=20fleet=20?= =?UTF-8?q?=E2=80=94=209=20cloud=20+=202=20Macs,=20matching=20mort?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The dogfood workflow had a truncated 5-model list (3 cloud + the 2 Macs) and was missing GADFLY_PROVIDER_LENS_CONCURRENCY. Restore mort's full fleet so gadfly reviews its own PRs with the same 11 reviewers and the model-quality scoreboard is comparable across both repos: 9 cloud: minimax-m3, glm-5.2, glm-5.1, kimi-k2.7-code, deepseek-v4-pro, nemotron-3-super, gpt-oss:120b, qwen3-coder:480b, gemma4 2 local: m1/qwen3:14b, m5/qwen3.6:35b-mlx GADFLY_MODELS / *_CONCURRENCY / *_LENS_CONCURRENCY now match mort's adversarial-review.yml verbatim. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/adversarial-review.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitea/workflows/adversarial-review.yml b/.gitea/workflows/adversarial-review.yml index 419d6d4..21354a4 100644 --- a/.gitea/workflows/adversarial-review.yml +++ b/.gitea/workflows/adversarial-review.yml @@ -4,7 +4,7 @@ # caches :latest, and this build is what carries foreman provider-type support) # as a specialist swarm and posts # ONE consolidated review comment as gitea-actions. Advisory only — never blocks a -# merge. Gadfly reviewing its OWN PRs — dogfooding, full fleet (3 cloud + the M1/M5 +# merge. Gadfly reviewing its OWN PRs — dogfooding, full fleet (9 cloud + the M1/M5 # Macs), copied from mort's setup. name: Adversarial Review (Gadfly) @@ -41,7 +41,7 @@ jobs: || github.actor == 'fizi' || github.actor == 'dazed')) runs-on: ubuntu-latest - # Full fleet (3 cloud + 2 local Macs, all running concurrently) reviewing + # Full fleet (9 cloud + 2 local Macs, all running concurrently) reviewing # every PR with the 3-lens suite — the slow local lanes dominate wall time. timeout-minutes: 90 steps: @@ -66,10 +66,14 @@ jobs: # still post. (Gitea secrets aren't auto-exposed — map each explicitly.) 
GADFLY_ENDPOINT_M1: ${{ secrets.GADFLY_ENDPOINT_M1 }} GADFLY_ENDPOINT_M5: ${{ secrets.GADFLY_ENDPOINT_M5 }} - # 3 cloud (parallel) + M1 Pro + M5 Max — one consolidated comment each. - GADFLY_MODELS: "minimax-m3:cloud,deepseek-v4-flash:cloud,glm-5.2:cloud,m1/qwen3:14b,m5/qwen3.6:35b-mlx" + # Full fleet: 9 cloud (3 at a time) + M1 Pro + M5 Max — one consolidated + # comment each. Matches mort's setup so the model-quality scoreboard is + # comparable across both repos. + GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,kimi-k2.7-code:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,gpt-oss:120b-cloud,qwen3-coder:480b-cloud,gemma4:cloud,m1/qwen3:14b,m5/qwen3.6:35b-mlx" # cloud runs 3 at once; each Mac one at a time; all three lanes parallel. GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,m1=1,m5=1" + # 3 cloud models x 3 lenses = 9 concurrent ollama-cloud queries (under the 10 budget). + GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3" # Default => the 3-lens suite (security, correctness, error-handling). # Set the repo var GADFLY_SPECIALISTS to override (csv / "all" / "auto"). GADFLY_SPECIALISTS: ${{ vars.GADFLY_SPECIALISTS || 'security,correctness,error-handling' }} -- 2.52.0 From 0e5e0ff089873d3e09191ea294e45ce504a6ce89 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sat, 27 Jun 2026 14:39:31 -0400 Subject: [PATCH 3/4] ci: drop M1 from gadfly's dogfood swarm (keep M5) The M1 Pro lane is too slow / low-signal for reviewing gadfly's own PRs, so remove it from the dogfood fleet: out of GADFLY_MODELS, out of GADFLY_PROVIDER_CONCURRENCY, and its GADFLY_ENDPOINT_M1 mapping dropped. M5 stays. (mort still runs both Macs.) Fleet is now 9 cloud + M5. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/adversarial-review.yml | 30 +++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/.gitea/workflows/adversarial-review.yml b/.gitea/workflows/adversarial-review.yml index 21354a4..09c7378 100644 --- a/.gitea/workflows/adversarial-review.yml +++ b/.gitea/workflows/adversarial-review.yml @@ -4,8 +4,8 @@ # caches :latest, and this build is what carries foreman provider-type support) # as a specialist swarm and posts # ONE consolidated review comment as gitea-actions. Advisory only — never blocks a -# merge. Gadfly reviewing its OWN PRs — dogfooding, full fleet (9 cloud + the M1/M5 -# Macs), copied from mort's setup. +# merge. Gadfly reviewing its OWN PRs — dogfooding, full cloud fleet (9 cloud + +# the M5 Mac; M1 dropped as too slow), copied from mort's setup. name: Adversarial Review (Gadfly) @@ -41,8 +41,8 @@ jobs: || github.actor == 'fizi' || github.actor == 'dazed')) runs-on: ubuntu-latest - # Full fleet (9 cloud + 2 local Macs, all running concurrently) reviewing - # every PR with the 3-lens suite — the slow local lanes dominate wall time. + # Fleet (9 cloud + 1 local Mac/M5, all running concurrently) reviewing + # every PR with the 3-lens suite — the slow local lane dominates wall time. timeout-minutes: 90 steps: - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-d7f364d @@ -50,10 +50,12 @@ jobs: GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }} GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} OLLAMA_CLOUD_API_KEY: ${{ secrets.OLLAMA_CLOUD_API_KEY }} - # Local Macs, reached through their foreman queues (native Ollama on the + # Local Mac (M5), reached through its foreman queue (native Ollama on the # wire). Gadfly's GADFLY_ENDPOINT_* form with the "foreman" provider - # type: GADFLY_ENDPOINT_M1 registers provider "m1", _M5 registers "m5", - # each building a foreman-preset Ollama client at the given URL. 
Values + # type: GADFLY_ENDPOINT_M5 registers provider "m5", building a + # foreman-preset Ollama client at the given URL. (M1 is dropped from + # gadfly's swarm — too slow/low-signal — so its endpoint isn't mapped.) + # Values # (host + token) live in gitea secrets, each of the form: # foreman|https://| # (converted from the komodo LLM_* DSNs foreman://@). @@ -64,14 +66,14 @@ jobs: # NOTE: the Mac behind each foreman must still be awake/reachable; if a # box is offline, that model's comment shows an error and the others # still post. (Gitea secrets aren't auto-exposed — map each explicitly.) - GADFLY_ENDPOINT_M1: ${{ secrets.GADFLY_ENDPOINT_M1 }} GADFLY_ENDPOINT_M5: ${{ secrets.GADFLY_ENDPOINT_M5 }} - # Full fleet: 9 cloud (3 at a time) + M1 Pro + M5 Max — one consolidated - # comment each. Matches mort's setup so the model-quality scoreboard is - # comparable across both repos. - GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,kimi-k2.7-code:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,gpt-oss:120b-cloud,qwen3-coder:480b-cloud,gemma4:cloud,m1/qwen3:14b,m5/qwen3.6:35b-mlx" - # cloud runs 3 at once; each Mac one at a time; all three lanes parallel. - GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,m1=1,m5=1" + # Fleet: 9 cloud (3 at a time) + M5 Max — one consolidated comment each. + # Matches mort's cloud set so the model-quality scoreboard is comparable + # across both repos. NOTE: M1 Pro is intentionally dropped here (too slow + # / low-signal for gadfly's own PRs); mort still runs it. + GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,kimi-k2.7-code:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,gpt-oss:120b-cloud,qwen3-coder:480b-cloud,gemma4:cloud,m5/qwen3.6:35b-mlx" + # cloud runs 3 at once; the Mac one at a time; both lanes parallel. + GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,m5=1" # 3 cloud models x 3 lenses = 9 concurrent ollama-cloud queries (under the 10 budget). 
GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3" # Default => the 3-lens suite (security, correctness, error-handling). -- 2.52.0 From a1b0691a1ef7a7c760263e29da0bff90d615dfc7 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sat, 27 Jun 2026 14:56:41 -0400 Subject: [PATCH 4/4] fix: fold in gadfly's own review findings (3 real bugs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dogfood swarm reviewed PR #1; folding in the warranted findings (graded via the gadfly MCP — 18 real / 18 false-positive across the 4 completed reviewers): - entrypoint.sh: finalize a never-written status file when run.sh skips the binary (empty diff / no key / missing binary). The pre-seed stayed {started:0, done:false}, so the board showed that model "waiting to start" forever and the N/N counter never completed — breaking the board's own "tell when everything is finished" invariant. (glm-5.2, correctness — the strongest finding.) - main.go: recover() in the per-lens goroutine. A panic previously crashed the whole binary (killing every other lens's output) and left the lens stuck "running" on the board. Now it's recorded as an errored result and the lens is marked finished. (glm-5.2 + minimax-m3.) - status-board.sh: coerce a non-numeric GADFLY_STATUS_POLL_SECS back to 12. Under `set -uo pipefail` a bad `sleep "$POLL"` failed silently and the loop spun, hammering the Gitea API. (glm-5.2, error-handling.) The remaining real findings (sanitizer collision, page-10 pagination, markdown-injection via PR-controlled lens names, cosmetic blank line) were graded trivial and left as-is — documented in the finding notes. gofmt clean, go vet quiet, go build + go test -race green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/gadfly/main.go | 10 ++++++++++ entrypoint.sh | 8 ++++++++ scripts/status-board.sh | 5 +++++ 3 files changed, 23 insertions(+) diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index caf41d7..3718c56 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -233,6 +233,16 @@ func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []S go func(i int, sp Specialist) { defer wg.Done() defer func() { <-sem }() + // A panic in one lens must not crash the whole binary (which would + // kill every other lens's output) or leave this lens stuck at + // "running" on the status board. Recover, record it as an errored + // result, and mark the lens finished so the board can complete. + defer func() { + if r := recover(); r != nil { + results[i] = specialistResult{spec: sp, out: fmt.Sprintf("⚠️ This reviewer panicked: %v", r), verdict: verdictUnknown, errored: true} + sw.set(sp.Name, lensFinished, "", true) + } + }() sw.set(sp.Name, lensRunning, "", false) out, errored := reviewWithSpecialist(mdl, fsTools, base, sp, task, diff) v := parseVerdict(out) diff --git a/entrypoint.sh b/entrypoint.sh index f53a670..23649b3 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -178,6 +178,14 @@ review_one() { PROVIDER=ollama MODEL="$1" GADFLY_BIN="/usr/local/bin/gadfly" GADFLY_REPO_DIR="$REPO_DIR" \ GADFLY_STATUS_FILE="$sf" \ bash "${SCRIPTS_DIR}/run.sh" || log "model $1 failed (continuing)" + # If the binary never wrote real status (run.sh skipped it: empty diff, no key, + # binary missing), the pre-seed stays {started:0, done:false} and the board + # would show this model "waiting to start" forever and never reach N/N. Mark + # such a never-started file done so the board can complete. The binary stamps a + # nonzero `started`, so that reliably distinguishes "ran" from "skipped". 
+ if [ -n "$sf" ] && [ -f "$sf" ] && [ "$(jq -r '.started // 0' "$sf" 2>/dev/null)" = "0" ]; then + tmp="$(jq '.done = true' "$sf" 2>/dev/null)" && printf '%s' "$tmp" > "$sf" + fi } # Normalize the model list (trim, drop blanks) into MODEL_LIST. diff --git a/scripts/status-board.sh b/scripts/status-board.sh index 1da535b..f207a1d 100755 --- a/scripts/status-board.sh +++ b/scripts/status-board.sh @@ -29,6 +29,11 @@ set -uo pipefail : "${GADFLY_STATUS_DIR:?GADFLY_STATUS_DIR required}" POLL="${GADFLY_STATUS_POLL_SECS:-12}" +# Guard against a non-numeric poll interval: with `set -uo pipefail` (no set -e) +# a bad `sleep "$POLL"` would fail silently and the `while :` loop would spin, +# hammering the Gitea API. Coerce anything non-integer (or <1) back to 12. +case "$POLL" in ''|*[!0-9]*) POLL=12 ;; esac +[ "$POLL" -ge 1 ] 2>/dev/null || POLL=12 DONE_FILE="${GADFLY_STATUS_DIR}/.done" MARKER="" API_TIMEOUT="--connect-timeout 20 --max-time 30" -- 2.52.0