diff --git a/.gitea/workflows/review-reusable.yml b/.gitea/workflows/review-reusable.yml
index b742aed..46ff3c1 100644
--- a/.gitea/workflows/review-reusable.yml
+++ b/.gitea/workflows/review-reusable.yml
@@ -60,6 +60,7 @@ on:
worker_model: { type: string, default: "" } # GADFLY_WORKER_MODEL
allowed_users: { type: string, default: "" } # GADFLY_ALLOWED_USERS (consumer-specific; set in your stub)
trigger_phrase: { type: string, default: "" } # GADFLY_TRIGGER_PHRASE
+ consolidate: { type: string, default: "" } # GADFLY_CONSOLIDATE β "" => auto (one consensus comment for >=2 models); "0" => one comment per model
# Job wall-clock cap. 90 as a default: the 5-lens suite across a slow lane
# (claude-code with extended thinking) over two passes can run long.
timeout_minutes: { type: number, default: 90 }
@@ -143,3 +144,4 @@ jobs:
GADFLY_WORKER_MODEL: ${{ inputs.worker_model }}
GADFLY_ALLOWED_USERS: ${{ inputs.allowed_users }}
GADFLY_TRIGGER_PHRASE: ${{ inputs.trigger_phrase }}
+ GADFLY_CONSOLIDATE: ${{ inputs.consolidate }}
diff --git a/README.md b/README.md
index e27388b..59ba535 100644
--- a/README.md
+++ b/README.md
@@ -277,6 +277,35 @@ every `GADFLY_STATUS_POLL_SECS` (default 12s) until the swarm finishes. It's adv
best-effort β the per-model findings comments are unaffected β and entirely separate from those.
Turn it off with `GADFLY_STATUS_BOARD=0`.
+### Consensus consolidation
+
+With **two or more models**, posting one comment each means a reader faces N walls of prose that
+mostly agree. Instead Gadfly consolidates: every model writes its findings to a shared file, and
+after the whole swarm finishes a single pass clusters those findings by location, counts **how
+many models independently flagged each one**, and posts **one consensus comment**:
+
+```
+## πͺ° Gadfly review β consensus across 7 models
+
+**Verdict: Blocking issues found** Β· 9 findings (3 with multi-model agreement)
+
+| | Finding | Where | Models | Lens |
+|--|--|--|--|--|
+| π΄ | Auth bypass: token not verified | `auth/login.go:42` | 6/7 | security |
+| π | Unbounded retry loop | `sync/worker.go:88` | 3/7 | error-handling |
+
+4 single-model findings (lower confidence)
β¦
+Per-model detail
β¦ each model's full review, folded β¦
+```
+
+Cross-model agreement is the strongest real-vs-false-positive signal available, so findings are
+ranked by it (a lone low-severity finding folds away; a lone *critical* still surfaces). The
+per-model comments are suppressed in this mode β each model's full review is preserved, folded,
+inside the consensus comment β and nothing is lost: if consolidation can't run, Gadfly falls
+back to posting the per-model comments. Controlled by `GADFLY_CONSOLIDATE`: `auto` (default β on
+for β₯2 models), `1` (force on), `0` (force off, one comment per model). Single-model runs are
+unaffected.
+
### Triggers
1. A **new/reopened/ready** non-draft PR β automatic.
@@ -362,6 +391,7 @@ The reviewer binary reads these (the stub/entrypoint set sane defaults):
| `GADFLY_MAX_DIFF_CHARS` | 60000 | diff chars embedded in the prompt (full diff via `get_diff`) |
| `GADFLY_STATUS_BOARD` | on | set `0` to disable the live status-board comment |
| `GADFLY_STATUS_POLL_SECS` | 12 | how often the status board re-renders/upserts |
+| `GADFLY_CONSOLIDATE` | `auto` | cross-model consensus comment: `auto` (on for β₯2 models), `1` (force on), `0` (off β one comment per model) |
| `GADFLY_TRIGGER_PHRASE` | `@gadfly review` | comment phrase that re-triggers |
| `GADFLY_ALLOWED_USERS` | *(collaborators)* | comma-separated allow-list for comment triggers |
| `GADFLY_FINDINGS_URL` | β | gadfly-reports store base URL; set to enable findings telemetry (off when empty) |
diff --git a/cmd/gadfly/consensus.go b/cmd/gadfly/consensus.go
new file mode 100644
index 0000000..be04092
--- /dev/null
+++ b/cmd/gadfly/consensus.go
@@ -0,0 +1,456 @@
+package main
+
+// Cross-model consensus consolidation. The swarm runs each model independently
+// (entrypoint.sh fans them out across provider lanes); historically each model
+// posted its OWN comment, so a reader faced N walls of prose that mostly agreed.
+//
+// Instead, every model writes its findings to a shared directory
+// (GADFLY_FINDINGS_OUT, one JSON file per model), and after the whole swarm
+// finishes a single consolidation pass (GADFLY_CONSOLIDATE_DIR) clusters those
+// findings by location, counts how many models independently flagged each one,
+// and renders ONE comment: an agreement-ranked table up top (cross-model
+// agreement is the strongest real-vs-false-positive signal we have), with each
+// model's full review folded below for drill-down.
+//
+// This file owns: the per-model artifact (modelFindings), writing it
+// (writeFindingsOut), reading the directory back, clustering, and rendering the
+// consensus markdown (renderConsensus). It depends only on the stdlib.
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "os"
+ "path/filepath"
+ "sort"
+ "strings"
+)
+
+// consensusMarker tags the single consolidated comment so entrypoint.sh can
+// upsert it in place across re-runs (mirrors run.sh's per-model marker).
+const consensusMarker = ""
+
+// modelFindings is the per-model artifact written to GADFLY_FINDINGS_OUT. It
+// carries enough to rebuild the consolidated comment without re-running models:
+// the structured findings (for clustering) plus the full rendered review (for
+// the folded per-model drill-down).
+type modelFindings struct {
+ Model string `json:"model"`
+ Provider string `json:"provider"`
+ Verdict string `json:"verdict"` // worst lens verdict, as a label
+ Errored bool `json:"errored"` // the model produced no usable review (every lens failed, or the run crashed)
+ Markdown string `json:"markdown"` // full rendered per-model review (findings block already stripped)
+ Findings []outFinding `json:"findings"`
+}
+
+// outFinding is one finding in the per-model artifact (a flattened `finding`
+// plus its lens).
+type outFinding struct {
+ Lens string `json:"lens"`
+ File string `json:"file"`
+ Line int `json:"line"`
+ Severity string `json:"severity"`
+ Confidence string `json:"confidence"`
+ Title string `json:"title"`
+ Detail string `json:"detail"`
+}
+
+// collectFindings returns a lens result's findings with severity always filled
+// in: per-finding when the structured block supplied it, else derived from the
+// lens verdict (so heuristic-scraped findings still carry a canonical word). A
+// clean or errored lens yields nothing. Shared by the telemetry emit and the
+// per-model findings file so both agree on what a finding is.
+func collectFindings(r specialistResult) []finding {
+ if r.errored || r.verdict == verdictClean {
+ return nil
+ }
+ fs := extractStructuredFindingsOrScrape(r)
+ lensSev := r.verdict.severity()
+ for i := range fs {
+ if fs[i].severity == "" {
+ fs[i].severity = lensSev
+ }
+ }
+ return fs
+}
+
+// writeFindingsOut writes this model's findings + rendered review to
+// GADFLY_FINDINGS_OUT for the later consolidation pass. No-op unless the env is
+// set. Best-effort: any error is logged to stderr and never affects the review
+// (it runs after the markdown is already on stdout).
+func writeFindingsOut(results []specialistResult) {
+ path := strings.TrimSpace(os.Getenv("GADFLY_FINDINGS_OUT"))
+ if path == "" {
+ return
+ }
+ mf := modelFindings{
+ Model: strings.TrimSpace(os.Getenv("GADFLY_MODEL")),
+ Provider: modelProvider(),
+ Verdict: worstVerdict(results).label(),
+ Errored: allErrored(results),
+ Markdown: renderConsolidated(results),
+ }
+ for _, r := range results {
+ for _, f := range collectFindings(r) {
+ mf.Findings = append(mf.Findings, outFinding{
+ Lens: r.spec.Name,
+ File: f.file,
+ Line: f.line,
+ Severity: f.severity,
+ Confidence: f.confidence,
+ Title: f.title,
+ Detail: f.detail,
+ })
+ }
+ }
+ data, err := json.Marshal(mf)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err)
+ return
+ }
+ // Defensive: make sure the parent dir exists (entrypoint creates it, but a
+ // missing dir would otherwise silently drop this model from the consensus).
+ if dir := filepath.Dir(path); dir != "" {
+ _ = os.MkdirAll(dir, 0o755)
+ }
+ if err := os.WriteFile(path, data, 0o644); err != nil {
+ fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err)
+ }
+}
+
+// allErrored reports whether every lens of a review failed (so the model
+// produced no usable findings). Such a model is recorded but excluded from the
+// consensus agreement denominator β counting it would dilute every ratio with a
+// model that never actually reviewed.
+func allErrored(results []specialistResult) bool {
+ if len(results) == 0 {
+ return true
+ }
+ for _, r := range results {
+ if !r.errored {
+ return false
+ }
+ }
+ return true
+}
+
+// runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set):
+// read every per-model artifact in the directory, render the consensus comment
+// to stdout. Errors are fatal to THIS process only β entrypoint.sh treats a
+// failed consolidation as advisory and falls back to per-model comments.
+func runConsolidate() error {
+ dir := strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR"))
+ if dir == "" {
+ return errors.New("GADFLY_CONSOLIDATE_DIR is empty")
+ }
+ entries, err := os.ReadDir(dir)
+ if err != nil {
+ return fmt.Errorf("read consolidate dir: %w", err)
+ }
+ var models []modelFindings
+ for _, e := range entries {
+ if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
+ continue
+ }
+ data, err := os.ReadFile(filepath.Join(dir, e.Name()))
+ if err != nil {
+ fmt.Fprintln(os.Stderr, "gadfly: read", e.Name(), err)
+ continue
+ }
+ var mf modelFindings
+ if err := json.Unmarshal(data, &mf); err != nil {
+ fmt.Fprintln(os.Stderr, "gadfly: parse", e.Name(), err)
+ continue
+ }
+ if strings.TrimSpace(mf.Model) == "" {
+ continue
+ }
+ models = append(models, mf)
+ }
+ if len(models) == 0 {
+ return errors.New("no model findings to consolidate")
+ }
+ // Lead with the marker so entrypoint.sh can upsert this comment in place
+ // (same pattern as run.sh's per-model marker); it appends the advisory footer.
+ fmt.Println(consensusMarker)
+ fmt.Println(renderConsensus(models))
+ return nil
+}
+
+// cluster is a group of findings (across models) judged to be the same issue:
+// same file, lines within lineTolerance of the cluster's current span. The span
+// [line,maxLine] slides as members join, so a chain of nearby findings merges
+// instead of splitting once it drifts past the first line.
+type cluster struct {
+ file string
+ line int // representative (smallest) line
+ maxLine int // largest line in the cluster β the span's upper edge
+ severity string
+ title string
+ models map[string]bool
+ lenses map[string]bool
+}
+
+// findingRef is one model's finding (carrying which model reported it), used
+// while grouping findings into clusters.
+type findingRef struct {
+ f outFinding
+ model string
+}
+
+// lineTolerance: a finding in the same file within this many lines of a
+// cluster's current span is treated as the same issue (models often cite a line
+// or two apart).
+const lineTolerance = 3
+
+// renderConsensus builds the single consolidated comment body from every model's
+// findings. It does NOT emit the marker or advisory footer β entrypoint.sh wraps
+// it (mirroring run.sh's per-model framing).
+func renderConsensus(models []modelFindings) string {
+ // effective = models that actually produced a review. Errored models are
+ // shown (folded, below) but excluded from the agreement denominator so a
+ // failed model doesn't dilute every ratio.
+ effective := 0
+ for _, m := range models {
+ if !m.Errored {
+ effective++
+ }
+ }
+ errored := len(models) - effective
+ clusters := clusterFindings(models)
+
+ worst := verdictClean
+ for _, m := range models {
+ if v := parseVerdict(m.Verdict); v > worst {
+ worst = v
+ }
+ }
+
+ // Partition in one pass: "headline" findings (multi-model agreement, OR a
+ // lone CRITICAL) vs folded "single-model" lower-confidence findings. Also
+ // count multi-model agreements for the summary line.
+ var headline, folded []cluster
+ agreed := 0
+ for _, c := range clusters {
+ if len(c.models) >= 2 {
+ agreed++
+ }
+ if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") {
+ headline = append(headline, c)
+ } else {
+ folded = append(folded, c)
+ }
+ }
+
+ var b strings.Builder
+ fmt.Fprintf(&b, "## πͺ° Gadfly review β consensus across %d model%s", effective, plural(effective))
+ if errored > 0 {
+ fmt.Fprintf(&b, " (%d failed)", errored)
+ }
+ b.WriteString("\n\n")
+ fmt.Fprintf(&b, "**Verdict: %s** Β· %d finding%s (%d with multi-model agreement)\n",
+ worst.label(), len(clusters), plural(len(clusters)), agreed)
+
+ if len(headline) > 0 {
+ b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n")
+ for _, c := range headline {
+ fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n",
+ sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
+ len(c.models), effective, mdCell(lensList(c.lenses)))
+ }
+ } else if len(clusters) == 0 {
+ b.WriteString("\nNo material issues found by consensus.\n")
+ }
+ // else: only single-model findings β they're shown folded below, so don't
+ // claim "no material issues" (there are some, just none with consensus).
+
+ if len(folded) > 0 {
+ fmt.Fprintf(&b, "\n%d single-model finding%s (lower confidence)
\n\n",
+ len(folded), plural(len(folded)))
+ b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n")
+ for _, c := range folded {
+ fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n",
+ sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
+ mdCell(oneModel(c.models)), mdCell(lensList(c.lenses)))
+ }
+ b.WriteString("\n \n")
+ }
+
+ // Per-model full reviews, folded for drill-down (nothing is lost).
+ b.WriteString("\nPer-model detail
\n")
+ for _, m := range models {
+ body := strings.TrimSpace(m.Markdown)
+ if body == "" {
+ body = "_(no output)_"
+ }
+ verdict := m.Verdict
+ if m.Errored {
+ verdict = "β οΈ reviewer failed"
+ }
+ fmt.Fprintf(&b, "\n%s (%s) β %s
\n\n%s\n\n \n",
+ mdCell(m.Model), mdCell(m.Provider), verdict, body)
+ }
+ b.WriteString("\n ")
+ return b.String()
+}
+
+// clusterFindings groups every model's findings into cross-model clusters,
+// sorted by agreement (desc), then severity (desc), then location.
+func clusterFindings(models []modelFindings) []cluster {
+ // Group by file, then greedily merge by line proximity.
+ byFile := map[string][]findingRef{}
+ for _, m := range models {
+ for _, f := range m.Findings {
+ if strings.TrimSpace(f.File) == "" {
+ continue
+ }
+ byFile[f.File] = append(byFile[f.File], findingRef{f, m.Model})
+ }
+ }
+
+ var clusters []cluster
+ for file, items := range byFile {
+ sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line })
+ // Cluster within THIS file only (clusters never span files), so the inner
+ // scan is over same-file clusters, not every cluster seen so far.
+ var fileClusters []cluster
+ for _, it := range items {
+ placed := false
+ for ci := range fileClusters {
+ c := &fileClusters[ci]
+ // Join if the line falls within the cluster's span, widened by the
+ // tolerance on both edges β so the window slides as the span grows.
+ if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance {
+ mergeIntoCluster(c, it.f, it.model)
+ placed = true
+ break
+ }
+ }
+ if !placed {
+ c := cluster{
+ file: file,
+ line: it.f.Line,
+ maxLine: it.f.Line,
+ severity: it.f.Severity,
+ title: it.f.Title,
+ models: map[string]bool{},
+ lenses: map[string]bool{},
+ }
+ mergeIntoCluster(&c, it.f, it.model)
+ fileClusters = append(fileClusters, c)
+ }
+ }
+ clusters = append(clusters, fileClusters...)
+ }
+
+ sort.SliceStable(clusters, func(i, j int) bool {
+ if len(clusters[i].models) != len(clusters[j].models) {
+ return len(clusters[i].models) > len(clusters[j].models)
+ }
+ if sevRank(clusters[i].severity) != sevRank(clusters[j].severity) {
+ return sevRank(clusters[i].severity) > sevRank(clusters[j].severity)
+ }
+ if clusters[i].file != clusters[j].file {
+ return clusters[i].file < clusters[j].file
+ }
+ return clusters[i].line < clusters[j].line
+ })
+ return clusters
+}
+
+// mergeIntoCluster folds one finding into a cluster: union the model/lens sets,
+// widen the [line,maxLine] span, and keep the highest-severity report's title.
+func mergeIntoCluster(c *cluster, f outFinding, model string) {
+ if model != "" {
+ c.models[model] = true
+ }
+ if f.Lens != "" {
+ c.lenses[f.Lens] = true
+ }
+ if f.Line > 0 && (c.line == 0 || f.Line < c.line) {
+ c.line = f.Line
+ }
+ if f.Line > c.maxLine {
+ c.maxLine = f.Line
+ }
+ if sevRank(f.Severity) > sevRank(c.severity) {
+ c.severity = f.Severity
+ if strings.TrimSpace(f.Title) != "" {
+ c.title = f.Title
+ }
+ }
+}
+
+// sevRank orders the canonical severity words for sorting/comparison.
+func sevRank(s string) int {
+ switch strings.ToLower(strings.TrimSpace(s)) {
+ case "critical":
+ return 5
+ case "high":
+ return 4
+ case "medium":
+ return 3
+ case "small":
+ return 2
+ case "trivial":
+ return 1
+ default:
+ return 0
+ }
+}
+
+// sevIcon is the at-a-glance severity badge for the consensus table.
+func sevIcon(s string) string {
+ switch strings.ToLower(strings.TrimSpace(s)) {
+ case "critical", "high":
+ return "π΄"
+ case "medium":
+ return "π "
+ case "small":
+ return "π‘"
+ default:
+ return "βͺ"
+ }
+}
+
+func location(file string, line int) string {
+ if line > 0 {
+ return fmt.Sprintf("%s:%d", file, line)
+ }
+ return file
+}
+
+func lensList(lenses map[string]bool) string {
+ out := make([]string, 0, len(lenses))
+ for l := range lenses {
+ out = append(out, l)
+ }
+ sort.Strings(out)
+ return strings.Join(out, ", ")
+}
+
+func oneModel(models map[string]bool) string {
+ for m := range models {
+ return m
+ }
+ return ""
+}
+
+// mdCell makes a string safe for a one-line markdown table cell: collapse
+// newlines, escape pipes (which delimit columns), and neutralize backticks
+// (a stray one would break an inline-code span β a backslash can't escape it
+// inside code, so replace with an apostrophe). Inputs are model-influenced, so
+// this keeps a malformed file path or title from breaking the table.
+func mdCell(s string) string {
+ s = strings.ReplaceAll(s, "\n", " ")
+ s = strings.ReplaceAll(s, "|", "\\|")
+ s = strings.ReplaceAll(s, "`", "'")
+ return strings.TrimSpace(s)
+}
+
+func plural(n int) string {
+ if n == 1 {
+ return ""
+ }
+ return "s"
+}
diff --git a/cmd/gadfly/consensus_test.go b/cmd/gadfly/consensus_test.go
new file mode 100644
index 0000000..595d54b
--- /dev/null
+++ b/cmd/gadfly/consensus_test.go
@@ -0,0 +1,217 @@
+package main
+
+import (
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+func TestClusterFindingsAgreementAndTolerance(t *testing.T) {
+ models := []modelFindings{
+ {Model: "m1", Verdict: "Blocking issues found", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"},
+ {Lens: "perf", File: "b.go", Line: 5, Severity: "trivial", Title: "tiny nit"},
+ }},
+ {Model: "m2", Verdict: "Minor issues", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 11, Severity: "critical", Title: "auth bypass (crit)"}, // within tolerance of a.go:10
+ }},
+ {Model: "m3", Verdict: "Minor issues", Findings: []outFinding{
+ {Lens: "correctness", File: "a.go", Line: 10, Severity: "medium", Title: "auth bypass"},
+ }},
+ }
+ clusters := clusterFindings(models)
+ if len(clusters) != 2 {
+ t.Fatalf("want 2 clusters (a.go:10Β±, b.go:5), got %d: %+v", len(clusters), clusters)
+ }
+ // First cluster (highest agreement) is the a.go auth one: 3 models, severity
+ // escalated to critical, representative line the smallest (10).
+ c := clusters[0]
+ if len(c.models) != 3 {
+ t.Errorf("want 3 models on the top cluster, got %d", len(c.models))
+ }
+ if c.severity != "critical" {
+ t.Errorf("want escalated severity critical, got %q", c.severity)
+ }
+ if c.line != 10 {
+ t.Errorf("want representative line 10, got %d", c.line)
+ }
+ if !c.lenses["security"] || !c.lenses["correctness"] {
+ t.Errorf("want union of lenses, got %v", c.lenses)
+ }
+}
+
+func TestRenderConsensusFoldsSingleModelNits(t *testing.T) {
+ models := []modelFindings{
+ {Model: "m1", Provider: "p", Verdict: "Blocking issues found", Markdown: "m1 detail", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"},
+ {Lens: "perf", File: "b.go", Line: 5, Severity: "trivial", Title: "tiny nit"},
+ }},
+ {Model: "m2", Provider: "p", Verdict: "Minor issues", Markdown: "m2 detail", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"},
+ }},
+ }
+ out := renderConsensus(models)
+ // Headline table: the agreed finding with a 2/2 badge.
+ if !strings.Contains(out, "2/2") {
+ t.Errorf("expected a 2/2 agreement badge in headline:\n%s", out)
+ }
+ if !strings.Contains(out, "auth bypass") || !strings.Contains(out, "a.go:10") {
+ t.Errorf("headline missing the consensus finding:\n%s", out)
+ }
+ // The lone trivial finding is folded, not in the headline table.
+ if !strings.Contains(out, "single-model finding") {
+ t.Errorf("expected a folded single-model section:\n%s", out)
+ }
+ // Per-model detail is preserved (folded).
+ if !strings.Contains(out, "m1 detail") || !strings.Contains(out, "m2 detail") {
+ t.Errorf("per-model detail not preserved:\n%s", out)
+ }
+}
+
+func TestRenderConsensusHighSeverityLoneFindingStaysHeadline(t *testing.T) {
+ // A single model, single critical finding must still surface in the headline
+ // (not be folded as "low confidence").
+ models := []modelFindings{
+ {Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 1, Severity: "critical", Title: "rce"},
+ }},
+ }
+ out := renderConsensus(models)
+ headline := out
+ if i := strings.Index(out, "single-model finding"); i >= 0 {
+ headline = out[:i]
+ }
+ if !strings.Contains(headline, "rce") {
+ t.Errorf("lone critical should be in the headline, not folded:\n%s", out)
+ }
+}
+
+func TestClusterSlidingWindowMergesChain(t *testing.T) {
+ // Findings at 10, 13, 16 (each 3 apart) from three models must merge into ONE
+ // cluster β the window slides with the span instead of anchoring at line 10.
+ models := []modelFindings{
+ {Model: "m1", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 10, Severity: "medium", Title: "t"}}},
+ {Model: "m2", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 13, Severity: "medium", Title: "t"}}},
+ {Model: "m3", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 16, Severity: "medium", Title: "t"}}},
+ }
+ clusters := clusterFindings(models)
+ if len(clusters) != 1 {
+ t.Fatalf("chain 10/13/16 should merge into 1 cluster, got %d", len(clusters))
+ }
+ if len(clusters[0].models) != 3 {
+ t.Errorf("want 3 models in the merged cluster, got %d", len(clusters[0].models))
+ }
+}
+
+func TestRenderConsensusExcludesErroredFromDenominator(t *testing.T) {
+ models := []modelFindings{
+ {Model: "m1", Verdict: "Minor issues", Markdown: "a", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
+ {Model: "m2", Verdict: "Minor issues", Markdown: "b", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
+ {Model: "broken", Verdict: "reviewer failed", Errored: true, Markdown: "boom"},
+ }
+ out := renderConsensus(models)
+ // Denominator is the 2 effective models, not 3; the failure is noted.
+ if !strings.Contains(out, "2/2") {
+ t.Errorf("errored model must be excluded from the denominator (want 2/2):\n%s", out)
+ }
+ if !strings.Contains(out, "1 failed") {
+ t.Errorf("expected a '1 failed' note:\n%s", out)
+ }
+ if !strings.Contains(out, "reviewer failed") {
+ t.Errorf("errored model should still appear (folded) as failed:\n%s", out)
+ }
+}
+
+func TestRenderConsensusLoneHighFolds(t *testing.T) {
+ // A single-model HIGH (not critical) folds β only consensus or a lone CRITICAL
+ // earns the headline, so a lone Blocking-lens finding doesn't reintroduce noise.
+ models := []modelFindings{
+ {Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 1, Severity: "high", Title: "maybe-bug"}}},
+ }
+ out := renderConsensus(models)
+ head := out
+ if i := strings.Index(out, "single-model finding"); i >= 0 {
+ head = out[:i]
+ }
+ if strings.Contains(head, "maybe-bug") {
+ t.Errorf("a lone HIGH should fold, not headline:\n%s", out)
+ }
+}
+
+func TestWriteAndConsolidateRoundTrip(t *testing.T) {
+ dir := t.TempDir()
+
+ // Two model artifacts on disk.
+ write := func(name string, mf modelFindings) {
+ data, _ := json.Marshal(mf)
+ if err := os.WriteFile(filepath.Join(dir, name), data, 0o644); err != nil {
+ t.Fatal(err)
+ }
+ }
+ write("m1.json", modelFindings{Model: "m1", Provider: "ollama", Verdict: "Minor issues", Markdown: "md1",
+ Findings: []outFinding{{Lens: "security", File: "x.go", Line: 3, Severity: "medium", Title: "leak"}}})
+ write("m2.json", modelFindings{Model: "m2", Provider: "ollama", Verdict: "Minor issues", Markdown: "md2",
+ Findings: []outFinding{{Lens: "security", File: "x.go", Line: 3, Severity: "high", Title: "leak"}}})
+ // A junk file must be skipped, not crash consolidation.
+ if err := os.WriteFile(filepath.Join(dir, "notes.txt"), []byte("ignore me"), 0o644); err != nil {
+ t.Fatal(err)
+ }
+
+ t.Setenv("GADFLY_CONSOLIDATE_DIR", dir)
+ // runConsolidate prints to stdout; capture it.
+ out := captureStdout(t, func() {
+ if err := runConsolidate(); err != nil {
+ t.Fatalf("runConsolidate: %v", err)
+ }
+ })
+ if !strings.HasPrefix(strings.TrimSpace(out), consensusMarker) {
+ t.Errorf("consolidated output must lead with the marker:\n%s", out)
+ }
+ if !strings.Contains(out, "2/2") || !strings.Contains(out, "x.go:3") {
+ t.Errorf("expected the agreed x.go:3 finding at 2/2:\n%s", out)
+ }
+}
+
+func TestRunConsolidateEmptyDirErrors(t *testing.T) {
+ t.Setenv("GADFLY_CONSOLIDATE_DIR", t.TempDir())
+ if err := runConsolidate(); err == nil {
+ t.Error("want an error for an empty consolidate dir (entrypoint falls back)")
+ }
+}
+
+// captureStdout redirects os.Stdout for the duration of fn and returns what was
+// written.
+func captureStdout(t *testing.T, fn func()) string {
+ t.Helper()
+ orig := os.Stdout
+ r, w, err := os.Pipe()
+ if err != nil {
+ t.Fatal(err)
+ }
+ os.Stdout = w
+ defer func() { os.Stdout = orig }() // restore even if fn panics
+ done := make(chan string)
+ go func() {
+ var sb strings.Builder
+ buf := make([]byte, 4096)
+ for {
+ n, err := r.Read(buf)
+ if n > 0 {
+ sb.Write(buf[:n])
+ }
+ if err != nil {
+ break
+ }
+ }
+ r.Close()
+ done <- sb.String()
+ }()
+ fn()
+ w.Close()
+ return <-done
+}
diff --git a/cmd/gadfly/emit.go b/cmd/gadfly/emit.go
index fbac59d..f1d76ba 100644
--- a/cmd/gadfly/emit.go
+++ b/cmd/gadfly/emit.go
@@ -125,32 +125,12 @@ func emit(results []specialistResult, elapsed time.Duration) {
// InputTokens/OutputTokens/CostUSD stay nil -> JSON null (not metered).
}
+ // collectFindings (consensus.go) skips clean/errored lenses and fills in each
+ // finding's severity (per-finding when structured, else derived from the lens
+ // verdict), so every reportPayload carries a canonical raw_severity.
var reports []reportPayload
for _, r := range results {
- if r.errored {
- continue // a failed lens contributes no findings
- }
- // A lens that reports "No material issues found" has nothing to flag β
- // its path:line references are verification notes ("verified X at
- // file:line is safe"), not problems. Extracting them pollutes the
- // findings store with false positives and unfairly penalizes thorough
- // reviewers that do clean passes, so a clean lens emits no findings.
- if r.verdict == verdictClean {
- continue
- }
- // Prefer the model's machine-readable ```gadfly-findings block (exact
- // location + per-finding severity). Fall back to the prose scrape when it
- // yields NOTHING β no block, an unterminated/malformed one, or a block that
- // parsed to zero usable findings on a non-clean lens (e.g. an empty []
- // alongside prose findings). Falling back on empty, not just on absent,
- // keeps the safety net from being defeated by a contradictory empty block.
- fs := extractStructuredFindingsOrScrape(r)
- lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity)
- for _, f := range fs {
- sev := f.severity
- if sev == "" {
- sev = lensSev
- }
+ for _, f := range collectFindings(r) {
reports = append(reports, reportPayload{
Repo: repo,
PR: pr,
@@ -161,7 +141,7 @@ func emit(results []specialistResult, elapsed time.Duration) {
Model: model,
Provider: provider,
RunID: runID,
- RawSeverity: sev,
+ RawSeverity: f.severity,
Confidence: f.confidence,
Detail: f.detail,
})
diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go
index 20f3564..57943a1 100644
--- a/cmd/gadfly/main.go
+++ b/cmd/gadfly/main.go
@@ -123,6 +123,14 @@ func main() {
func run() error {
start := time.Now()
+
+ // Consolidation mode: not a review at all β read the per-model findings the
+ // swarm wrote and print the single cross-model consensus comment. entrypoint.sh
+ // runs this once, after every model has finished.
+ if strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR")) != "" {
+ return runConsolidate()
+ }
+
repoDir := os.Getenv("GADFLY_REPO_DIR")
diffFile := os.Getenv("GADFLY_DIFF_FILE")
systemFile := os.Getenv("GADFLY_SYSTEM_FILE")
@@ -214,6 +222,11 @@ func run() error {
// Optional, best-effort telemetry. OFF unless GADFLY_FINDINGS_URL is set;
// any failure is logged to stderr and never affects stdout or the exit code.
emit(results, time.Since(start))
+
+ // Optional per-model findings artifact for the cross-model consolidation
+ // pass. No-op unless GADFLY_FINDINGS_OUT is set (entrypoint sets it for a
+ // multi-model swarm). Best-effort, never affects stdout or the exit code.
+ writeFindingsOut(results)
return nil
}
diff --git a/entrypoint.sh b/entrypoint.sh
index e15972c..46d4e76 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -45,6 +45,8 @@
# GADFLY_FINDINGS_URL optional gadfly-reports store base URL; set to POST the run +
# findings for model-quality tracking (off when empty)
# GADFLY_FINDINGS_TOKEN optional bearer token for the gadfly-reports store
+# GADFLY_CONSOLIDATE cross-model consensus comment: "auto" (default; on for >=2
+# models), "1" force on, "0" force off (one comment per model)
set -uo pipefail
# One model by default: the specialist suite already provides breadth, so a
@@ -64,6 +66,29 @@ die() { log "ERROR: $*"; exit 1; }
API() { curl -fsS --connect-timeout 20 --max-time 30 -H "Authorization: token ${GITEA_TOKEN}" "$@"; }
+# upsert_comment_body MARKER BODY β create or update (by leading MARKER) a single
+# PR comment. Mirrors run.sh's per-model upsert; used for the consensus comment
+# and the per-model fallback when consolidation is on.
+upsert_comment_body() {
+ local marker="$1" body="$2" post_body existing_id="" page=1 cmts
+ post_body="$(jq -n --arg b "$body" '{body:$b}')"
+ while [ "$page" -le 10 ]; do
+ cmts="$(API "${GITEA_API}/issues/${PR}/comments?limit=50&page=${page}" || echo '[]')"
+ [ "$(echo "$cmts" | jq 'length')" = "0" ] && break
+ existing_id="$(echo "$cmts" | jq -r --arg m "$marker" \
+ '.[] | select(.body != null and (.body | startswith($m))) | .id' | head -n1)"
+ [ -n "$existing_id" ] && break
+ page=$((page+1))
+ done
+ if [ -n "$existing_id" ]; then
+ curl -sS --connect-timeout 20 --max-time 30 -X PATCH -H "Authorization: token ${GITEA_TOKEN}" \
+ -H "Content-Type: application/json" "${GITEA_API}/issues/comments/${existing_id}" -d "$post_body" >/dev/null
+ else
+ curl -sS --connect-timeout 20 --max-time 30 -X POST -H "Authorization: token ${GITEA_TOKEN}" \
+ -H "Content-Type: application/json" "${GITEA_API}/issues/${PR}/comments" -d "$post_body" >/dev/null
+ fi
+}
+
# --- is the commenter allowed to trigger a re-review? ----------------------
actor_allowed() {
local actor="$1"
@@ -176,10 +201,11 @@ provider_cap() { # provider -> concurrency (override map "p=N,...", else default
}
review_one() {
- local sf=""
+ local sf="" ff=""
[ "${GADFLY_STATUS_BOARD:-1}" != "0" ] && sf="$(status_file_for "$1")"
+ [ "$CONSOLIDATE" = "1" ] && ff="$(findings_file_for "$1")"
PROVIDER=ollama MODEL="$1" GADFLY_BIN="/usr/local/bin/gadfly" GADFLY_REPO_DIR="$REPO_DIR" \
- GADFLY_STATUS_FILE="$sf" \
+ GADFLY_STATUS_FILE="$sf" GADFLY_FINDINGS_OUT="$ff" GADFLY_CONSOLIDATE="$CONSOLIDATE" \
bash "${SCRIPTS_DIR}/run.sh" || log "model $1 failed (continuing)"
# If the binary never wrote real status (run.sh skipped it: empty diff, no key,
# binary missing), the pre-seed stays {started:0, done:false} and the board
@@ -196,6 +222,34 @@ IFS=',' read -ra _raw <<< "$MODELS" || true
MODEL_LIST=()
for raw in "${_raw[@]}"; do m="$(echo "$raw" | tr -d '[:space:]')"; [ -n "$m" ] && MODEL_LIST+=("$m"); done
+# --- cross-model consolidation decision ------------------------------------
+# With >=2 models, post ONE consensus comment (findings clustered + ranked by
+# cross-model agreement) instead of N per-model walls of prose. Each model writes
+# its findings to FINDINGS_DIR; a final pass (the binary in GADFLY_CONSOLIDATE_DIR
+# mode) renders the consensus comment. GADFLY_CONSOLIDATE: "auto" (default; on for
+# >=2 models), "1" force on, "0" force off (keep per-model comments).
+FINDINGS_DIR="${WORKDIR}/findings"
+CONSOLIDATE=0
+case "${GADFLY_CONSOLIDATE:-auto}" in
+ 1) CONSOLIDATE=1 ;;
+ 0) CONSOLIDATE=0 ;;
+ *) [ "${#MODEL_LIST[@]}" -ge 2 ] && CONSOLIDATE=1 ;;
+esac
+# A model spec can contain '/' and ':' (e.g. claude-code/opus, qwen3:14b), so
+# sanitize to a flat filename β but append a checksum of the raw spec so two
+# specs that sanitize the same (foo:bar vs foo/bar -> foo_bar) don't collide onto
+# one file and silently drop a model from the consensus.
+findings_file_for() {
+ local safe sum
+ safe="$(echo "$1" | tr -c '[:alnum:]._-' '_')"
+ sum="$(printf '%s' "$1" | cksum | cut -d' ' -f1)"
+ echo "${FINDINGS_DIR}/${safe}-${sum}.json"
+}
+if [ "$CONSOLIDATE" = "1" ]; then
+ rm -rf "$FINDINGS_DIR"; mkdir -p "$FINDINGS_DIR"
+ log "consolidation ON: ${#MODEL_LIST[@]} models -> one consensus comment"
+fi
+
# Distinct providers, in first-seen order (no associative arrays β portable).
PROVIDERS=""
for m in "${MODEL_LIST[@]}"; do
@@ -257,4 +311,34 @@ if [ -n "$BOARD_PID" ]; then
touch "${STATUS_DIR}/.done" 2>/dev/null || true
wait "$BOARD_PID" 2>/dev/null || true
fi
+
+# --- cross-model consensus comment -----------------------------------------
+# Render ONE consensus comment from the per-model findings the swarm wrote. This
+# is advisory and best-effort: if the consolidation pass produces nothing, fall
+# back to posting each model's review as its own comment (the per-model comments
+# were suppressed during the run), so a consolidation hiccup never loses output.
+if [ "$CONSOLIDATE" = "1" ]; then
+ n_files="$(ls -1 "${FINDINGS_DIR}"/*.json 2>/dev/null | wc -l | tr -d '[:space:]')"
+ log "consolidating findings from ${n_files} model(s)"
+ CONSENSUS="$(GADFLY_CONSOLIDATE_DIR="$FINDINGS_DIR" /usr/local/bin/gadfly 2>"${WORKDIR}/consolidate.err" || true)"
+ if [ -n "$CONSENSUS" ]; then
+ BODY="$(printf '%s\n\nAutomated adversarial review by Gadfly β consensus across the model swarm. Advisory only β does not block merge.' "$CONSENSUS")"
+ upsert_comment_body "" "$BODY"
+ log "consensus comment posted"
+ else
+ log "consolidation produced no output; falling back to per-model comments"
+ log "$(tail -c 500 "${WORKDIR}/consolidate.err" 2>/dev/null)"
+ for f in "${FINDINGS_DIR}"/*.json; do
+ [ -f "$f" ] || continue
+ m="$(jq -r '.model // ""' "$f" 2>/dev/null)"
+ [ -z "$m" ] && continue
+ prov="$(jq -r '.provider // ""' "$f" 2>/dev/null)"
+ md="$(jq -r '.markdown // ""' "$f" 2>/dev/null)"
+ marker=""
+ body="$(printf '%s\n### πͺ° Gadfly review β `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β does not block merge.' \
+ "$marker" "$m" "$prov" "$md")"
+ upsert_comment_body "$marker" "$body"
+ done
+ fi
+fi
log "done"
diff --git a/scripts/run.sh b/scripts/run.sh
index 8e9b396..0a5fe49 100644
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -50,6 +50,13 @@ MAX_DIFF_CHARS="${MAX_DIFF_CHARS:-60000}"
MARKER=""
say() { echo "[gadfly-review:${PROVIDER}:${MODEL}] $*" >&2; }
+# When the swarm is consolidating (GADFLY_CONSOLIDATE=1, set by entrypoint.sh for
+# a multi-model run), this model does NOT post its own comment β it writes its
+# findings to GADFLY_FINDINGS_OUT and a single cross-model consensus comment is
+# posted after the whole swarm finishes. Live progress still shows on the status
+# board. Default 0 (post a per-model comment, the standalone behavior).
+CONSOLIDATE="${GADFLY_CONSOLIDATE:-0}"
+
# Display the model's ACTUAL backend: the provider segment of the spec
# ("m1pro/qwen3.6:35b-mlx" -> "m1pro"); a bare id uses GADFLY_PROVIDER (default
# ollama-cloud). This is what the comment header shows, not the run.sh lane.
@@ -126,7 +133,9 @@ USR="$(printf 'PR #%s: %s\n\nDescription:\n%s\n\nUnified diff to review:\n```dif
# --- announce start (placeholder comment) -----------------------------------
START_TS="$(date +%s)"
say "starting review with ${MODEL}"
-upsert_comment "$(printf '%s\n### πͺ° Gadfly review β `%s` (%s)\n\nβ³ Reviewingβ¦ this comment will update with findings and run time.' \
+# Skip the per-model placeholder when consolidating (the consensus comment is
+# posted later; live progress is on the status board).
+[ "$CONSOLIDATE" = "1" ] || upsert_comment "$(printf '%s\n### πͺ° Gadfly review β `%s` (%s)\n\nβ³ Reviewingβ¦ this comment will update with findings and run time.' \
"$MARKER" "$MODEL" "$MODEL_PROVIDER")"
# --- call the model ---------------------------------------------------------
@@ -170,6 +179,7 @@ case "$PROVIDER" in
GADFLY_BODY="$BODY" \
GADFLY_MAX_DIFF_CHARS="$MAX_DIFF_CHARS" \
GADFLY_STATUS_FILE="${GADFLY_STATUS_FILE:-}" \
+ GADFLY_FINDINGS_OUT="${GADFLY_FINDINGS_OUT:-}" \
"$BIN" 2>"$ERR_FILE"
)"
rc=$?
@@ -204,7 +214,24 @@ esac
# --- assemble + post final comment (with run time) --------------------------
ELAPSED="$(( $(date +%s) - START_TS ))"
DUR="$(fmt_duration "$ELAPSED")"
-COMMENT="$(printf '%s\n### πͺ° Gadfly review β `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β does not block merge. Β· β±οΈ reviewed in %s' \
- "$MARKER" "$MODEL" "$MODEL_PROVIDER" "$REVIEW" "$DUR")"
-upsert_comment "$COMMENT"
-say "done in ${DUR}"
+
+# Consolidating: the binary writes its findings file on success. If it failed or
+# was skipped (no file, or an empty one), write a stub so this model still shows
+# up in the consensus (as failed) and an all-models-fail run still posts a
+# comment β never silently drop a model or the whole review.
+if [ "$CONSOLIDATE" = "1" ] && [ -n "${GADFLY_FINDINGS_OUT:-}" ] && [ ! -s "${GADFLY_FINDINGS_OUT}" ]; then
+ jq -n --arg model "$MODEL" --arg provider "$MODEL_PROVIDER" --arg md "$REVIEW" \
+ '{model:$model, provider:$provider, verdict:"reviewer failed", errored:true, markdown:$md, findings:[]}' \
+ > "${GADFLY_FINDINGS_OUT}" 2>/dev/null || true
+fi
+# When consolidating, the binary has written this model's findings to
+# GADFLY_FINDINGS_OUT; the consensus comment is posted by entrypoint.sh after the
+# whole swarm finishes, so this model posts no comment of its own.
+if [ "$CONSOLIDATE" = "1" ]; then
+ say "done in ${DUR} (consolidated; no per-model comment)"
+else
+ COMMENT="$(printf '%s\n### πͺ° Gadfly review β `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β does not block merge. Β· β±οΈ reviewed in %s' \
+ "$MARKER" "$MODEL" "$MODEL_PROVIDER" "$REVIEW" "$DUR")"
+ upsert_comment "$COMMENT"
+ say "done in ${DUR}"
+fi