feat: cross-model consensus consolidation (one ranked comment, not N walls) (#17)

Co-authored-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com> Co-committed-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
2026-06-28 22:56:15 +00:00
parent 84b891b1ba
commit 88f74aa768
8 changed files with 841 additions and 32 deletions
@@ -0,0 +1,456 @@
+package main
+
+// Cross-model consensus consolidation. The swarm runs each model independently
+// (entrypoint.sh fans them out across provider lanes); historically each model
+// posted its OWN comment, so a reader faced N walls of prose that mostly agreed.
+//
+// Instead, every model writes its findings to a shared directory
+// (GADFLY_FINDINGS_OUT, one JSON file per model), and after the whole swarm
+// finishes a single consolidation pass (GADFLY_CONSOLIDATE_DIR) clusters those
+// findings by location, counts how many models independently flagged each one,
+// and renders ONE comment: an agreement-ranked table up top (cross-model
+// agreement is the strongest real-vs-false-positive signal we have), with each
+// model's full review folded below for drill-down.
+//
+// This file owns: the per-model artifact (modelFindings), writing it
+// (writeFindingsOut), reading the directory back, clustering, and rendering the
+// consensus markdown (renderConsensus). It depends only on the stdlib.
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// consensusMarker is printed as the first line of the consolidated output so
+// entrypoint.sh can upsert that comment in place across re-runs (mirrors run.sh's per-model marker).
+const consensusMarker = "<!-- gadfly-consensus -->"
+
+// modelFindings is the per-model artifact written to GADFLY_FINDINGS_OUT. It
+// carries enough to rebuild the consolidated comment without re-running models:
+// the structured findings (for clustering) plus the full rendered review (for
+// the folded per-model drill-down).
+type modelFindings struct {
+	Model    string       `json:"model"`    // GADFLY_MODEL at write time; artifacts with a blank model are skipped on read
+	Provider string       `json:"provider"` // result of modelProvider() at write time
+	Verdict  string       `json:"verdict"`  // worst lens verdict, as a label
+	Errored  bool         `json:"errored"`  // the model produced no usable review (every lens failed, or the run crashed)
+	Markdown string       `json:"markdown"` // full rendered per-model review (findings block already stripped)
+	Findings []outFinding `json:"findings"` // findings from every lens that was neither clean nor errored
+}
+
+// outFinding is one finding in the per-model artifact (a flattened `finding`
+// plus the name of the lens that reported it).
+type outFinding struct {
+	Lens       string `json:"lens"`       // name of the lens that reported the finding
+	File       string `json:"file"`       // flagged file; findings with a blank file are not clustered
+	Line       int    `json:"line"`       // flagged line; values <= 0 render as the bare file path
+	Severity   string `json:"severity"`   // severity word ranked by sevRank (critical/high/medium/small/trivial)
+	Confidence string `json:"confidence"` // copied from the lens finding; not read by the clustering or rendering in this file
+	Title      string `json:"title"`      // short summary shown in the consensus tables
+	Detail     string `json:"detail"`     // copied from the lens finding; not rendered by the consensus tables
+}
+
+// collectFindings extracts the findings of one lens result and guarantees each
+// carries a severity: the finding's own when present, otherwise the severity
+// derived from the lens verdict. Errored and clean lenses contribute nothing.
+// The telemetry emit and the per-model findings file both go through here so
+// they agree on what counts as a finding.
+func collectFindings(r specialistResult) []finding {
+	switch {
+	case r.errored, r.verdict == verdictClean:
+		return nil
+	}
+	found := extractStructuredFindingsOrScrape(r)
+	fallback := r.verdict.severity()
+	for idx := range found {
+		entry := &found[idx]
+		if entry.severity != "" {
+			continue
+		}
+		// No per-finding severity: inherit the lens-level one.
+		entry.severity = fallback
+	}
+	return found
+}
+
+// writeFindingsOut writes this model's findings + rendered review as JSON to the
+// path named by GADFLY_FINDINGS_OUT, for the later consolidation pass. It is a
+// no-op when that variable is blank. Best-effort: every failure (marshal,
+// directory creation, write) is logged to stderr and never affects the review
+// (it runs after the markdown is already on stdout).
+func writeFindingsOut(results []specialistResult) {
+	path := strings.TrimSpace(os.Getenv("GADFLY_FINDINGS_OUT"))
+	if path == "" {
+		return
+	}
+	mf := modelFindings{
+		Model:    strings.TrimSpace(os.Getenv("GADFLY_MODEL")),
+		Provider: modelProvider(),
+		Verdict:  worstVerdict(results).label(),
+		Errored:  allErrored(results),
+		Markdown: renderConsolidated(results),
+	}
+	// Flatten every lens's findings, tagging each with the lens that produced it.
+	for _, r := range results {
+		for _, f := range collectFindings(r) {
+			mf.Findings = append(mf.Findings, outFinding{
+				Lens:       r.spec.Name,
+				File:       f.file,
+				Line:       f.line,
+				Severity:   f.severity,
+				Confidence: f.confidence,
+				Title:      f.title,
+				Detail:     f.detail,
+			})
+		}
+	}
+	data, err := json.Marshal(mf)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err)
+		return
+	}
+	// Defensive: make sure the parent dir exists (entrypoint creates it, but a
+	// missing dir would otherwise silently drop this model from the consensus).
+	// filepath.Dir never returns "" (it yields "." for a bare file name), so no
+	// emptiness check is needed. A failure here is logged rather than swallowed;
+	// the write below is still attempted and reports its own outcome.
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		fmt.Fprintln(os.Stderr, "gadfly: create findings out dir:", err)
+	}
+	if err := os.WriteFile(path, data, 0o644); err != nil {
+		fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err)
+	}
+}
+
+// allErrored is true when no lens in results succeeded, which includes the case
+// of an empty results slice. A model in that state is still recorded, but it is
+// left out of the consensus agreement denominator: it never actually reviewed,
+// so counting it would dilute every ratio.
+func allErrored(results []specialistResult) bool {
+	for i := range results {
+		if !results[i].errored {
+			// One working lens is enough to count as a real review.
+			return false
+		}
+	}
+	return true
+}
+
+// runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set):
+// it reads every *.json artifact in that directory and prints the marker plus the
+// consensus comment to stdout. It returns an error when the variable is blank, the
+// directory is unreadable, or no usable artifact remains; entrypoint.sh treats that as advisory and falls back to per-model comments.
+func runConsolidate() error {
+	dir := strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR"))
+	if dir == "" {
+		return errors.New("GADFLY_CONSOLIDATE_DIR is empty")
+	}
+	entries, err := os.ReadDir(dir) // os.ReadDir returns entries sorted by filename
+	if err != nil {
+		return fmt.Errorf("read consolidate dir: %w", err)
+	}
+	var models []modelFindings
+	for _, e := range entries {
+		if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
+			continue // only regular *.json entries are treated as artifacts
+		}
+		data, err := os.ReadFile(filepath.Join(dir, e.Name()))
+		if err != nil {
+			fmt.Fprintln(os.Stderr, "gadfly: read", e.Name(), err)
+			continue // one unreadable artifact must not sink the whole pass
+		}
+		var mf modelFindings
+		if err := json.Unmarshal(data, &mf); err != nil {
+			fmt.Fprintln(os.Stderr, "gadfly: parse", e.Name(), err)
+			continue // malformed JSON: logged above, then skipped
+		}
+		if strings.TrimSpace(mf.Model) == "" {
+			continue // no model name: skipped without a log line
+		}
+		models = append(models, mf)
+	}
+	if len(models) == 0 {
+		return errors.New("no model findings to consolidate")
+	}
+	// Lead with the marker so entrypoint.sh can upsert this comment in place
+	// (same pattern as run.sh's per-model marker); it appends the advisory footer.
+	fmt.Println(consensusMarker)
+	fmt.Println(renderConsensus(models))
+	return nil
+}
+
+// cluster is a group of findings (across models) judged to be the same issue:
+// same file, lines within lineTolerance of the cluster's current span. The span
+// [line,maxLine] widens as members join, so a chain of nearby findings merges
+// into one cluster instead of splitting once it drifts past the first line.
+type cluster struct {
+	file     string          // file shared by every member finding
+	line     int             // representative (smallest) line
+	maxLine  int             // largest line in the cluster — the span's upper edge
+	severity string          // highest severity among members, as ranked by sevRank
+	title    string          // first member's title, replaced when a higher-severity member has a non-blank title
+	models   map[string]bool // set of models that reported a member finding
+	lenses   map[string]bool // set of lenses that reported a member finding
+}
+
+// findingRef pairs one finding with the name of the model that reported it; it
+// is the unit clusterFindings groups by file and sorts by line.
+type findingRef struct {
+	f     outFinding // the finding as read from the model's artifact
+	model string     // Model field of the artifact the finding came from
+}
+
+// lineTolerance is how far (in lines) a same-file finding may sit outside a
+// cluster's current [line,maxLine] span and still be merged into it; models
+// often cite the same issue a line or two apart.
+const lineTolerance = 3
+
+// renderConsensus builds the consolidated comment body: a summary line, the headline
+// table, a folded table of single-model findings, and each model's full review folded last.
+// It does NOT emit the marker or advisory footer — entrypoint.sh wraps it (mirroring run.sh's per-model framing).
+func renderConsensus(models []modelFindings) string {
+	// effective = models that actually produced a review. Errored models are
+	// shown (folded, below) but excluded from the agreement denominator so a
+	// failed model doesn't dilute every ratio.
+	effective := 0
+	for _, m := range models {
+		if !m.Errored {
+			effective++
+		}
+	}
+	errored := len(models) - effective // artifacts flagged Errored
+	clusters := clusterFindings(models)
+
+	worst := verdictClean
+	for _, m := range models { // every artifact is considered here, errored ones included
+		if v := parseVerdict(m.Verdict); v > worst {
+			worst = v
+		}
+	}
+
+	// Partition in one pass: "headline" findings (multi-model agreement, OR a
+	// lone CRITICAL) vs folded "single-model" lower-confidence findings. Also
+	// count multi-model agreements for the summary line.
+	var headline, folded []cluster
+	agreed := 0
+	for _, c := range clusters {
+		if len(c.models) >= 2 {
+			agreed++
+		}
+		if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") {
+			headline = append(headline, c)
+		} else {
+			folded = append(folded, c)
+		}
+	}
+
+	var b strings.Builder
+	fmt.Fprintf(&b, "## 🪰 Gadfly review — consensus across %d model%s", effective, plural(effective))
+	if errored > 0 {
+		fmt.Fprintf(&b, " (%d failed)", errored)
+	}
+	b.WriteString("\n\n")
+	fmt.Fprintf(&b, "**Verdict: %s** · %d finding%s (%d with multi-model agreement)\n",
+		worst.label(), len(clusters), plural(len(clusters)), agreed)
+
+	if len(headline) > 0 {
+		b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n")
+		for _, c := range headline {
+			fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n",
+				sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
+				len(c.models), effective, mdCell(lensList(c.lenses)))
+		}
+	} else if len(clusters) == 0 {
+		b.WriteString("\nNo material issues found by consensus.\n")
+	}
+	// else: only single-model findings — they're shown folded below, so don't
+	// claim "no material issues" (there are some, just none with consensus).
+
+	if len(folded) > 0 {
+		fmt.Fprintf(&b, "\n<details><summary>%d single-model finding%s (lower confidence)</summary>\n\n",
+			len(folded), plural(len(folded)))
+		b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n")
+		for _, c := range folded {
+			fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n",
+				sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
+				mdCell(oneModel(c.models)), mdCell(lensList(c.lenses)))
+		}
+		b.WriteString("\n</details>\n")
+	}
+
+	// Per-model full reviews, each in its own nested <details>, in input order (nothing is lost).
+	b.WriteString("\n<details><summary>Per-model detail</summary>\n")
+	for _, m := range models {
+		body := strings.TrimSpace(m.Markdown)
+		if body == "" {
+			body = "_(no output)_"
+		}
+		verdict := m.Verdict
+		if m.Errored {
+			verdict = "⚠️ reviewer failed" // replaces whatever verdict label the artifact carried
+		}
+		fmt.Fprintf(&b, "\n<details><summary><b>%s</b> (%s) — %s</summary>\n\n%s\n\n</details>\n",
+			mdCell(m.Model), mdCell(m.Provider), verdict, body)
+	}
+	b.WriteString("\n</details>")
+	return b.String()
+}
+
+// clusterFindings groups every model's findings into cross-model clusters,
+// sorted by agreement (desc), then severity (desc), then file and line.
+//
+// Findings are bucketed by file name with surrounding whitespace trimmed, so
+// "a.go" and " a.go" from different models land in the same bucket; findings
+// whose file is blank are dropped. Within a file, findings are visited in
+// ascending line order and greedily merged into the first cluster whose span,
+// widened by lineTolerance, contains the line.
+func clusterFindings(models []modelFindings) []cluster {
+	// Group by file, then greedily merge by line proximity.
+	byFile := map[string][]findingRef{}
+	for _, m := range models {
+		for _, f := range m.Findings {
+			// Key on the trimmed name: the blank check and the grouping key
+			// must agree, or stray whitespace splits one file into two buckets.
+			file := strings.TrimSpace(f.File)
+			if file == "" {
+				continue
+			}
+			byFile[file] = append(byFile[file], findingRef{f, m.Model})
+		}
+	}
+
+	var clusters []cluster
+	for file, items := range byFile {
+		sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line })
+		// Cluster within THIS file only (clusters never span files), so the inner
+		// scan is over same-file clusters, not every cluster seen so far.
+		var fileClusters []cluster
+		for _, it := range items {
+			placed := false
+			for ci := range fileClusters {
+				c := &fileClusters[ci]
+				// Join if the line falls within the cluster's span, widened by the
+				// tolerance on both edges — so the window slides as the span grows.
+				if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance {
+					mergeIntoCluster(c, it.f, it.model)
+					placed = true
+					break
+				}
+			}
+			if !placed {
+				c := cluster{
+					file:     file,
+					line:     it.f.Line,
+					maxLine:  it.f.Line,
+					severity: it.f.Severity,
+					title:    it.f.Title,
+					models:   map[string]bool{},
+					lenses:   map[string]bool{},
+				}
+				mergeIntoCluster(&c, it.f, it.model)
+				fileClusters = append(fileClusters, c)
+			}
+		}
+		clusters = append(clusters, fileClusters...)
+	}
+
+	// Map iteration above is unordered; this sort makes the output order
+	// independent of it.
+	sort.SliceStable(clusters, func(i, j int) bool {
+		if len(clusters[i].models) != len(clusters[j].models) {
+			return len(clusters[i].models) > len(clusters[j].models)
+		}
+		if sevRank(clusters[i].severity) != sevRank(clusters[j].severity) {
+			return sevRank(clusters[i].severity) > sevRank(clusters[j].severity)
+		}
+		if clusters[i].file != clusters[j].file {
+			return clusters[i].file < clusters[j].file
+		}
+		return clusters[i].line < clusters[j].line
+	})
+	return clusters
+}
+
+// mergeIntoCluster absorbs one finding into c. It records the reporting model
+// and lens (blank names are ignored), stretches the [line,maxLine] span to
+// cover the finding, and, when the finding outranks the cluster's severity,
+// adopts that severity together with the finding's title if it has one.
+func mergeIntoCluster(c *cluster, f outFinding, model string) {
+	if model != "" {
+		c.models[model] = true
+	}
+	if f.Lens != "" {
+		c.lenses[f.Lens] = true
+	}
+
+	// Lower edge: only a positive line may become the representative line.
+	if f.Line > 0 {
+		if c.line == 0 || c.line > f.Line {
+			c.line = f.Line
+		}
+	}
+	// Upper edge.
+	if c.maxLine < f.Line {
+		c.maxLine = f.Line
+	}
+
+	if sevRank(f.Severity) <= sevRank(c.severity) {
+		return
+	}
+	c.severity = f.Severity
+	if strings.TrimSpace(f.Title) != "" {
+		c.title = f.Title
+	}
+}
+
+// sevRank maps a severity word to a number for sorting/comparison: trivial=1
+// up to critical=5. Matching ignores case and surrounding whitespace; any
+// other input ranks 0.
+func sevRank(s string) int {
+	ascending := [...]string{"trivial", "small", "medium", "high", "critical"}
+	word := strings.ToLower(strings.TrimSpace(s))
+	for idx, name := range ascending {
+		if name == word {
+			return idx + 1
+		}
+	}
+	return 0
+}
+
+// sevIcon picks the at-a-glance badge shown in the first column of the
+// consensus tables. Critical and high share the red badge; anything that is
+// not critical, high, medium or small gets the neutral one.
+func sevIcon(s string) string {
+	word := strings.ToLower(strings.TrimSpace(s))
+	if word == "critical" || word == "high" {
+		return "🔴"
+	}
+	if word == "medium" {
+		return "🟠"
+	}
+	if word == "small" {
+		return "🟡"
+	}
+	return "⚪"
+}
+
+// location formats a finding's position as "file:line", or just the file when
+// the line is zero or negative.
+func location(file string, line int) string {
+	if line <= 0 {
+		return file
+	}
+	return fmt.Sprintf("%s:%d", file, line)
+}
+
+// lensList renders the keys of a lens set as a sorted, comma-separated string,
+// so the output does not depend on map iteration order.
+func lensList(lenses map[string]bool) string {
+	names := make([]string, len(lenses))
+	next := 0
+	for name := range lenses {
+		names[next] = name
+		next++
+	}
+	sort.Strings(names)
+	return strings.Join(names, ", ")
+}
+
+func oneModel(models map[string]bool) string {
+	for m := range models {
+		return m
+	}
+	return ""
+}
+
+// mdCell makes a string safe for a one-line markdown table cell: collapse
+// newlines, escape pipes (which delimit columns), and neutralize backticks
+// (a stray one would break an inline-code span — a backslash can't escape it
+// inside code, so replace with an apostrophe). Inputs are model-influenced, so
+// this keeps a malformed file path or title from breaking the table.
+func mdCell(s string) string {
+	s = strings.ReplaceAll(s, "\n", " ")
+	s = strings.ReplaceAll(s, "|", "\\|")
+	s = strings.ReplaceAll(s, "`", "'")
+	return strings.TrimSpace(s)
+}
+
+// plural returns the suffix to append to a noun counted by n: "" for exactly
+// one, "s" for every other count (zero and negatives included).
+func plural(n int) string {
+	if n != 1 {
+		return "s"
+	}
+	return ""
+}
@@ -0,0 +1,217 @@
+package main
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func TestClusterFindingsAgreementAndTolerance(t *testing.T) {
+	models := []modelFindings{
+		{Model: "m1", Verdict: "Blocking issues found", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"},
+			{Lens: "perf", File: "b.go", Line: 5, Severity: "trivial", Title: "tiny nit"},
+		}},
+		{Model: "m2", Verdict: "Minor issues", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 11, Severity: "critical", Title: "auth bypass (crit)"}, // one line past a.go:10, inside lineTolerance
+		}},
+		{Model: "m3", Verdict: "Minor issues", Findings: []outFinding{
+			{Lens: "correctness", File: "a.go", Line: 10, Severity: "medium", Title: "auth bypass"},
+		}},
+	}
+	clusters := clusterFindings(models)
+	if len(clusters) != 2 {
+		t.Fatalf("want 2 clusters (a.go:10±, b.go:5), got %d: %+v", len(clusters), clusters)
+	}
+	// clusterFindings sorts by agreement first, so index 0 is the a.go auth
+	// cluster: 3 models, severity escalated to critical, representative line 10.
+	c := clusters[0]
+	if len(c.models) != 3 {
+		t.Errorf("want 3 models on the top cluster, got %d", len(c.models))
+	}
+	if c.severity != "critical" {
+		t.Errorf("want escalated severity critical, got %q", c.severity)
+	}
+	if c.line != 10 {
+		t.Errorf("want representative line 10, got %d", c.line)
+	}
+	if !c.lenses["security"] || !c.lenses["correctness"] {
+		t.Errorf("want union of lenses, got %v", c.lenses)
+	}
+}
+
+func TestRenderConsensusFoldsSingleModelNits(t *testing.T) {
+	models := []modelFindings{
+		{Model: "m1", Provider: "p", Verdict: "Blocking issues found", Markdown: "m1 detail", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"},
+			{Lens: "perf", File: "b.go", Line: 5, Severity: "trivial", Title: "tiny nit"},
+		}},
+		{Model: "m2", Provider: "p", Verdict: "Minor issues", Markdown: "m2 detail", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"},
+		}},
+	}
+	out := renderConsensus(models)
+	// Both models flagged a.go:10, so the headline row must show a 2/2 ratio.
+	if !strings.Contains(out, "2/2") {
+		t.Errorf("expected a 2/2 agreement badge in headline:\n%s", out)
+	}
+	if !strings.Contains(out, "auth bypass") || !strings.Contains(out, "a.go:10") {
+		t.Errorf("headline missing the consensus finding:\n%s", out)
+	}
+	// Only m1 reported the trivial b.go finding, so a folded section must exist.
+	if !strings.Contains(out, "single-model finding") {
+		t.Errorf("expected a folded single-model section:\n%s", out)
+	}
+	// Each model's Markdown must survive verbatim in the per-model drill-down.
+	if !strings.Contains(out, "m1 detail") || !strings.Contains(out, "m2 detail") {
+		t.Errorf("per-model detail not preserved:\n%s", out)
+	}
+}
+
+func TestRenderConsensusHighSeverityLoneFindingStaysHeadline(t *testing.T) {
+	// One model, one critical finding: it has no cross-model agreement but must
+	// still surface in the headline table instead of the folded section.
+	models := []modelFindings{
+		{Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 1, Severity: "critical", Title: "rce"},
+		}},
+	}
+	out := renderConsensus(models)
+	headline := out
+	if i := strings.Index(out, "single-model finding"); i >= 0 { // text before the folded section is the headline part
+		headline = out[:i]
+	}
+	if !strings.Contains(headline, "rce") {
+		t.Errorf("lone critical should be in the headline, not folded:\n%s", out)
+	}
+}
+
+func TestClusterSlidingWindowMergesChain(t *testing.T) {
+	// Findings at 10, 13, 16 (each lineTolerance apart) from three models must merge into
+	// ONE cluster: 16 is out of reach of 10 alone, but within reach of the span once 13 joined.
+	models := []modelFindings{
+		{Model: "m1", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 10, Severity: "medium", Title: "t"}}},
+		{Model: "m2", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 13, Severity: "medium", Title: "t"}}},
+		{Model: "m3", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 16, Severity: "medium", Title: "t"}}},
+	}
+	clusters := clusterFindings(models)
+	if len(clusters) != 1 {
+		t.Fatalf("chain 10/13/16 should merge into 1 cluster, got %d", len(clusters))
+	}
+	if len(clusters[0].models) != 3 {
+		t.Errorf("want 3 models in the merged cluster, got %d", len(clusters[0].models))
+	}
+}
+
+func TestRenderConsensusExcludesErroredFromDenominator(t *testing.T) {
+	models := []modelFindings{
+		{Model: "m1", Verdict: "Minor issues", Markdown: "a", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
+		{Model: "m2", Verdict: "Minor issues", Markdown: "b", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
+		{Model: "broken", Verdict: "reviewer failed", Errored: true, Markdown: "boom"},
+	}
+	out := renderConsensus(models)
+	// Three artifacts but only two are effective, so the ratio reads 2/2 and the header notes the failure.
+	if !strings.Contains(out, "2/2") {
+		t.Errorf("errored model must be excluded from the denominator (want 2/2):\n%s", out)
+	}
+	if !strings.Contains(out, "1 failed") {
+		t.Errorf("expected a '1 failed' note:\n%s", out)
+	}
+	if !strings.Contains(out, "reviewer failed") {
+		t.Errorf("errored model should still appear (folded) as failed:\n%s", out)
+	}
+}
+
+func TestRenderConsensusLoneHighFolds(t *testing.T) {
+	// A single-model HIGH (not critical) folds — only consensus or a lone CRITICAL
+	// earns the headline, so a lone high-severity finding doesn't reintroduce noise.
+	models := []modelFindings{
+		{Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{
+			{Lens: "security", File: "a.go", Line: 1, Severity: "high", Title: "maybe-bug"}}},
+	}
+	out := renderConsensus(models)
+	head := out
+	if i := strings.Index(out, "single-model finding"); i >= 0 { // text before the folded section is the headline part
+		head = out[:i]
+	}
+	if strings.Contains(head, "maybe-bug") {
+		t.Errorf("a lone HIGH should fold, not headline:\n%s", out)
+	}
+}
+
+func TestWriteAndConsolidateRoundTrip(t *testing.T) {
+	dir := t.TempDir()
+
+	// Two model artifacts on disk, marshalled directly (writeFindingsOut is not involved).
+	write := func(name string, mf modelFindings) {
+		data, _ := json.Marshal(mf)
+		if err := os.WriteFile(filepath.Join(dir, name), data, 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+	write("m1.json", modelFindings{Model: "m1", Provider: "ollama", Verdict: "Minor issues", Markdown: "md1",
+		Findings: []outFinding{{Lens: "security", File: "x.go", Line: 3, Severity: "medium", Title: "leak"}}})
+	write("m2.json", modelFindings{Model: "m2", Provider: "ollama", Verdict: "Minor issues", Markdown: "md2",
+		Findings: []outFinding{{Lens: "security", File: "x.go", Line: 3, Severity: "high", Title: "leak"}}})
+	// A non-.json file in the directory must be ignored, not crash consolidation.
+	if err := os.WriteFile(filepath.Join(dir, "notes.txt"), []byte("ignore me"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	t.Setenv("GADFLY_CONSOLIDATE_DIR", dir)
+	// runConsolidate prints to stdout; capture it.
+	out := captureStdout(t, func() {
+		if err := runConsolidate(); err != nil {
+			t.Fatalf("runConsolidate: %v", err)
+		}
+	})
+	if !strings.HasPrefix(strings.TrimSpace(out), consensusMarker) {
+		t.Errorf("consolidated output must lead with the marker:\n%s", out)
+	}
+	if !strings.Contains(out, "2/2") || !strings.Contains(out, "x.go:3") {
+		t.Errorf("expected the agreed x.go:3 finding at 2/2:\n%s", out)
+	}
+}
+
+func TestRunConsolidateEmptyDirErrors(t *testing.T) {
+	t.Setenv("GADFLY_CONSOLIDATE_DIR", t.TempDir()) // directory exists but holds no artifacts
+	if err := runConsolidate(); err == nil {
+		t.Error("want an error for an empty consolidate dir (entrypoint falls back)")
+	}
+}
+
+// captureStdout redirects os.Stdout to a pipe while fn runs and returns
+// everything written to it. The original os.Stdout is restored and the pipe's
+// write end is closed even when fn panics or ends the test early through
+// t.Fatal (runtime.Goexit), so the reader goroutine always sees EOF and exits
+// instead of blocking forever on the pipe.
+func captureStdout(t *testing.T, fn func()) string {
+	t.Helper()
+	r, w, err := os.Pipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	orig := os.Stdout
+	os.Stdout = w
+	// Buffered so the reader can hand over its result and exit even when
+	// nobody receives it (fn bailed out before the final receive below).
+	done := make(chan string, 1)
+	go func() {
+		defer r.Close()
+		var sb strings.Builder
+		buf := make([]byte, 4096)
+		for {
+			n, err := r.Read(buf)
+			if n > 0 {
+				sb.Write(buf[:n])
+			}
+			if err != nil {
+				break
+			}
+		}
+		done <- sb.String()
+	}()
+	finished := false
+	finish := func() {
+		if finished {
+			return
+		}
+		finished = true
+		os.Stdout = orig
+		w.Close() // delivers EOF to the reader goroutine
+	}
+	defer finish() // also runs when fn panics or calls t.Fatal
+	fn()
+	finish()
+	return <-done
+}
@@ -125,32 +125,12 @@ func emit(results []specialistResult, elapsed time.Duration) {
 		// InputTokens/OutputTokens/CostUSD stay nil -> JSON null (not metered).
 	}

+	// collectFindings (consensus.go) skips clean/errored lenses and fills in each
+	// finding's severity (per-finding when structured, else derived from the lens
+	// verdict), so every reportPayload carries a canonical raw_severity.
 	var reports []reportPayload
 	for _, r := range results {
-		if r.errored {
-			continue // a failed lens contributes no findings
-		}
-		// A lens that reports "No material issues found" has nothing to flag —
-		// its path:line references are verification notes ("verified X at
-		// file:line is safe"), not problems. Extracting them pollutes the
-		// findings store with false positives and unfairly penalizes thorough
-		// reviewers that do clean passes, so a clean lens emits no findings.
-		if r.verdict == verdictClean {
-			continue
-		}
-		// Prefer the model's machine-readable ```gadfly-findings block (exact
-		// location + per-finding severity). Fall back to the prose scrape when it
-		// yields NOTHING — no block, an unterminated/malformed one, or a block that
-		// parsed to zero usable findings on a non-clean lens (e.g. an empty []
-		// alongside prose findings). Falling back on empty, not just on absent,
-		// keeps the safety net from being defeated by a contradictory empty block.
-		fs := extractStructuredFindingsOrScrape(r)
-		lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity)
-		for _, f := range fs {
-			sev := f.severity
-			if sev == "" {
-				sev = lensSev
-			}
+		for _, f := range collectFindings(r) {
 			reports = append(reports, reportPayload{
 				Repo:        repo,
 				PR:          pr,
@@ -161,7 +141,7 @@ func emit(results []specialistResult, elapsed time.Duration) {
 				Model:       model,
 				Provider:    provider,
 				RunID:       runID,
-				RawSeverity: sev,
+				RawSeverity: f.severity,
 				Confidence:  f.confidence,
 				Detail:      f.detail,
 			})
@@ -123,6 +123,14 @@ func main() {

 func run() error {
 	start := time.Now()
+
+	// Consolidation mode: not a review at all — read the per-model findings the
+	// swarm wrote and print the single cross-model consensus comment. entrypoint.sh
+	// runs this once, after every model has finished.
+	if strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR")) != "" {
+		return runConsolidate()
+	}
+
 	repoDir := os.Getenv("GADFLY_REPO_DIR")
 	diffFile := os.Getenv("GADFLY_DIFF_FILE")
 	systemFile := os.Getenv("GADFLY_SYSTEM_FILE")
@@ -214,6 +222,11 @@ func run() error {
 	// Optional, best-effort telemetry. OFF unless GADFLY_FINDINGS_URL is set;
 	// any failure is logged to stderr and never affects stdout or the exit code.
 	emit(results, time.Since(start))
+
+	// Optional per-model findings artifact for the cross-model consolidation
+	// pass. No-op unless GADFLY_FINDINGS_OUT is set (entrypoint sets it for a
+	// multi-model swarm). Best-effort, never affects stdout or the exit code.
+	writeFindingsOut(results)
 	return nil
 }