From 88f74aa7681a38b6aad0d193d72d387eb36dc91f Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sun, 28 Jun 2026 22:56:15 +0000 Subject: [PATCH] feat: cross-model consensus consolidation (one ranked comment, not N walls) (#17) Co-authored-by: Steve Dudenhoeffer Co-committed-by: Steve Dudenhoeffer --- .gitea/workflows/review-reusable.yml | 2 + README.md | 30 ++ cmd/gadfly/consensus.go | 456 +++++++++++++++++++++++++++ cmd/gadfly/consensus_test.go | 217 +++++++++++++ cmd/gadfly/emit.go | 30 +- cmd/gadfly/main.go | 13 + entrypoint.sh | 88 +++++- scripts/run.sh | 37 ++- 8 files changed, 841 insertions(+), 32 deletions(-) create mode 100644 cmd/gadfly/consensus.go create mode 100644 cmd/gadfly/consensus_test.go diff --git a/.gitea/workflows/review-reusable.yml b/.gitea/workflows/review-reusable.yml index b742aed..46ff3c1 100644 --- a/.gitea/workflows/review-reusable.yml +++ b/.gitea/workflows/review-reusable.yml @@ -60,6 +60,7 @@ on: worker_model: { type: string, default: "" } # GADFLY_WORKER_MODEL allowed_users: { type: string, default: "" } # GADFLY_ALLOWED_USERS (consumer-specific; set in your stub) trigger_phrase: { type: string, default: "" } # GADFLY_TRIGGER_PHRASE + consolidate: { type: string, default: "" } # GADFLY_CONSOLIDATE β€” "" => auto (one consensus comment for >=2 models); "0" => one comment per model # Job wall-clock cap. 90 as a default: the 5-lens suite across a slow lane # (claude-code with extended thinking) over two passes can run long. timeout_minutes: { type: number, default: 90 } @@ -143,3 +144,4 @@ jobs: GADFLY_WORKER_MODEL: ${{ inputs.worker_model }} GADFLY_ALLOWED_USERS: ${{ inputs.allowed_users }} GADFLY_TRIGGER_PHRASE: ${{ inputs.trigger_phrase }} + GADFLY_CONSOLIDATE: ${{ inputs.consolidate }} diff --git a/README.md b/README.md index e27388b..59ba535 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,35 @@ every `GADFLY_STATUS_POLL_SECS` (default 12s) until the swarm finishes. It's adv best-effort β€” the per-model findings comments are unaffected β€” and entirely separate from those. Turn it off with `GADFLY_STATUS_BOARD=0`. +### Consensus consolidation + +With **two or more models**, posting one comment each means a reader faces N walls of prose that +mostly agree. Instead Gadfly consolidates: every model writes its findings to a shared file, and +after the whole swarm finishes a single pass clusters those findings by location, counts **how +many models independently flagged each one**, and posts **one consensus comment**: + +``` +## πŸͺ° Gadfly review β€” consensus across 7 models + +**Verdict: Blocking issues found** Β· 9 findings (3 with multi-model agreement) + +| | Finding | Where | Models | Lens | +|--|--|--|--|--| +| πŸ”΄ | Auth bypass: token not verified | `auth/login.go:42` | 6/7 | security | +| 🟠 | Unbounded retry loop | `sync/worker.go:88` | 3/7 | error-handling | + +
4 single-model findings (lower confidence) …
+
Per-model detail … each model's full review, folded …
+``` + +Cross-model agreement is the strongest real-vs-false-positive signal available, so findings are +ranked by it (a lone low-severity finding folds away; a lone *critical* still surfaces). The +per-model comments are suppressed in this mode β€” each model's full review is preserved, folded, +inside the consensus comment β€” and nothing is lost: if consolidation can't run, Gadfly falls +back to posting the per-model comments. Controlled by `GADFLY_CONSOLIDATE`: `auto` (default β€” on +for β‰₯2 models), `1` (force on), `0` (force off, one comment per model). Single-model runs are +unaffected. + ### Triggers 1. A **new/reopened/ready** non-draft PR β€” automatic. @@ -362,6 +391,7 @@ The reviewer binary reads these (the stub/entrypoint set sane defaults): | `GADFLY_MAX_DIFF_CHARS` | 60000 | diff chars embedded in the prompt (full diff via `get_diff`) | | `GADFLY_STATUS_BOARD` | on | set `0` to disable the live status-board comment | | `GADFLY_STATUS_POLL_SECS` | 12 | how often the status board re-renders/upserts | +| `GADFLY_CONSOLIDATE` | `auto` | cross-model consensus comment: `auto` (on for β‰₯2 models), `1` (force on), `0` (off β€” one comment per model) | | `GADFLY_TRIGGER_PHRASE` | `@gadfly review` | comment phrase that re-triggers | | `GADFLY_ALLOWED_USERS` | *(collaborators)* | comma-separated allow-list for comment triggers | | `GADFLY_FINDINGS_URL` | β€” | gadfly-reports store base URL; set to enable findings telemetry (off when empty) | diff --git a/cmd/gadfly/consensus.go b/cmd/gadfly/consensus.go new file mode 100644 index 0000000..be04092 --- /dev/null +++ b/cmd/gadfly/consensus.go @@ -0,0 +1,456 @@ +package main + +// Cross-model consensus consolidation. The swarm runs each model independently +// (entrypoint.sh fans them out across provider lanes); historically each model +// posted its OWN comment, so a reader faced N walls of prose that mostly agreed. +// +// Instead, every model writes its findings to a shared directory +// (GADFLY_FINDINGS_OUT, one JSON file per model), and after the whole swarm +// finishes a single consolidation pass (GADFLY_CONSOLIDATE_DIR) clusters those +// findings by location, counts how many models independently flagged each one, +// and renders ONE comment: an agreement-ranked table up top (cross-model +// agreement is the strongest real-vs-false-positive signal we have), with each +// model's full review folded below for drill-down. +// +// This file owns: the per-model artifact (modelFindings), writing it +// (writeFindingsOut), reading the directory back, clustering, and rendering the +// consensus markdown (renderConsensus). It depends only on the stdlib. + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "sort" + "strings" +) + +// consensusMarker tags the single consolidated comment so entrypoint.sh can +// upsert it in place across re-runs (mirrors run.sh's per-model marker). +const consensusMarker = "" + +// modelFindings is the per-model artifact written to GADFLY_FINDINGS_OUT. It +// carries enough to rebuild the consolidated comment without re-running models: +// the structured findings (for clustering) plus the full rendered review (for +// the folded per-model drill-down). +type modelFindings struct { + Model string `json:"model"` + Provider string `json:"provider"` + Verdict string `json:"verdict"` // worst lens verdict, as a label + Errored bool `json:"errored"` // the model produced no usable review (every lens failed, or the run crashed) + Markdown string `json:"markdown"` // full rendered per-model review (findings block already stripped) + Findings []outFinding `json:"findings"` +} + +// outFinding is one finding in the per-model artifact (a flattened `finding` +// plus its lens). +type outFinding struct { + Lens string `json:"lens"` + File string `json:"file"` + Line int `json:"line"` + Severity string `json:"severity"` + Confidence string `json:"confidence"` + Title string `json:"title"` + Detail string `json:"detail"` +} + +// collectFindings returns a lens result's findings with severity always filled +// in: per-finding when the structured block supplied it, else derived from the +// lens verdict (so heuristic-scraped findings still carry a canonical word). A +// clean or errored lens yields nothing. Shared by the telemetry emit and the +// per-model findings file so both agree on what a finding is. +func collectFindings(r specialistResult) []finding { + if r.errored || r.verdict == verdictClean { + return nil + } + fs := extractStructuredFindingsOrScrape(r) + lensSev := r.verdict.severity() + for i := range fs { + if fs[i].severity == "" { + fs[i].severity = lensSev + } + } + return fs +} + +// writeFindingsOut writes this model's findings + rendered review to +// GADFLY_FINDINGS_OUT for the later consolidation pass. No-op unless the env is +// set. Best-effort: any error is logged to stderr and never affects the review +// (it runs after the markdown is already on stdout). +func writeFindingsOut(results []specialistResult) { + path := strings.TrimSpace(os.Getenv("GADFLY_FINDINGS_OUT")) + if path == "" { + return + } + mf := modelFindings{ + Model: strings.TrimSpace(os.Getenv("GADFLY_MODEL")), + Provider: modelProvider(), + Verdict: worstVerdict(results).label(), + Errored: allErrored(results), + Markdown: renderConsolidated(results), + } + for _, r := range results { + for _, f := range collectFindings(r) { + mf.Findings = append(mf.Findings, outFinding{ + Lens: r.spec.Name, + File: f.file, + Line: f.line, + Severity: f.severity, + Confidence: f.confidence, + Title: f.title, + Detail: f.detail, + }) + } + } + data, err := json.Marshal(mf) + if err != nil { + fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err) + return + } + // Defensive: make sure the parent dir exists (entrypoint creates it, but a + // missing dir would otherwise silently drop this model from the consensus). + if dir := filepath.Dir(path); dir != "" { + _ = os.MkdirAll(dir, 0o755) + } + if err := os.WriteFile(path, data, 0o644); err != nil { + fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err) + } +} + +// allErrored reports whether every lens of a review failed (so the model +// produced no usable findings). Such a model is recorded but excluded from the +// consensus agreement denominator β€” counting it would dilute every ratio with a +// model that never actually reviewed. +func allErrored(results []specialistResult) bool { + if len(results) == 0 { + return true + } + for _, r := range results { + if !r.errored { + return false + } + } + return true +} + +// runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set): +// read every per-model artifact in the directory, render the consensus comment +// to stdout. Errors are fatal to THIS process only β€” entrypoint.sh treats a +// failed consolidation as advisory and falls back to per-model comments. +func runConsolidate() error { + dir := strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR")) + if dir == "" { + return errors.New("GADFLY_CONSOLIDATE_DIR is empty") + } + entries, err := os.ReadDir(dir) + if err != nil { + return fmt.Errorf("read consolidate dir: %w", err) + } + var models []modelFindings + for _, e := range entries { + if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") { + continue + } + data, err := os.ReadFile(filepath.Join(dir, e.Name())) + if err != nil { + fmt.Fprintln(os.Stderr, "gadfly: read", e.Name(), err) + continue + } + var mf modelFindings + if err := json.Unmarshal(data, &mf); err != nil { + fmt.Fprintln(os.Stderr, "gadfly: parse", e.Name(), err) + continue + } + if strings.TrimSpace(mf.Model) == "" { + continue + } + models = append(models, mf) + } + if len(models) == 0 { + return errors.New("no model findings to consolidate") + } + // Lead with the marker so entrypoint.sh can upsert this comment in place + // (same pattern as run.sh's per-model marker); it appends the advisory footer. + fmt.Println(consensusMarker) + fmt.Println(renderConsensus(models)) + return nil +} + +// cluster is a group of findings (across models) judged to be the same issue: +// same file, lines within lineTolerance of the cluster's current span. The span +// [line,maxLine] slides as members join, so a chain of nearby findings merges +// instead of splitting once it drifts past the first line. +type cluster struct { + file string + line int // representative (smallest) line + maxLine int // largest line in the cluster β€” the span's upper edge + severity string + title string + models map[string]bool + lenses map[string]bool +} + +// findingRef is one model's finding (carrying which model reported it), used +// while grouping findings into clusters. +type findingRef struct { + f outFinding + model string +} + +// lineTolerance: a finding in the same file within this many lines of a +// cluster's current span is treated as the same issue (models often cite a line +// or two apart). +const lineTolerance = 3 + +// renderConsensus builds the single consolidated comment body from every model's +// findings. It does NOT emit the marker or advisory footer β€” entrypoint.sh wraps +// it (mirroring run.sh's per-model framing). +func renderConsensus(models []modelFindings) string { + // effective = models that actually produced a review. Errored models are + // shown (folded, below) but excluded from the agreement denominator so a + // failed model doesn't dilute every ratio. + effective := 0 + for _, m := range models { + if !m.Errored { + effective++ + } + } + errored := len(models) - effective + clusters := clusterFindings(models) + + worst := verdictClean + for _, m := range models { + if v := parseVerdict(m.Verdict); v > worst { + worst = v + } + } + + // Partition in one pass: "headline" findings (multi-model agreement, OR a + // lone CRITICAL) vs folded "single-model" lower-confidence findings. Also + // count multi-model agreements for the summary line. + var headline, folded []cluster + agreed := 0 + for _, c := range clusters { + if len(c.models) >= 2 { + agreed++ + } + if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") { + headline = append(headline, c) + } else { + folded = append(folded, c) + } + } + + var b strings.Builder + fmt.Fprintf(&b, "## πŸͺ° Gadfly review β€” consensus across %d model%s", effective, plural(effective)) + if errored > 0 { + fmt.Fprintf(&b, " (%d failed)", errored) + } + b.WriteString("\n\n") + fmt.Fprintf(&b, "**Verdict: %s** Β· %d finding%s (%d with multi-model agreement)\n", + worst.label(), len(clusters), plural(len(clusters)), agreed) + + if len(headline) > 0 { + b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n") + for _, c := range headline { + fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n", + sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)), + len(c.models), effective, mdCell(lensList(c.lenses))) + } + } else if len(clusters) == 0 { + b.WriteString("\nNo material issues found by consensus.\n") + } + // else: only single-model findings β€” they're shown folded below, so don't + // claim "no material issues" (there are some, just none with consensus). + + if len(folded) > 0 { + fmt.Fprintf(&b, "\n
%d single-model finding%s (lower confidence)\n\n", + len(folded), plural(len(folded))) + b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n") + for _, c := range folded { + fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n", + sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)), + mdCell(oneModel(c.models)), mdCell(lensList(c.lenses))) + } + b.WriteString("\n
\n") + } + + // Per-model full reviews, folded for drill-down (nothing is lost). + b.WriteString("\n
Per-model detail\n") + for _, m := range models { + body := strings.TrimSpace(m.Markdown) + if body == "" { + body = "_(no output)_" + } + verdict := m.Verdict + if m.Errored { + verdict = "⚠️ reviewer failed" + } + fmt.Fprintf(&b, "\n
%s (%s) β€” %s\n\n%s\n\n
\n", + mdCell(m.Model), mdCell(m.Provider), verdict, body) + } + b.WriteString("\n
") + return b.String() +} + +// clusterFindings groups every model's findings into cross-model clusters, +// sorted by agreement (desc), then severity (desc), then location. +func clusterFindings(models []modelFindings) []cluster { + // Group by file, then greedily merge by line proximity. + byFile := map[string][]findingRef{} + for _, m := range models { + for _, f := range m.Findings { + if strings.TrimSpace(f.File) == "" { + continue + } + byFile[f.File] = append(byFile[f.File], findingRef{f, m.Model}) + } + } + + var clusters []cluster + for file, items := range byFile { + sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line }) + // Cluster within THIS file only (clusters never span files), so the inner + // scan is over same-file clusters, not every cluster seen so far. + var fileClusters []cluster + for _, it := range items { + placed := false + for ci := range fileClusters { + c := &fileClusters[ci] + // Join if the line falls within the cluster's span, widened by the + // tolerance on both edges β€” so the window slides as the span grows. + if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance { + mergeIntoCluster(c, it.f, it.model) + placed = true + break + } + } + if !placed { + c := cluster{ + file: file, + line: it.f.Line, + maxLine: it.f.Line, + severity: it.f.Severity, + title: it.f.Title, + models: map[string]bool{}, + lenses: map[string]bool{}, + } + mergeIntoCluster(&c, it.f, it.model) + fileClusters = append(fileClusters, c) + } + } + clusters = append(clusters, fileClusters...) + } + + sort.SliceStable(clusters, func(i, j int) bool { + if len(clusters[i].models) != len(clusters[j].models) { + return len(clusters[i].models) > len(clusters[j].models) + } + if sevRank(clusters[i].severity) != sevRank(clusters[j].severity) { + return sevRank(clusters[i].severity) > sevRank(clusters[j].severity) + } + if clusters[i].file != clusters[j].file { + return clusters[i].file < clusters[j].file + } + return clusters[i].line < clusters[j].line + }) + return clusters +} + +// mergeIntoCluster folds one finding into a cluster: union the model/lens sets, +// widen the [line,maxLine] span, and keep the highest-severity report's title. +func mergeIntoCluster(c *cluster, f outFinding, model string) { + if model != "" { + c.models[model] = true + } + if f.Lens != "" { + c.lenses[f.Lens] = true + } + if f.Line > 0 && (c.line == 0 || f.Line < c.line) { + c.line = f.Line + } + if f.Line > c.maxLine { + c.maxLine = f.Line + } + if sevRank(f.Severity) > sevRank(c.severity) { + c.severity = f.Severity + if strings.TrimSpace(f.Title) != "" { + c.title = f.Title + } + } +} + +// sevRank orders the canonical severity words for sorting/comparison. +func sevRank(s string) int { + switch strings.ToLower(strings.TrimSpace(s)) { + case "critical": + return 5 + case "high": + return 4 + case "medium": + return 3 + case "small": + return 2 + case "trivial": + return 1 + default: + return 0 + } +} + +// sevIcon is the at-a-glance severity badge for the consensus table. +func sevIcon(s string) string { + switch strings.ToLower(strings.TrimSpace(s)) { + case "critical", "high": + return "πŸ”΄" + case "medium": + return "🟠" + case "small": + return "🟑" + default: + return "βšͺ" + } +} + +func location(file string, line int) string { + if line > 0 { + return fmt.Sprintf("%s:%d", file, line) + } + return file +} + +func lensList(lenses map[string]bool) string { + out := make([]string, 0, len(lenses)) + for l := range lenses { + out = append(out, l) + } + sort.Strings(out) + return strings.Join(out, ", ") +} + +func oneModel(models map[string]bool) string { + for m := range models { + return m + } + return "" +} + +// mdCell makes a string safe for a one-line markdown table cell: collapse +// newlines, escape pipes (which delimit columns), and neutralize backticks +// (a stray one would break an inline-code span β€” a backslash can't escape it +// inside code, so replace with an apostrophe). Inputs are model-influenced, so +// this keeps a malformed file path or title from breaking the table. +func mdCell(s string) string { + s = strings.ReplaceAll(s, "\n", " ") + s = strings.ReplaceAll(s, "|", "\\|") + s = strings.ReplaceAll(s, "`", "'") + return strings.TrimSpace(s) +} + +func plural(n int) string { + if n == 1 { + return "" + } + return "s" +} diff --git a/cmd/gadfly/consensus_test.go b/cmd/gadfly/consensus_test.go new file mode 100644 index 0000000..595d54b --- /dev/null +++ b/cmd/gadfly/consensus_test.go @@ -0,0 +1,217 @@ +package main + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" +) + +func TestClusterFindingsAgreementAndTolerance(t *testing.T) { + models := []modelFindings{ + {Model: "m1", Verdict: "Blocking issues found", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"}, + {Lens: "perf", File: "b.go", Line: 5, Severity: "trivial", Title: "tiny nit"}, + }}, + {Model: "m2", Verdict: "Minor issues", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 11, Severity: "critical", Title: "auth bypass (crit)"}, // within tolerance of a.go:10 + }}, + {Model: "m3", Verdict: "Minor issues", Findings: []outFinding{ + {Lens: "correctness", File: "a.go", Line: 10, Severity: "medium", Title: "auth bypass"}, + }}, + } + clusters := clusterFindings(models) + if len(clusters) != 2 { + t.Fatalf("want 2 clusters (a.go:10Β±, b.go:5), got %d: %+v", len(clusters), clusters) + } + // First cluster (highest agreement) is the a.go auth one: 3 models, severity + // escalated to critical, representative line the smallest (10). + c := clusters[0] + if len(c.models) != 3 { + t.Errorf("want 3 models on the top cluster, got %d", len(c.models)) + } + if c.severity != "critical" { + t.Errorf("want escalated severity critical, got %q", c.severity) + } + if c.line != 10 { + t.Errorf("want representative line 10, got %d", c.line) + } + if !c.lenses["security"] || !c.lenses["correctness"] { + t.Errorf("want union of lenses, got %v", c.lenses) + } +} + +func TestRenderConsensusFoldsSingleModelNits(t *testing.T) { + models := []modelFindings{ + {Model: "m1", Provider: "p", Verdict: "Blocking issues found", Markdown: "m1 detail", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"}, + {Lens: "perf", File: "b.go", Line: 5, Severity: "trivial", Title: "tiny nit"}, + }}, + {Model: "m2", Provider: "p", Verdict: "Minor issues", Markdown: "m2 detail", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 10, Severity: "high", Title: "auth bypass"}, + }}, + } + out := renderConsensus(models) + // Headline table: the agreed finding with a 2/2 badge. + if !strings.Contains(out, "2/2") { + t.Errorf("expected a 2/2 agreement badge in headline:\n%s", out) + } + if !strings.Contains(out, "auth bypass") || !strings.Contains(out, "a.go:10") { + t.Errorf("headline missing the consensus finding:\n%s", out) + } + // The lone trivial finding is folded, not in the headline table. + if !strings.Contains(out, "single-model finding") { + t.Errorf("expected a folded single-model section:\n%s", out) + } + // Per-model detail is preserved (folded). + if !strings.Contains(out, "m1 detail") || !strings.Contains(out, "m2 detail") { + t.Errorf("per-model detail not preserved:\n%s", out) + } +} + +func TestRenderConsensusHighSeverityLoneFindingStaysHeadline(t *testing.T) { + // A single model, single critical finding must still surface in the headline + // (not be folded as "low confidence"). + models := []modelFindings{ + {Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 1, Severity: "critical", Title: "rce"}, + }}, + } + out := renderConsensus(models) + headline := out + if i := strings.Index(out, "single-model finding"); i >= 0 { + headline = out[:i] + } + if !strings.Contains(headline, "rce") { + t.Errorf("lone critical should be in the headline, not folded:\n%s", out) + } +} + +func TestClusterSlidingWindowMergesChain(t *testing.T) { + // Findings at 10, 13, 16 (each 3 apart) from three models must merge into ONE + // cluster β€” the window slides with the span instead of anchoring at line 10. + models := []modelFindings{ + {Model: "m1", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 10, Severity: "medium", Title: "t"}}}, + {Model: "m2", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 13, Severity: "medium", Title: "t"}}}, + {Model: "m3", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 16, Severity: "medium", Title: "t"}}}, + } + clusters := clusterFindings(models) + if len(clusters) != 1 { + t.Fatalf("chain 10/13/16 should merge into 1 cluster, got %d", len(clusters)) + } + if len(clusters[0].models) != 3 { + t.Errorf("want 3 models in the merged cluster, got %d", len(clusters[0].models)) + } +} + +func TestRenderConsensusExcludesErroredFromDenominator(t *testing.T) { + models := []modelFindings{ + {Model: "m1", Verdict: "Minor issues", Markdown: "a", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}}, + {Model: "m2", Verdict: "Minor issues", Markdown: "b", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}}, + {Model: "broken", Verdict: "reviewer failed", Errored: true, Markdown: "boom"}, + } + out := renderConsensus(models) + // Denominator is the 2 effective models, not 3; the failure is noted. + if !strings.Contains(out, "2/2") { + t.Errorf("errored model must be excluded from the denominator (want 2/2):\n%s", out) + } + if !strings.Contains(out, "1 failed") { + t.Errorf("expected a '1 failed' note:\n%s", out) + } + if !strings.Contains(out, "reviewer failed") { + t.Errorf("errored model should still appear (folded) as failed:\n%s", out) + } +} + +func TestRenderConsensusLoneHighFolds(t *testing.T) { + // A single-model HIGH (not critical) folds β€” only consensus or a lone CRITICAL + // earns the headline, so a lone Blocking-lens finding doesn't reintroduce noise. + models := []modelFindings{ + {Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 1, Severity: "high", Title: "maybe-bug"}}}, + } + out := renderConsensus(models) + head := out + if i := strings.Index(out, "single-model finding"); i >= 0 { + head = out[:i] + } + if strings.Contains(head, "maybe-bug") { + t.Errorf("a lone HIGH should fold, not headline:\n%s", out) + } +} + +func TestWriteAndConsolidateRoundTrip(t *testing.T) { + dir := t.TempDir() + + // Two model artifacts on disk. + write := func(name string, mf modelFindings) { + data, _ := json.Marshal(mf) + if err := os.WriteFile(filepath.Join(dir, name), data, 0o644); err != nil { + t.Fatal(err) + } + } + write("m1.json", modelFindings{Model: "m1", Provider: "ollama", Verdict: "Minor issues", Markdown: "md1", + Findings: []outFinding{{Lens: "security", File: "x.go", Line: 3, Severity: "medium", Title: "leak"}}}) + write("m2.json", modelFindings{Model: "m2", Provider: "ollama", Verdict: "Minor issues", Markdown: "md2", + Findings: []outFinding{{Lens: "security", File: "x.go", Line: 3, Severity: "high", Title: "leak"}}}) + // A junk file must be skipped, not crash consolidation. + if err := os.WriteFile(filepath.Join(dir, "notes.txt"), []byte("ignore me"), 0o644); err != nil { + t.Fatal(err) + } + + t.Setenv("GADFLY_CONSOLIDATE_DIR", dir) + // runConsolidate prints to stdout; capture it. + out := captureStdout(t, func() { + if err := runConsolidate(); err != nil { + t.Fatalf("runConsolidate: %v", err) + } + }) + if !strings.HasPrefix(strings.TrimSpace(out), consensusMarker) { + t.Errorf("consolidated output must lead with the marker:\n%s", out) + } + if !strings.Contains(out, "2/2") || !strings.Contains(out, "x.go:3") { + t.Errorf("expected the agreed x.go:3 finding at 2/2:\n%s", out) + } +} + +func TestRunConsolidateEmptyDirErrors(t *testing.T) { + t.Setenv("GADFLY_CONSOLIDATE_DIR", t.TempDir()) + if err := runConsolidate(); err == nil { + t.Error("want an error for an empty consolidate dir (entrypoint falls back)") + } +} + +// captureStdout redirects os.Stdout for the duration of fn and returns what was +// written. +func captureStdout(t *testing.T, fn func()) string { + t.Helper() + orig := os.Stdout + r, w, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + os.Stdout = w + defer func() { os.Stdout = orig }() // restore even if fn panics + done := make(chan string) + go func() { + var sb strings.Builder + buf := make([]byte, 4096) + for { + n, err := r.Read(buf) + if n > 0 { + sb.Write(buf[:n]) + } + if err != nil { + break + } + } + r.Close() + done <- sb.String() + }() + fn() + w.Close() + return <-done +} diff --git a/cmd/gadfly/emit.go b/cmd/gadfly/emit.go index fbac59d..f1d76ba 100644 --- a/cmd/gadfly/emit.go +++ b/cmd/gadfly/emit.go @@ -125,32 +125,12 @@ func emit(results []specialistResult, elapsed time.Duration) { // InputTokens/OutputTokens/CostUSD stay nil -> JSON null (not metered). } + // collectFindings (consensus.go) skips clean/errored lenses and fills in each + // finding's severity (per-finding when structured, else derived from the lens + // verdict), so every reportPayload carries a canonical raw_severity. var reports []reportPayload for _, r := range results { - if r.errored { - continue // a failed lens contributes no findings - } - // A lens that reports "No material issues found" has nothing to flag β€” - // its path:line references are verification notes ("verified X at - // file:line is safe"), not problems. Extracting them pollutes the - // findings store with false positives and unfairly penalizes thorough - // reviewers that do clean passes, so a clean lens emits no findings. - if r.verdict == verdictClean { - continue - } - // Prefer the model's machine-readable ```gadfly-findings block (exact - // location + per-finding severity). Fall back to the prose scrape when it - // yields NOTHING β€” no block, an unterminated/malformed one, or a block that - // parsed to zero usable findings on a non-clean lens (e.g. an empty [] - // alongside prose findings). Falling back on empty, not just on absent, - // keeps the safety net from being defeated by a contradictory empty block. - fs := extractStructuredFindingsOrScrape(r) - lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity) - for _, f := range fs { - sev := f.severity - if sev == "" { - sev = lensSev - } + for _, f := range collectFindings(r) { reports = append(reports, reportPayload{ Repo: repo, PR: pr, @@ -161,7 +141,7 @@ func emit(results []specialistResult, elapsed time.Duration) { Model: model, Provider: provider, RunID: runID, - RawSeverity: sev, + RawSeverity: f.severity, Confidence: f.confidence, Detail: f.detail, }) diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index 20f3564..57943a1 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -123,6 +123,14 @@ func main() { func run() error { start := time.Now() + + // Consolidation mode: not a review at all β€” read the per-model findings the + // swarm wrote and print the single cross-model consensus comment. entrypoint.sh + // runs this once, after every model has finished. + if strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR")) != "" { + return runConsolidate() + } + repoDir := os.Getenv("GADFLY_REPO_DIR") diffFile := os.Getenv("GADFLY_DIFF_FILE") systemFile := os.Getenv("GADFLY_SYSTEM_FILE") @@ -214,6 +222,11 @@ func run() error { // Optional, best-effort telemetry. OFF unless GADFLY_FINDINGS_URL is set; // any failure is logged to stderr and never affects stdout or the exit code. emit(results, time.Since(start)) + + // Optional per-model findings artifact for the cross-model consolidation + // pass. No-op unless GADFLY_FINDINGS_OUT is set (entrypoint sets it for a + // multi-model swarm). Best-effort, never affects stdout or the exit code. + writeFindingsOut(results) return nil } diff --git a/entrypoint.sh b/entrypoint.sh index e15972c..46d4e76 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -45,6 +45,8 @@ # GADFLY_FINDINGS_URL optional gadfly-reports store base URL; set to POST the run + # findings for model-quality tracking (off when empty) # GADFLY_FINDINGS_TOKEN optional bearer token for the gadfly-reports store +# GADFLY_CONSOLIDATE cross-model consensus comment: "auto" (default; on for >=2 +# models), "1" force on, "0" force off (one comment per model) set -uo pipefail # One model by default: the specialist suite already provides breadth, so a @@ -64,6 +66,29 @@ die() { log "ERROR: $*"; exit 1; } API() { curl -fsS --connect-timeout 20 --max-time 30 -H "Authorization: token ${GITEA_TOKEN}" "$@"; } +# upsert_comment_body MARKER BODY β€” create or update (by leading MARKER) a single +# PR comment. Mirrors run.sh's per-model upsert; used for the consensus comment +# and the per-model fallback when consolidation is on. +upsert_comment_body() { + local marker="$1" body="$2" post_body existing_id="" page=1 cmts + post_body="$(jq -n --arg b "$body" '{body:$b}')" + while [ "$page" -le 10 ]; do + cmts="$(API "${GITEA_API}/issues/${PR}/comments?limit=50&page=${page}" || echo '[]')" + [ "$(echo "$cmts" | jq 'length')" = "0" ] && break + existing_id="$(echo "$cmts" | jq -r --arg m "$marker" \ + '.[] | select(.body != null and (.body | startswith($m))) | .id' | head -n1)" + [ -n "$existing_id" ] && break + page=$((page+1)) + done + if [ -n "$existing_id" ]; then + curl -sS --connect-timeout 20 --max-time 30 -X PATCH -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" "${GITEA_API}/issues/comments/${existing_id}" -d "$post_body" >/dev/null + else + curl -sS --connect-timeout 20 --max-time 30 -X POST -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" "${GITEA_API}/issues/${PR}/comments" -d "$post_body" >/dev/null + fi +} + # --- is the commenter allowed to trigger a re-review? ---------------------- actor_allowed() { local actor="$1" @@ -176,10 +201,11 @@ provider_cap() { # provider -> concurrency (override map "p=N,...", else default } review_one() { - local sf="" + local sf="" ff="" [ "${GADFLY_STATUS_BOARD:-1}" != "0" ] && sf="$(status_file_for "$1")" + [ "$CONSOLIDATE" = "1" ] && ff="$(findings_file_for "$1")" PROVIDER=ollama MODEL="$1" GADFLY_BIN="/usr/local/bin/gadfly" GADFLY_REPO_DIR="$REPO_DIR" \ - GADFLY_STATUS_FILE="$sf" \ + GADFLY_STATUS_FILE="$sf" GADFLY_FINDINGS_OUT="$ff" GADFLY_CONSOLIDATE="$CONSOLIDATE" \ bash "${SCRIPTS_DIR}/run.sh" || log "model $1 failed (continuing)" # If the binary never wrote real status (run.sh skipped it: empty diff, no key, # binary missing), the pre-seed stays {started:0, done:false} and the board @@ -196,6 +222,34 @@ IFS=',' read -ra _raw <<< "$MODELS" || true MODEL_LIST=() for raw in "${_raw[@]}"; do m="$(echo "$raw" | tr -d '[:space:]')"; [ -n "$m" ] && MODEL_LIST+=("$m"); done +# --- cross-model consolidation decision ------------------------------------ +# With >=2 models, post ONE consensus comment (findings clustered + ranked by +# cross-model agreement) instead of N per-model walls of prose. Each model writes +# its findings to FINDINGS_DIR; a final pass (the binary in GADFLY_CONSOLIDATE_DIR +# mode) renders the consensus comment. GADFLY_CONSOLIDATE: "auto" (default; on for +# >=2 models), "1" force on, "0" force off (keep per-model comments). +FINDINGS_DIR="${WORKDIR}/findings" +CONSOLIDATE=0 +case "${GADFLY_CONSOLIDATE:-auto}" in + 1) CONSOLIDATE=1 ;; + 0) CONSOLIDATE=0 ;; + *) [ "${#MODEL_LIST[@]}" -ge 2 ] && CONSOLIDATE=1 ;; +esac +# A model spec can contain '/' and ':' (e.g. claude-code/opus, qwen3:14b), so +# sanitize to a flat filename β€” but append a checksum of the raw spec so two +# specs that sanitize the same (foo:bar vs foo/bar -> foo_bar) don't collide onto +# one file and silently drop a model from the consensus. +findings_file_for() { + local safe sum + safe="$(echo "$1" | tr -c '[:alnum:]._-' '_')" + sum="$(printf '%s' "$1" | cksum | cut -d' ' -f1)" + echo "${FINDINGS_DIR}/${safe}-${sum}.json" +} +if [ "$CONSOLIDATE" = "1" ]; then + rm -rf "$FINDINGS_DIR"; mkdir -p "$FINDINGS_DIR" + log "consolidation ON: ${#MODEL_LIST[@]} models -> one consensus comment" +fi + # Distinct providers, in first-seen order (no associative arrays β€” portable). PROVIDERS="" for m in "${MODEL_LIST[@]}"; do @@ -257,4 +311,34 @@ if [ -n "$BOARD_PID" ]; then touch "${STATUS_DIR}/.done" 2>/dev/null || true wait "$BOARD_PID" 2>/dev/null || true fi + +# --- cross-model consensus comment ----------------------------------------- +# Render ONE consensus comment from the per-model findings the swarm wrote. This +# is advisory and best-effort: if the consolidation pass produces nothing, fall +# back to posting each model's review as its own comment (the per-model comments +# were suppressed during the run), so a consolidation hiccup never loses output. +if [ "$CONSOLIDATE" = "1" ]; then + n_files="$(ls -1 "${FINDINGS_DIR}"/*.json 2>/dev/null | wc -l | tr -d '[:space:]')" + log "consolidating findings from ${n_files} model(s)" + CONSENSUS="$(GADFLY_CONSOLIDATE_DIR="$FINDINGS_DIR" /usr/local/bin/gadfly 2>"${WORKDIR}/consolidate.err" || true)" + if [ -n "$CONSENSUS" ]; then + BODY="$(printf '%s\n\nAutomated adversarial review by Gadfly β€” consensus across the model swarm. Advisory only β€” does not block merge.' "$CONSENSUS")" + upsert_comment_body "" "$BODY" + log "consensus comment posted" + else + log "consolidation produced no output; falling back to per-model comments" + log "$(tail -c 500 "${WORKDIR}/consolidate.err" 2>/dev/null)" + for f in "${FINDINGS_DIR}"/*.json; do + [ -f "$f" ] || continue + m="$(jq -r '.model // ""' "$f" 2>/dev/null)" + [ -z "$m" ] && continue + prov="$(jq -r '.provider // ""' "$f" 2>/dev/null)" + md="$(jq -r '.markdown // ""' "$f" 2>/dev/null)" + marker="" + body="$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β€” does not block merge.' \ + "$marker" "$m" "$prov" "$md")" + upsert_comment_body "$marker" "$body" + done + fi +fi log "done" diff --git a/scripts/run.sh b/scripts/run.sh index 8e9b396..0a5fe49 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -50,6 +50,13 @@ MAX_DIFF_CHARS="${MAX_DIFF_CHARS:-60000}" MARKER="" say() { echo "[gadfly-review:${PROVIDER}:${MODEL}] $*" >&2; } +# When the swarm is consolidating (GADFLY_CONSOLIDATE=1, set by entrypoint.sh for +# a multi-model run), this model does NOT post its own comment β€” it writes its +# findings to GADFLY_FINDINGS_OUT and a single cross-model consensus comment is +# posted after the whole swarm finishes. Live progress still shows on the status +# board. Default 0 (post a per-model comment, the standalone behavior). +CONSOLIDATE="${GADFLY_CONSOLIDATE:-0}" + # Display the model's ACTUAL backend: the provider segment of the spec # ("m1pro/qwen3.6:35b-mlx" -> "m1pro"); a bare id uses GADFLY_PROVIDER (default # ollama-cloud). This is what the comment header shows, not the run.sh lane. @@ -126,7 +133,9 @@ USR="$(printf 'PR #%s: %s\n\nDescription:\n%s\n\nUnified diff to review:\n```dif # --- announce start (placeholder comment) ----------------------------------- START_TS="$(date +%s)" say "starting review with ${MODEL}" -upsert_comment "$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n⏳ Reviewing… this comment will update with findings and run time.' \ +# Skip the per-model placeholder when consolidating (the consensus comment is +# posted later; live progress is on the status board). +[ "$CONSOLIDATE" = "1" ] || upsert_comment "$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n⏳ Reviewing… this comment will update with findings and run time.' \ "$MARKER" "$MODEL" "$MODEL_PROVIDER")" # --- call the model --------------------------------------------------------- @@ -170,6 +179,7 @@ case "$PROVIDER" in GADFLY_BODY="$BODY" \ GADFLY_MAX_DIFF_CHARS="$MAX_DIFF_CHARS" \ GADFLY_STATUS_FILE="${GADFLY_STATUS_FILE:-}" \ + GADFLY_FINDINGS_OUT="${GADFLY_FINDINGS_OUT:-}" \ "$BIN" 2>"$ERR_FILE" )" rc=$? @@ -204,7 +214,24 @@ esac # --- assemble + post final comment (with run time) -------------------------- ELAPSED="$(( $(date +%s) - START_TS ))" DUR="$(fmt_duration "$ELAPSED")" -COMMENT="$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β€” does not block merge. Β· ⏱️ reviewed in %s' \ - "$MARKER" "$MODEL" "$MODEL_PROVIDER" "$REVIEW" "$DUR")" -upsert_comment "$COMMENT" -say "done in ${DUR}" + +# Consolidating: the binary writes its findings file on success. If it failed or +# was skipped (no file, or an empty one), write a stub so this model still shows +# up in the consensus (as failed) and an all-models-fail run still posts a +# comment β€” never silently drop a model or the whole review. +if [ "$CONSOLIDATE" = "1" ] && [ -n "${GADFLY_FINDINGS_OUT:-}" ] && [ ! -s "${GADFLY_FINDINGS_OUT}" ]; then + jq -n --arg model "$MODEL" --arg provider "$MODEL_PROVIDER" --arg md "$REVIEW" \ + '{model:$model, provider:$provider, verdict:"reviewer failed", errored:true, markdown:$md, findings:[]}' \ + > "${GADFLY_FINDINGS_OUT}" 2>/dev/null || true +fi +# When consolidating, the binary has written this model's findings to +# GADFLY_FINDINGS_OUT; the consensus comment is posted by entrypoint.sh after the +# whole swarm finishes, so this model posts no comment of its own. +if [ "$CONSOLIDATE" = "1" ]; then + say "done in ${DUR} (consolidated; no per-model comment)" +else + COMMENT="$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β€” does not block merge. Β· ⏱️ reviewed in %s' \ + "$MARKER" "$MODEL" "$MODEL_PROVIDER" "$REVIEW" "$DUR")" + upsert_comment "$COMMENT" + say "done in ${DUR}" +fi