diff --git a/cmd/gadfly/consensus.go b/cmd/gadfly/consensus.go
index ddb44d0..be04092 100644
--- a/cmd/gadfly/consensus.go
+++ b/cmd/gadfly/consensus.go
@@ -38,6 +38,7 @@ type modelFindings struct {
Model string `json:"model"`
Provider string `json:"provider"`
Verdict string `json:"verdict"` // worst lens verdict, as a label
+ Errored bool `json:"errored"` // the model produced no usable review (every lens failed, or the run crashed)
Markdown string `json:"markdown"` // full rendered per-model review (findings block already stripped)
Findings []outFinding `json:"findings"`
}
@@ -86,6 +87,7 @@ func writeFindingsOut(results []specialistResult) {
Model: strings.TrimSpace(os.Getenv("GADFLY_MODEL")),
Provider: modelProvider(),
Verdict: worstVerdict(results).label(),
+ Errored: allErrored(results),
Markdown: renderConsolidated(results),
}
for _, r := range results {
@@ -106,11 +108,32 @@ func writeFindingsOut(results []specialistResult) {
fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err)
return
}
+ // Defensive: make sure the parent dir exists (entrypoint creates it, but a
+ // missing dir would otherwise silently drop this model from the consensus).
+ if dir := filepath.Dir(path); dir != "" {
+ _ = os.MkdirAll(dir, 0o755)
+ }
if err := os.WriteFile(path, data, 0o644); err != nil {
fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err)
}
}
+// allErrored reports whether every lens of a review failed (so the model
+// produced no usable findings). Such a model is recorded but excluded from the
+// consensus agreement denominator โ counting it would dilute every ratio with a
+// model that never actually reviewed.
+func allErrored(results []specialistResult) bool {
+ if len(results) == 0 {
+ return true
+ }
+ for _, r := range results {
+ if !r.errored {
+ return false
+ }
+ }
+ return true
+}
+
// runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set):
// read every per-model artifact in the directory, render the consensus comment
// to stdout. Errors are fatal to THIS process only โ entrypoint.sh treats a
@@ -155,26 +178,45 @@ func runConsolidate() error {
}
// cluster is a group of findings (across models) judged to be the same issue:
-// same file, lines within lineTolerance of each other.
+// same file, lines within lineTolerance of the cluster's current span. The span
+// [line,maxLine] slides as members join, so a chain of nearby findings merges
+// instead of splitting once it drifts past the first line.
type cluster struct {
file string
line int // representative (smallest) line
+ maxLine int // largest line in the cluster โ the span's upper edge
severity string
title string
- detail string
models map[string]bool
lenses map[string]bool
}
-// lineTolerance: findings in the same file within this many lines of each other
-// are treated as the same issue (models often cite a line or two apart).
+// findingRef is one model's finding (carrying which model reported it), used
+// while grouping findings into clusters.
+type findingRef struct {
+ f outFinding
+ model string
+}
+
+// lineTolerance: a finding in the same file within this many lines of a
+// cluster's current span is treated as the same issue (models often cite a line
+// or two apart).
const lineTolerance = 3
// renderConsensus builds the single consolidated comment body from every model's
// findings. It does NOT emit the marker or advisory footer โ entrypoint.sh wraps
// it (mirroring run.sh's per-model framing).
func renderConsensus(models []modelFindings) string {
- total := len(models)
+ // effective = models that actually produced a review. Errored models are
+ // shown (folded, below) but excluded from the agreement denominator so a
+ // failed model doesn't dilute every ratio.
+ effective := 0
+ for _, m := range models {
+ if !m.Errored {
+ effective++
+ }
+ }
+ errored := len(models) - effective
clusters := clusterFindings(models)
worst := verdictClean
@@ -184,11 +226,16 @@ func renderConsensus(models []modelFindings) string {
}
}
- // Partition: "headline" findings (multi-model agreement, OR a high/critical
- // even from one model) vs folded "single-model" lower-severity noise.
+ // Partition in one pass: "headline" findings (multi-model agreement, OR a
+ // lone CRITICAL) vs folded "single-model" lower-confidence findings. Also
+ // count multi-model agreements for the summary line.
var headline, folded []cluster
+ agreed := 0
for _, c := range clusters {
- if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("high") {
+ if len(c.models) >= 2 {
+ agreed++
+ }
+ if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") {
headline = append(headline, c)
} else {
folded = append(folded, c)
@@ -196,13 +243,11 @@ func renderConsensus(models []modelFindings) string {
}
var b strings.Builder
- fmt.Fprintf(&b, "## ๐ชฐ Gadfly review โ consensus across %d model%s\n\n", total, plural(total))
- agreed := 0
- for _, c := range clusters {
- if len(c.models) >= 2 {
- agreed++
- }
+ fmt.Fprintf(&b, "## ๐ชฐ Gadfly review โ consensus across %d model%s", effective, plural(effective))
+ if errored > 0 {
+ fmt.Fprintf(&b, " (%d failed)", errored)
}
+ b.WriteString("\n\n")
fmt.Fprintf(&b, "**Verdict: %s** ยท %d finding%s (%d with multi-model agreement)\n",
worst.label(), len(clusters), plural(len(clusters)), agreed)
@@ -210,12 +255,14 @@ func renderConsensus(models []modelFindings) string {
b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n")
for _, c := range headline {
fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n",
- sevIcon(c.severity), mdCell(c.title), location(c.file, c.line),
- len(c.models), total, mdCell(lensList(c.lenses)))
+ sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
+ len(c.models), effective, mdCell(lensList(c.lenses)))
}
- } else {
+ } else if len(clusters) == 0 {
b.WriteString("\nNo material issues found by consensus.\n")
}
+ // else: only single-model findings โ they're shown folded below, so don't
+ // claim "no material issues" (there are some, just none with consensus).
if len(folded) > 0 {
fmt.Fprintf(&b, "\n%d single-model finding%s (lower confidence)
\n\n",
@@ -223,7 +270,7 @@ func renderConsensus(models []modelFindings) string {
b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n")
for _, c := range folded {
fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n",
- sevIcon(c.severity), mdCell(c.title), location(c.file, c.line),
+ sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
mdCell(oneModel(c.models)), mdCell(lensList(c.lenses)))
}
b.WriteString("\n \n")
@@ -236,8 +283,12 @@ func renderConsensus(models []modelFindings) string {
if body == "" {
body = "_(no output)_"
}
+ verdict := m.Verdict
+ if m.Errored {
+ verdict = "โ ๏ธ reviewer failed"
+ }
fmt.Fprintf(&b, "\n%s (%s) โ %s
\n\n%s\n\n \n",
- mdCell(m.Model), mdCell(m.Provider), m.Verdict, body)
+ mdCell(m.Model), mdCell(m.Provider), verdict, body)
}
b.WriteString("\n")
return b.String()
@@ -247,30 +298,29 @@ func renderConsensus(models []modelFindings) string {
// sorted by agreement (desc), then severity (desc), then location.
func clusterFindings(models []modelFindings) []cluster {
// Group by file, then greedily merge by line proximity.
- byFile := map[string][]struct {
- f outFinding
- model string
- }{}
+ byFile := map[string][]findingRef{}
for _, m := range models {
for _, f := range m.Findings {
if strings.TrimSpace(f.File) == "" {
continue
}
- byFile[f.File] = append(byFile[f.File], struct {
- f outFinding
- model string
- }{f, m.Model})
+ byFile[f.File] = append(byFile[f.File], findingRef{f, m.Model})
}
}
var clusters []cluster
for file, items := range byFile {
sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line })
+ // Cluster within THIS file only (clusters never span files), so the inner
+ // scan is over same-file clusters, not every cluster seen so far.
+ var fileClusters []cluster
for _, it := range items {
placed := false
- for ci := range clusters {
- c := &clusters[ci]
- if c.file == file && absInt(c.line-it.f.Line) <= lineTolerance {
+ for ci := range fileClusters {
+ c := &fileClusters[ci]
+ // Join if the line falls within the cluster's span, widened by the
+ // tolerance on both edges โ so the window slides as the span grows.
+ if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance {
mergeIntoCluster(c, it.f, it.model)
placed = true
break
@@ -280,16 +330,17 @@ func clusterFindings(models []modelFindings) []cluster {
c := cluster{
file: file,
line: it.f.Line,
+ maxLine: it.f.Line,
severity: it.f.Severity,
title: it.f.Title,
- detail: it.f.Detail,
models: map[string]bool{},
lenses: map[string]bool{},
}
mergeIntoCluster(&c, it.f, it.model)
- clusters = append(clusters, c)
+ fileClusters = append(fileClusters, c)
}
}
+ clusters = append(clusters, fileClusters...)
}
sort.SliceStable(clusters, func(i, j int) bool {
@@ -308,7 +359,7 @@ func clusterFindings(models []modelFindings) []cluster {
}
// mergeIntoCluster folds one finding into a cluster: union the model/lens sets,
-// keep the smallest line, and keep the highest-severity report's title/detail.
+// widen the [line,maxLine] span, and keep the highest-severity report's title.
func mergeIntoCluster(c *cluster, f outFinding, model string) {
if model != "" {
c.models[model] = true
@@ -319,14 +370,14 @@ func mergeIntoCluster(c *cluster, f outFinding, model string) {
if f.Line > 0 && (c.line == 0 || f.Line < c.line) {
c.line = f.Line
}
+ if f.Line > c.maxLine {
+ c.maxLine = f.Line
+ }
if sevRank(f.Severity) > sevRank(c.severity) {
c.severity = f.Severity
if strings.TrimSpace(f.Title) != "" {
c.title = f.Title
}
- if strings.TrimSpace(f.Detail) != "" {
- c.detail = f.Detail
- }
}
}
@@ -385,11 +436,15 @@ func oneModel(models map[string]bool) string {
return ""
}
-// mdCell makes a string safe for a one-line markdown table cell: escape pipes,
-// collapse newlines.
+// mdCell makes a string safe for a one-line markdown table cell: collapse
+// newlines, escape pipes (which delimit columns), and neutralize backticks
+// (a stray one would break an inline-code span โ a backslash can't escape it
+// inside code, so replace with an apostrophe). Inputs are model-influenced, so
+// this keeps a malformed file path or title from breaking the table.
func mdCell(s string) string {
s = strings.ReplaceAll(s, "\n", " ")
s = strings.ReplaceAll(s, "|", "\\|")
+ s = strings.ReplaceAll(s, "`", "'")
return strings.TrimSpace(s)
}
@@ -399,10 +454,3 @@ func plural(n int) string {
}
return "s"
}
-
-func absInt(n int) int {
- if n < 0 {
- return -n
- }
- return n
-}
diff --git a/cmd/gadfly/consensus_test.go b/cmd/gadfly/consensus_test.go
index 20a496d..595d54b 100644
--- a/cmd/gadfly/consensus_test.go
+++ b/cmd/gadfly/consensus_test.go
@@ -88,6 +88,61 @@ func TestRenderConsensusHighSeverityLoneFindingStaysHeadline(t *testing.T) {
}
}
+func TestClusterSlidingWindowMergesChain(t *testing.T) {
+ // Findings at 10, 13, 16 (each 3 apart) from three models must merge into ONE
+ // cluster โ the window slides with the span instead of anchoring at line 10.
+ models := []modelFindings{
+ {Model: "m1", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 10, Severity: "medium", Title: "t"}}},
+ {Model: "m2", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 13, Severity: "medium", Title: "t"}}},
+ {Model: "m3", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 16, Severity: "medium", Title: "t"}}},
+ }
+ clusters := clusterFindings(models)
+ if len(clusters) != 1 {
+ t.Fatalf("chain 10/13/16 should merge into 1 cluster, got %d", len(clusters))
+ }
+ if len(clusters[0].models) != 3 {
+ t.Errorf("want 3 models in the merged cluster, got %d", len(clusters[0].models))
+ }
+}
+
+func TestRenderConsensusExcludesErroredFromDenominator(t *testing.T) {
+ models := []modelFindings{
+ {Model: "m1", Verdict: "Minor issues", Markdown: "a", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
+ {Model: "m2", Verdict: "Minor issues", Markdown: "b", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
+ {Model: "broken", Verdict: "reviewer failed", Errored: true, Markdown: "boom"},
+ }
+ out := renderConsensus(models)
+ // Denominator is the 2 effective models, not 3; the failure is noted.
+ if !strings.Contains(out, "2/2") {
+ t.Errorf("errored model must be excluded from the denominator (want 2/2):\n%s", out)
+ }
+ if !strings.Contains(out, "1 failed") {
+ t.Errorf("expected a '1 failed' note:\n%s", out)
+ }
+ if !strings.Contains(out, "reviewer failed") {
+ t.Errorf("errored model should still appear (folded) as failed:\n%s", out)
+ }
+}
+
+func TestRenderConsensusLoneHighFolds(t *testing.T) {
+ // A single-model HIGH (not critical) folds โ only consensus or a lone CRITICAL
+ // earns the headline, so a lone Blocking-lens finding doesn't reintroduce noise.
+ models := []modelFindings{
+ {Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{
+ {Lens: "security", File: "a.go", Line: 1, Severity: "high", Title: "maybe-bug"}}},
+ }
+ out := renderConsensus(models)
+ head := out
+ if i := strings.Index(out, "single-model finding"); i >= 0 {
+ head = out[:i]
+ }
+ if strings.Contains(head, "maybe-bug") {
+ t.Errorf("a lone HIGH should fold, not headline:\n%s", out)
+ }
+}
+
func TestWriteAndConsolidateRoundTrip(t *testing.T) {
dir := t.TempDir()
@@ -139,6 +194,7 @@ func captureStdout(t *testing.T, fn func()) string {
t.Fatal(err)
}
os.Stdout = w
+ defer func() { os.Stdout = orig }() // restore even if fn panics
done := make(chan string)
go func() {
var sb strings.Builder
@@ -152,10 +208,10 @@ func captureStdout(t *testing.T, fn func()) string {
break
}
}
+ r.Close()
done <- sb.String()
}()
fn()
w.Close()
- os.Stdout = orig
return <-done
}
diff --git a/entrypoint.sh b/entrypoint.sh
index da812ac..46d4e76 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -235,7 +235,16 @@ case "${GADFLY_CONSOLIDATE:-auto}" in
0) CONSOLIDATE=0 ;;
*) [ "${#MODEL_LIST[@]}" -ge 2 ] && CONSOLIDATE=1 ;;
esac
-findings_file_for() { echo "${FINDINGS_DIR}/$(echo "$1" | tr -c '[:alnum:]._-' '_').json"; }
+# A model spec can contain '/' and ':' (e.g. claude-code/opus, qwen3:14b), so
+# sanitize to a flat filename โ but append a checksum of the raw spec so two
+# specs that sanitize the same (foo:bar vs foo/bar -> foo_bar) don't collide onto
+# one file and silently drop a model from the consensus.
+findings_file_for() {
+ local safe sum
+ safe="$(echo "$1" | tr -c '[:alnum:]._-' '_')"
+ sum="$(printf '%s' "$1" | cksum | cut -d' ' -f1)"
+ echo "${FINDINGS_DIR}/${safe}-${sum}.json"
+}
if [ "$CONSOLIDATE" = "1" ]; then
rm -rf "$FINDINGS_DIR"; mkdir -p "$FINDINGS_DIR"
log "consolidation ON: ${#MODEL_LIST[@]} models -> one consensus comment"
diff --git a/scripts/run.sh b/scripts/run.sh
index c692ef9..0a5fe49 100644
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -214,6 +214,16 @@ esac
# --- assemble + post final comment (with run time) --------------------------
ELAPSED="$(( $(date +%s) - START_TS ))"
DUR="$(fmt_duration "$ELAPSED")"
+
+# Consolidating: the binary writes its findings file on success. If it failed or
+# was skipped (no file, or an empty one), write a stub so this model still shows
+# up in the consensus (as failed) and an all-models-fail run still posts a
+# comment โ never silently drop a model or the whole review.
+if [ "$CONSOLIDATE" = "1" ] && [ -n "${GADFLY_FINDINGS_OUT:-}" ] && [ ! -s "${GADFLY_FINDINGS_OUT}" ]; then
+ jq -n --arg model "$MODEL" --arg provider "$MODEL_PROVIDER" --arg md "$REVIEW" \
+ '{model:$model, provider:$provider, verdict:"reviewer failed", errored:true, markdown:$md, findings:[]}' \
+ > "${GADFLY_FINDINGS_OUT}" 2>/dev/null || true
+fi
# When consolidating, the binary has written this model's findings to
# GADFLY_FINDINGS_OUT; the consensus comment is posted by entrypoint.sh after the
# whole swarm finishes, so this model posts no comment of its own.