diff --git a/cmd/gadfly/consensus.go b/cmd/gadfly/consensus.go index ddb44d0..be04092 100644 --- a/cmd/gadfly/consensus.go +++ b/cmd/gadfly/consensus.go @@ -38,6 +38,7 @@ type modelFindings struct { Model string `json:"model"` Provider string `json:"provider"` Verdict string `json:"verdict"` // worst lens verdict, as a label + Errored bool `json:"errored"` // the model produced no usable review (every lens failed, or the run crashed) Markdown string `json:"markdown"` // full rendered per-model review (findings block already stripped) Findings []outFinding `json:"findings"` } @@ -86,6 +87,7 @@ func writeFindingsOut(results []specialistResult) { Model: strings.TrimSpace(os.Getenv("GADFLY_MODEL")), Provider: modelProvider(), Verdict: worstVerdict(results).label(), + Errored: allErrored(results), Markdown: renderConsolidated(results), } for _, r := range results { @@ -106,11 +108,32 @@ func writeFindingsOut(results []specialistResult) { fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err) return } + // Defensive: make sure the parent dir exists (entrypoint creates it, but a + // missing dir would otherwise silently drop this model from the consensus). + if dir := filepath.Dir(path); dir != "" { + _ = os.MkdirAll(dir, 0o755) + } if err := os.WriteFile(path, data, 0o644); err != nil { fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err) } } +// allErrored reports whether every lens of a review failed (so the model +// produced no usable findings). Such a model is recorded but excluded from the +// consensus agreement denominator โ€” counting it would dilute every ratio with a +// model that never actually reviewed. +func allErrored(results []specialistResult) bool { + if len(results) == 0 { + return true + } + for _, r := range results { + if !r.errored { + return false + } + } + return true +} + // runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set): // read every per-model artifact in the directory, render the consensus comment // to stdout. Errors are fatal to THIS process only โ€” entrypoint.sh treats a @@ -155,26 +178,45 @@ func runConsolidate() error { } // cluster is a group of findings (across models) judged to be the same issue: -// same file, lines within lineTolerance of each other. +// same file, lines within lineTolerance of the cluster's current span. The span +// [line,maxLine] slides as members join, so a chain of nearby findings merges +// instead of splitting once it drifts past the first line. type cluster struct { file string line int // representative (smallest) line + maxLine int // largest line in the cluster โ€” the span's upper edge severity string title string - detail string models map[string]bool lenses map[string]bool } -// lineTolerance: findings in the same file within this many lines of each other -// are treated as the same issue (models often cite a line or two apart). +// findingRef is one model's finding (carrying which model reported it), used +// while grouping findings into clusters. +type findingRef struct { + f outFinding + model string +} + +// lineTolerance: a finding in the same file within this many lines of a +// cluster's current span is treated as the same issue (models often cite a line +// or two apart). const lineTolerance = 3 // renderConsensus builds the single consolidated comment body from every model's // findings. It does NOT emit the marker or advisory footer โ€” entrypoint.sh wraps // it (mirroring run.sh's per-model framing). func renderConsensus(models []modelFindings) string { - total := len(models) + // effective = models that actually produced a review. Errored models are + // shown (folded, below) but excluded from the agreement denominator so a + // failed model doesn't dilute every ratio. + effective := 0 + for _, m := range models { + if !m.Errored { + effective++ + } + } + errored := len(models) - effective clusters := clusterFindings(models) worst := verdictClean @@ -184,11 +226,16 @@ func renderConsensus(models []modelFindings) string { } } - // Partition: "headline" findings (multi-model agreement, OR a high/critical - // even from one model) vs folded "single-model" lower-severity noise. + // Partition in one pass: "headline" findings (multi-model agreement, OR a + // lone CRITICAL) vs folded "single-model" lower-confidence findings. Also + // count multi-model agreements for the summary line. var headline, folded []cluster + agreed := 0 for _, c := range clusters { - if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("high") { + if len(c.models) >= 2 { + agreed++ + } + if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") { headline = append(headline, c) } else { folded = append(folded, c) @@ -196,13 +243,11 @@ func renderConsensus(models []modelFindings) string { } var b strings.Builder - fmt.Fprintf(&b, "## ๐Ÿชฐ Gadfly review โ€” consensus across %d model%s\n\n", total, plural(total)) - agreed := 0 - for _, c := range clusters { - if len(c.models) >= 2 { - agreed++ - } + fmt.Fprintf(&b, "## ๐Ÿชฐ Gadfly review โ€” consensus across %d model%s", effective, plural(effective)) + if errored > 0 { + fmt.Fprintf(&b, " (%d failed)", errored) } + b.WriteString("\n\n") fmt.Fprintf(&b, "**Verdict: %s** ยท %d finding%s (%d with multi-model agreement)\n", worst.label(), len(clusters), plural(len(clusters)), agreed) @@ -210,12 +255,14 @@ func renderConsensus(models []modelFindings) string { b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n") for _, c := range headline { fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n", - sevIcon(c.severity), mdCell(c.title), location(c.file, c.line), - len(c.models), total, mdCell(lensList(c.lenses))) + sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)), + len(c.models), effective, mdCell(lensList(c.lenses))) } - } else { + } else if len(clusters) == 0 { b.WriteString("\nNo material issues found by consensus.\n") } + // else: only single-model findings โ€” they're shown folded below, so don't + // claim "no material issues" (there are some, just none with consensus). if len(folded) > 0 { fmt.Fprintf(&b, "\n
%d single-model finding%s (lower confidence)\n\n", @@ -223,7 +270,7 @@ func renderConsensus(models []modelFindings) string { b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n") for _, c := range folded { fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n", - sevIcon(c.severity), mdCell(c.title), location(c.file, c.line), + sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)), mdCell(oneModel(c.models)), mdCell(lensList(c.lenses))) } b.WriteString("\n
\n") @@ -236,8 +283,12 @@ func renderConsensus(models []modelFindings) string { if body == "" { body = "_(no output)_" } + verdict := m.Verdict + if m.Errored { + verdict = "โš ๏ธ reviewer failed" + } fmt.Fprintf(&b, "\n
%s (%s) โ€” %s\n\n%s\n\n
\n", - mdCell(m.Model), mdCell(m.Provider), m.Verdict, body) + mdCell(m.Model), mdCell(m.Provider), verdict, body) } b.WriteString("\n") return b.String() @@ -247,30 +298,29 @@ func renderConsensus(models []modelFindings) string { // sorted by agreement (desc), then severity (desc), then location. func clusterFindings(models []modelFindings) []cluster { // Group by file, then greedily merge by line proximity. - byFile := map[string][]struct { - f outFinding - model string - }{} + byFile := map[string][]findingRef{} for _, m := range models { for _, f := range m.Findings { if strings.TrimSpace(f.File) == "" { continue } - byFile[f.File] = append(byFile[f.File], struct { - f outFinding - model string - }{f, m.Model}) + byFile[f.File] = append(byFile[f.File], findingRef{f, m.Model}) } } var clusters []cluster for file, items := range byFile { sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line }) + // Cluster within THIS file only (clusters never span files), so the inner + // scan is over same-file clusters, not every cluster seen so far. + var fileClusters []cluster for _, it := range items { placed := false - for ci := range clusters { - c := &clusters[ci] - if c.file == file && absInt(c.line-it.f.Line) <= lineTolerance { + for ci := range fileClusters { + c := &fileClusters[ci] + // Join if the line falls within the cluster's span, widened by the + // tolerance on both edges โ€” so the window slides as the span grows. + if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance { mergeIntoCluster(c, it.f, it.model) placed = true break @@ -280,16 +330,17 @@ func clusterFindings(models []modelFindings) []cluster { c := cluster{ file: file, line: it.f.Line, + maxLine: it.f.Line, severity: it.f.Severity, title: it.f.Title, - detail: it.f.Detail, models: map[string]bool{}, lenses: map[string]bool{}, } mergeIntoCluster(&c, it.f, it.model) - clusters = append(clusters, c) + fileClusters = append(fileClusters, c) } } + clusters = append(clusters, fileClusters...) } sort.SliceStable(clusters, func(i, j int) bool { @@ -308,7 +359,7 @@ func clusterFindings(models []modelFindings) []cluster { } // mergeIntoCluster folds one finding into a cluster: union the model/lens sets, -// keep the smallest line, and keep the highest-severity report's title/detail. +// widen the [line,maxLine] span, and keep the highest-severity report's title. func mergeIntoCluster(c *cluster, f outFinding, model string) { if model != "" { c.models[model] = true @@ -319,14 +370,14 @@ func mergeIntoCluster(c *cluster, f outFinding, model string) { if f.Line > 0 && (c.line == 0 || f.Line < c.line) { c.line = f.Line } + if f.Line > c.maxLine { + c.maxLine = f.Line + } if sevRank(f.Severity) > sevRank(c.severity) { c.severity = f.Severity if strings.TrimSpace(f.Title) != "" { c.title = f.Title } - if strings.TrimSpace(f.Detail) != "" { - c.detail = f.Detail - } } } @@ -385,11 +436,15 @@ func oneModel(models map[string]bool) string { return "" } -// mdCell makes a string safe for a one-line markdown table cell: escape pipes, -// collapse newlines. +// mdCell makes a string safe for a one-line markdown table cell: collapse +// newlines, escape pipes (which delimit columns), and neutralize backticks +// (a stray one would break an inline-code span โ€” a backslash can't escape it +// inside code, so replace with an apostrophe). Inputs are model-influenced, so +// this keeps a malformed file path or title from breaking the table. func mdCell(s string) string { s = strings.ReplaceAll(s, "\n", " ") s = strings.ReplaceAll(s, "|", "\\|") + s = strings.ReplaceAll(s, "`", "'") return strings.TrimSpace(s) } @@ -399,10 +454,3 @@ func plural(n int) string { } return "s" } - -func absInt(n int) int { - if n < 0 { - return -n - } - return n -} diff --git a/cmd/gadfly/consensus_test.go b/cmd/gadfly/consensus_test.go index 20a496d..595d54b 100644 --- a/cmd/gadfly/consensus_test.go +++ b/cmd/gadfly/consensus_test.go @@ -88,6 +88,61 @@ func TestRenderConsensusHighSeverityLoneFindingStaysHeadline(t *testing.T) { } } +func TestClusterSlidingWindowMergesChain(t *testing.T) { + // Findings at 10, 13, 16 (each 3 apart) from three models must merge into ONE + // cluster โ€” the window slides with the span instead of anchoring at line 10. + models := []modelFindings{ + {Model: "m1", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 10, Severity: "medium", Title: "t"}}}, + {Model: "m2", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 13, Severity: "medium", Title: "t"}}}, + {Model: "m3", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 16, Severity: "medium", Title: "t"}}}, + } + clusters := clusterFindings(models) + if len(clusters) != 1 { + t.Fatalf("chain 10/13/16 should merge into 1 cluster, got %d", len(clusters)) + } + if len(clusters[0].models) != 3 { + t.Errorf("want 3 models in the merged cluster, got %d", len(clusters[0].models)) + } +} + +func TestRenderConsensusExcludesErroredFromDenominator(t *testing.T) { + models := []modelFindings{ + {Model: "m1", Verdict: "Minor issues", Markdown: "a", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}}, + {Model: "m2", Verdict: "Minor issues", Markdown: "b", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}}, + {Model: "broken", Verdict: "reviewer failed", Errored: true, Markdown: "boom"}, + } + out := renderConsensus(models) + // Denominator is the 2 effective models, not 3; the failure is noted. + if !strings.Contains(out, "2/2") { + t.Errorf("errored model must be excluded from the denominator (want 2/2):\n%s", out) + } + if !strings.Contains(out, "1 failed") { + t.Errorf("expected a '1 failed' note:\n%s", out) + } + if !strings.Contains(out, "reviewer failed") { + t.Errorf("errored model should still appear (folded) as failed:\n%s", out) + } +} + +func TestRenderConsensusLoneHighFolds(t *testing.T) { + // A single-model HIGH (not critical) folds โ€” only consensus or a lone CRITICAL + // earns the headline, so a lone Blocking-lens finding doesn't reintroduce noise. + models := []modelFindings{ + {Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{ + {Lens: "security", File: "a.go", Line: 1, Severity: "high", Title: "maybe-bug"}}}, + } + out := renderConsensus(models) + head := out + if i := strings.Index(out, "single-model finding"); i >= 0 { + head = out[:i] + } + if strings.Contains(head, "maybe-bug") { + t.Errorf("a lone HIGH should fold, not headline:\n%s", out) + } +} + func TestWriteAndConsolidateRoundTrip(t *testing.T) { dir := t.TempDir() @@ -139,6 +194,7 @@ func captureStdout(t *testing.T, fn func()) string { t.Fatal(err) } os.Stdout = w + defer func() { os.Stdout = orig }() // restore even if fn panics done := make(chan string) go func() { var sb strings.Builder @@ -152,10 +208,10 @@ func captureStdout(t *testing.T, fn func()) string { break } } + r.Close() done <- sb.String() }() fn() w.Close() - os.Stdout = orig return <-done } diff --git a/entrypoint.sh b/entrypoint.sh index da812ac..46d4e76 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -235,7 +235,16 @@ case "${GADFLY_CONSOLIDATE:-auto}" in 0) CONSOLIDATE=0 ;; *) [ "${#MODEL_LIST[@]}" -ge 2 ] && CONSOLIDATE=1 ;; esac -findings_file_for() { echo "${FINDINGS_DIR}/$(echo "$1" | tr -c '[:alnum:]._-' '_').json"; } +# A model spec can contain '/' and ':' (e.g. claude-code/opus, qwen3:14b), so +# sanitize to a flat filename โ€” but append a checksum of the raw spec so two +# specs that sanitize the same (foo:bar vs foo/bar -> foo_bar) don't collide onto +# one file and silently drop a model from the consensus. +findings_file_for() { + local safe sum + safe="$(echo "$1" | tr -c '[:alnum:]._-' '_')" + sum="$(printf '%s' "$1" | cksum | cut -d' ' -f1)" + echo "${FINDINGS_DIR}/${safe}-${sum}.json" +} if [ "$CONSOLIDATE" = "1" ]; then rm -rf "$FINDINGS_DIR"; mkdir -p "$FINDINGS_DIR" log "consolidation ON: ${#MODEL_LIST[@]} models -> one consensus comment" diff --git a/scripts/run.sh b/scripts/run.sh index c692ef9..0a5fe49 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -214,6 +214,16 @@ esac # --- assemble + post final comment (with run time) -------------------------- ELAPSED="$(( $(date +%s) - START_TS ))" DUR="$(fmt_duration "$ELAPSED")" + +# Consolidating: the binary writes its findings file on success. If it failed or +# was skipped (no file, or an empty one), write a stub so this model still shows +# up in the consensus (as failed) and an all-models-fail run still posts a +# comment โ€” never silently drop a model or the whole review. +if [ "$CONSOLIDATE" = "1" ] && [ -n "${GADFLY_FINDINGS_OUT:-}" ] && [ ! -s "${GADFLY_FINDINGS_OUT}" ]; then + jq -n --arg model "$MODEL" --arg provider "$MODEL_PROVIDER" --arg md "$REVIEW" \ + '{model:$model, provider:$provider, verdict:"reviewer failed", errored:true, markdown:$md, findings:[]}' \ + > "${GADFLY_FINDINGS_OUT}" 2>/dev/null || true +fi # When consolidating, the binary has written this model's findings to # GADFLY_FINDINGS_OUT; the consensus comment is posted by entrypoint.sh after the # whole swarm finishes, so this model posts no comment of its own.