fix: address swarm review of consensus consolidation (PR #17)
Build & push image / build-and-push (pull_request) Successful in 6s
Build & push image / build-and-push (pull_request) Successful in 6s
From PR #17's own adversarial review (now running on the Phase-1 image — note the much cleaner, less-fragmented findings). Graded in gadfly-reports. - Never lose output (advisory invariant): run.sh writes a stub findings file when a model fails/skips under consolidation, so a failed model still shows (as failed) and an all-models-fail run still posts a comment. - Errored models excluded from the agreement denominator (modelFindings.Errored + allErrored); shown folded as "reviewer failed", not counted as clean. - Sliding cluster window: span [line,maxLine] widened by tolerance on both edges, so chains of nearby findings merge instead of splitting (fixes agreement under-counting). Clustering is now per-file (no global O(n²) scan). - Lone-finding headline threshold raised high -> critical (matches the docs); only consensus or a lone critical earns the headline, the rest fold. - findings_file_for appends a cksum so foo:bar vs foo/bar can't collide. - mdCell escapes the location cell + neutralizes backticks (table-injection); "No material issues" no longer shown when folded findings exist. - Removed dead cluster.detail; named findingRef type; single partition/agreed pass; writeFindingsOut MkdirAll; captureStdout defer-restore + close. Tests added: sliding-window chain merge, errored-denominator exclusion, lone HIGH folds. gofmt/vet/bash -n clean. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+93
-45
@@ -38,6 +38,7 @@ type modelFindings struct {
|
||||
Model string `json:"model"`
|
||||
Provider string `json:"provider"`
|
||||
Verdict string `json:"verdict"` // worst lens verdict, as a label
|
||||
Errored bool `json:"errored"` // the model produced no usable review (every lens failed, or the run crashed)
|
||||
Markdown string `json:"markdown"` // full rendered per-model review (findings block already stripped)
|
||||
Findings []outFinding `json:"findings"`
|
||||
}
|
||||
@@ -86,6 +87,7 @@ func writeFindingsOut(results []specialistResult) {
|
||||
Model: strings.TrimSpace(os.Getenv("GADFLY_MODEL")),
|
||||
Provider: modelProvider(),
|
||||
Verdict: worstVerdict(results).label(),
|
||||
Errored: allErrored(results),
|
||||
Markdown: renderConsolidated(results),
|
||||
}
|
||||
for _, r := range results {
|
||||
@@ -106,11 +108,32 @@ func writeFindingsOut(results []specialistResult) {
|
||||
fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err)
|
||||
return
|
||||
}
|
||||
// Defensive: make sure the parent dir exists (entrypoint creates it, but a
|
||||
// missing dir would otherwise silently drop this model from the consensus).
|
||||
if dir := filepath.Dir(path); dir != "" {
|
||||
_ = os.MkdirAll(dir, 0o755)
|
||||
}
|
||||
if err := os.WriteFile(path, data, 0o644); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err)
|
||||
}
|
||||
}
|
||||
|
||||
// allErrored reports whether every lens of a review failed (so the model
|
||||
// produced no usable findings). Such a model is recorded but excluded from the
|
||||
// consensus agreement denominator — counting it would dilute every ratio with a
|
||||
// model that never actually reviewed.
|
||||
func allErrored(results []specialistResult) bool {
|
||||
if len(results) == 0 {
|
||||
return true
|
||||
}
|
||||
for _, r := range results {
|
||||
if !r.errored {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set):
|
||||
// read every per-model artifact in the directory, render the consensus comment
|
||||
// to stdout. Errors are fatal to THIS process only — entrypoint.sh treats a
|
||||
@@ -155,26 +178,45 @@ func runConsolidate() error {
|
||||
}
|
||||
|
||||
// cluster is a group of findings (across models) judged to be the same issue:
|
||||
// same file, lines within lineTolerance of each other.
|
||||
// same file, lines within lineTolerance of the cluster's current span. The span
|
||||
// [line,maxLine] slides as members join, so a chain of nearby findings merges
|
||||
// instead of splitting once it drifts past the first line.
|
||||
type cluster struct {
|
||||
file string
|
||||
line int // representative (smallest) line
|
||||
maxLine int // largest line in the cluster — the span's upper edge
|
||||
severity string
|
||||
title string
|
||||
detail string
|
||||
models map[string]bool
|
||||
lenses map[string]bool
|
||||
}
|
||||
|
||||
// lineTolerance: findings in the same file within this many lines of each other
|
||||
// are treated as the same issue (models often cite a line or two apart).
|
||||
// findingRef is one model's finding (carrying which model reported it), used
|
||||
// while grouping findings into clusters.
|
||||
type findingRef struct {
|
||||
f outFinding
|
||||
model string
|
||||
}
|
||||
|
||||
// lineTolerance: a finding in the same file within this many lines of a
|
||||
// cluster's current span is treated as the same issue (models often cite a line
|
||||
// or two apart).
|
||||
const lineTolerance = 3
|
||||
|
||||
// renderConsensus builds the single consolidated comment body from every model's
|
||||
// findings. It does NOT emit the marker or advisory footer — entrypoint.sh wraps
|
||||
// it (mirroring run.sh's per-model framing).
|
||||
func renderConsensus(models []modelFindings) string {
|
||||
total := len(models)
|
||||
// effective = models that actually produced a review. Errored models are
|
||||
// shown (folded, below) but excluded from the agreement denominator so a
|
||||
// failed model doesn't dilute every ratio.
|
||||
effective := 0
|
||||
for _, m := range models {
|
||||
if !m.Errored {
|
||||
effective++
|
||||
}
|
||||
}
|
||||
errored := len(models) - effective
|
||||
clusters := clusterFindings(models)
|
||||
|
||||
worst := verdictClean
|
||||
@@ -184,11 +226,16 @@ func renderConsensus(models []modelFindings) string {
|
||||
}
|
||||
}
|
||||
|
||||
// Partition: "headline" findings (multi-model agreement, OR a high/critical
|
||||
// even from one model) vs folded "single-model" lower-severity noise.
|
||||
// Partition in one pass: "headline" findings (multi-model agreement, OR a
|
||||
// lone CRITICAL) vs folded "single-model" lower-confidence findings. Also
|
||||
// count multi-model agreements for the summary line.
|
||||
var headline, folded []cluster
|
||||
agreed := 0
|
||||
for _, c := range clusters {
|
||||
if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("high") {
|
||||
if len(c.models) >= 2 {
|
||||
agreed++
|
||||
}
|
||||
if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") {
|
||||
headline = append(headline, c)
|
||||
} else {
|
||||
folded = append(folded, c)
|
||||
@@ -196,13 +243,11 @@ func renderConsensus(models []modelFindings) string {
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "## 🪰 Gadfly review — consensus across %d model%s\n\n", total, plural(total))
|
||||
agreed := 0
|
||||
for _, c := range clusters {
|
||||
if len(c.models) >= 2 {
|
||||
agreed++
|
||||
}
|
||||
fmt.Fprintf(&b, "## 🪰 Gadfly review — consensus across %d model%s", effective, plural(effective))
|
||||
if errored > 0 {
|
||||
fmt.Fprintf(&b, " (%d failed)", errored)
|
||||
}
|
||||
b.WriteString("\n\n")
|
||||
fmt.Fprintf(&b, "**Verdict: %s** · %d finding%s (%d with multi-model agreement)\n",
|
||||
worst.label(), len(clusters), plural(len(clusters)), agreed)
|
||||
|
||||
@@ -210,12 +255,14 @@ func renderConsensus(models []modelFindings) string {
|
||||
b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n")
|
||||
for _, c := range headline {
|
||||
fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n",
|
||||
sevIcon(c.severity), mdCell(c.title), location(c.file, c.line),
|
||||
len(c.models), total, mdCell(lensList(c.lenses)))
|
||||
sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
|
||||
len(c.models), effective, mdCell(lensList(c.lenses)))
|
||||
}
|
||||
} else {
|
||||
} else if len(clusters) == 0 {
|
||||
b.WriteString("\nNo material issues found by consensus.\n")
|
||||
}
|
||||
// else: only single-model findings — they're shown folded below, so don't
|
||||
// claim "no material issues" (there are some, just none with consensus).
|
||||
|
||||
if len(folded) > 0 {
|
||||
fmt.Fprintf(&b, "\n<details><summary>%d single-model finding%s (lower confidence)</summary>\n\n",
|
||||
@@ -223,7 +270,7 @@ func renderConsensus(models []modelFindings) string {
|
||||
b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n")
|
||||
for _, c := range folded {
|
||||
fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n",
|
||||
sevIcon(c.severity), mdCell(c.title), location(c.file, c.line),
|
||||
sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
|
||||
mdCell(oneModel(c.models)), mdCell(lensList(c.lenses)))
|
||||
}
|
||||
b.WriteString("\n</details>\n")
|
||||
@@ -236,8 +283,12 @@ func renderConsensus(models []modelFindings) string {
|
||||
if body == "" {
|
||||
body = "_(no output)_"
|
||||
}
|
||||
verdict := m.Verdict
|
||||
if m.Errored {
|
||||
verdict = "⚠️ reviewer failed"
|
||||
}
|
||||
fmt.Fprintf(&b, "\n<details><summary><b>%s</b> (%s) — %s</summary>\n\n%s\n\n</details>\n",
|
||||
mdCell(m.Model), mdCell(m.Provider), m.Verdict, body)
|
||||
mdCell(m.Model), mdCell(m.Provider), verdict, body)
|
||||
}
|
||||
b.WriteString("\n</details>")
|
||||
return b.String()
|
||||
@@ -247,30 +298,29 @@ func renderConsensus(models []modelFindings) string {
|
||||
// sorted by agreement (desc), then severity (desc), then location.
|
||||
func clusterFindings(models []modelFindings) []cluster {
|
||||
// Group by file, then greedily merge by line proximity.
|
||||
byFile := map[string][]struct {
|
||||
f outFinding
|
||||
model string
|
||||
}{}
|
||||
byFile := map[string][]findingRef{}
|
||||
for _, m := range models {
|
||||
for _, f := range m.Findings {
|
||||
if strings.TrimSpace(f.File) == "" {
|
||||
continue
|
||||
}
|
||||
byFile[f.File] = append(byFile[f.File], struct {
|
||||
f outFinding
|
||||
model string
|
||||
}{f, m.Model})
|
||||
byFile[f.File] = append(byFile[f.File], findingRef{f, m.Model})
|
||||
}
|
||||
}
|
||||
|
||||
var clusters []cluster
|
||||
for file, items := range byFile {
|
||||
sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line })
|
||||
// Cluster within THIS file only (clusters never span files), so the inner
|
||||
// scan is over same-file clusters, not every cluster seen so far.
|
||||
var fileClusters []cluster
|
||||
for _, it := range items {
|
||||
placed := false
|
||||
for ci := range clusters {
|
||||
c := &clusters[ci]
|
||||
if c.file == file && absInt(c.line-it.f.Line) <= lineTolerance {
|
||||
for ci := range fileClusters {
|
||||
c := &fileClusters[ci]
|
||||
// Join if the line falls within the cluster's span, widened by the
|
||||
// tolerance on both edges — so the window slides as the span grows.
|
||||
if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance {
|
||||
mergeIntoCluster(c, it.f, it.model)
|
||||
placed = true
|
||||
break
|
||||
@@ -280,16 +330,17 @@ func clusterFindings(models []modelFindings) []cluster {
|
||||
c := cluster{
|
||||
file: file,
|
||||
line: it.f.Line,
|
||||
maxLine: it.f.Line,
|
||||
severity: it.f.Severity,
|
||||
title: it.f.Title,
|
||||
detail: it.f.Detail,
|
||||
models: map[string]bool{},
|
||||
lenses: map[string]bool{},
|
||||
}
|
||||
mergeIntoCluster(&c, it.f, it.model)
|
||||
clusters = append(clusters, c)
|
||||
fileClusters = append(fileClusters, c)
|
||||
}
|
||||
}
|
||||
clusters = append(clusters, fileClusters...)
|
||||
}
|
||||
|
||||
sort.SliceStable(clusters, func(i, j int) bool {
|
||||
@@ -308,7 +359,7 @@ func clusterFindings(models []modelFindings) []cluster {
|
||||
}
|
||||
|
||||
// mergeIntoCluster folds one finding into a cluster: union the model/lens sets,
|
||||
// keep the smallest line, and keep the highest-severity report's title/detail.
|
||||
// widen the [line,maxLine] span, and keep the highest-severity report's title.
|
||||
func mergeIntoCluster(c *cluster, f outFinding, model string) {
|
||||
if model != "" {
|
||||
c.models[model] = true
|
||||
@@ -319,14 +370,14 @@ func mergeIntoCluster(c *cluster, f outFinding, model string) {
|
||||
if f.Line > 0 && (c.line == 0 || f.Line < c.line) {
|
||||
c.line = f.Line
|
||||
}
|
||||
if f.Line > c.maxLine {
|
||||
c.maxLine = f.Line
|
||||
}
|
||||
if sevRank(f.Severity) > sevRank(c.severity) {
|
||||
c.severity = f.Severity
|
||||
if strings.TrimSpace(f.Title) != "" {
|
||||
c.title = f.Title
|
||||
}
|
||||
if strings.TrimSpace(f.Detail) != "" {
|
||||
c.detail = f.Detail
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -385,11 +436,15 @@ func oneModel(models map[string]bool) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// mdCell makes a string safe for a one-line markdown table cell: escape pipes,
|
||||
// collapse newlines.
|
||||
// mdCell makes a string safe for a one-line markdown table cell: collapse
|
||||
// newlines, escape pipes (which delimit columns), and neutralize backticks
|
||||
// (a stray one would break an inline-code span — a backslash can't escape it
|
||||
// inside code, so replace with an apostrophe). Inputs are model-influenced, so
|
||||
// this keeps a malformed file path or title from breaking the table.
|
||||
func mdCell(s string) string {
|
||||
s = strings.ReplaceAll(s, "\n", " ")
|
||||
s = strings.ReplaceAll(s, "|", "\\|")
|
||||
s = strings.ReplaceAll(s, "`", "'")
|
||||
return strings.TrimSpace(s)
|
||||
}
|
||||
|
||||
@@ -399,10 +454,3 @@ func plural(n int) string {
|
||||
}
|
||||
return "s"
|
||||
}
|
||||
|
||||
func absInt(n int) int {
|
||||
if n < 0 {
|
||||
return -n
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
@@ -88,6 +88,61 @@ func TestRenderConsensusHighSeverityLoneFindingStaysHeadline(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterSlidingWindowMergesChain(t *testing.T) {
|
||||
// Findings at 10, 13, 16 (each 3 apart) from three models must merge into ONE
|
||||
// cluster — the window slides with the span instead of anchoring at line 10.
|
||||
models := []modelFindings{
|
||||
{Model: "m1", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 10, Severity: "medium", Title: "t"}}},
|
||||
{Model: "m2", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 13, Severity: "medium", Title: "t"}}},
|
||||
{Model: "m3", Findings: []outFinding{{Lens: "x", File: "a.go", Line: 16, Severity: "medium", Title: "t"}}},
|
||||
}
|
||||
clusters := clusterFindings(models)
|
||||
if len(clusters) != 1 {
|
||||
t.Fatalf("chain 10/13/16 should merge into 1 cluster, got %d", len(clusters))
|
||||
}
|
||||
if len(clusters[0].models) != 3 {
|
||||
t.Errorf("want 3 models in the merged cluster, got %d", len(clusters[0].models))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderConsensusExcludesErroredFromDenominator(t *testing.T) {
|
||||
models := []modelFindings{
|
||||
{Model: "m1", Verdict: "Minor issues", Markdown: "a", Findings: []outFinding{
|
||||
{Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
|
||||
{Model: "m2", Verdict: "Minor issues", Markdown: "b", Findings: []outFinding{
|
||||
{Lens: "security", File: "a.go", Line: 9, Severity: "medium", Title: "leak"}}},
|
||||
{Model: "broken", Verdict: "reviewer failed", Errored: true, Markdown: "boom"},
|
||||
}
|
||||
out := renderConsensus(models)
|
||||
// Denominator is the 2 effective models, not 3; the failure is noted.
|
||||
if !strings.Contains(out, "2/2") {
|
||||
t.Errorf("errored model must be excluded from the denominator (want 2/2):\n%s", out)
|
||||
}
|
||||
if !strings.Contains(out, "1 failed") {
|
||||
t.Errorf("expected a '1 failed' note:\n%s", out)
|
||||
}
|
||||
if !strings.Contains(out, "reviewer failed") {
|
||||
t.Errorf("errored model should still appear (folded) as failed:\n%s", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderConsensusLoneHighFolds(t *testing.T) {
|
||||
// A single-model HIGH (not critical) folds — only consensus or a lone CRITICAL
|
||||
// earns the headline, so a lone Blocking-lens finding doesn't reintroduce noise.
|
||||
models := []modelFindings{
|
||||
{Model: "solo", Verdict: "Blocking issues found", Markdown: "x", Findings: []outFinding{
|
||||
{Lens: "security", File: "a.go", Line: 1, Severity: "high", Title: "maybe-bug"}}},
|
||||
}
|
||||
out := renderConsensus(models)
|
||||
head := out
|
||||
if i := strings.Index(out, "single-model finding"); i >= 0 {
|
||||
head = out[:i]
|
||||
}
|
||||
if strings.Contains(head, "maybe-bug") {
|
||||
t.Errorf("a lone HIGH should fold, not headline:\n%s", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteAndConsolidateRoundTrip(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
@@ -139,6 +194,7 @@ func captureStdout(t *testing.T, fn func()) string {
|
||||
t.Fatal(err)
|
||||
}
|
||||
os.Stdout = w
|
||||
defer func() { os.Stdout = orig }() // restore even if fn panics
|
||||
done := make(chan string)
|
||||
go func() {
|
||||
var sb strings.Builder
|
||||
@@ -152,10 +208,10 @@ func captureStdout(t *testing.T, fn func()) string {
|
||||
break
|
||||
}
|
||||
}
|
||||
r.Close()
|
||||
done <- sb.String()
|
||||
}()
|
||||
fn()
|
||||
w.Close()
|
||||
os.Stdout = orig
|
||||
return <-done
|
||||
}
|
||||
|
||||
+10
-1
@@ -235,7 +235,16 @@ case "${GADFLY_CONSOLIDATE:-auto}" in
|
||||
0) CONSOLIDATE=0 ;;
|
||||
*) [ "${#MODEL_LIST[@]}" -ge 2 ] && CONSOLIDATE=1 ;;
|
||||
esac
|
||||
findings_file_for() { echo "${FINDINGS_DIR}/$(echo "$1" | tr -c '[:alnum:]._-' '_').json"; }
|
||||
# A model spec can contain '/' and ':' (e.g. claude-code/opus, qwen3:14b), so
|
||||
# sanitize to a flat filename — but append a checksum of the raw spec so two
|
||||
# specs that sanitize the same (foo:bar vs foo/bar -> foo_bar) don't collide onto
|
||||
# one file and silently drop a model from the consensus.
|
||||
findings_file_for() {
|
||||
local safe sum
|
||||
safe="$(echo "$1" | tr -c '[:alnum:]._-' '_')"
|
||||
sum="$(printf '%s' "$1" | cksum | cut -d' ' -f1)"
|
||||
echo "${FINDINGS_DIR}/${safe}-${sum}.json"
|
||||
}
|
||||
if [ "$CONSOLIDATE" = "1" ]; then
|
||||
rm -rf "$FINDINGS_DIR"; mkdir -p "$FINDINGS_DIR"
|
||||
log "consolidation ON: ${#MODEL_LIST[@]} models -> one consensus comment"
|
||||
|
||||
@@ -214,6 +214,16 @@ esac
|
||||
# --- assemble + post final comment (with run time) --------------------------
|
||||
ELAPSED="$(( $(date +%s) - START_TS ))"
|
||||
DUR="$(fmt_duration "$ELAPSED")"
|
||||
|
||||
# Consolidating: the binary writes its findings file on success. If it failed or
|
||||
# was skipped (no file, or an empty one), write a stub so this model still shows
|
||||
# up in the consensus (as failed) and an all-models-fail run still posts a
|
||||
# comment — never silently drop a model or the whole review.
|
||||
if [ "$CONSOLIDATE" = "1" ] && [ -n "${GADFLY_FINDINGS_OUT:-}" ] && [ ! -s "${GADFLY_FINDINGS_OUT}" ]; then
|
||||
jq -n --arg model "$MODEL" --arg provider "$MODEL_PROVIDER" --arg md "$REVIEW" \
|
||||
'{model:$model, provider:$provider, verdict:"reviewer failed", errored:true, markdown:$md, findings:[]}' \
|
||||
> "${GADFLY_FINDINGS_OUT}" 2>/dev/null || true
|
||||
fi
|
||||
# When consolidating, the binary has written this model's findings to
|
||||
# GADFLY_FINDINGS_OUT; the consensus comment is posted by entrypoint.sh after the
|
||||
# whole swarm finishes, so this model posts no comment of its own.
|
||||
|
||||
Reference in New Issue
Block a user