88f74aa768
Build & push image / build-and-push (push) Successful in 9s
Co-authored-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com> Co-committed-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
457 lines
14 KiB
Go
457 lines
14 KiB
Go
package main
|
|
|
|
// Cross-model consensus consolidation. The swarm runs each model independently
|
|
// (entrypoint.sh fans them out across provider lanes); historically each model
|
|
// posted its OWN comment, so a reader faced N walls of prose that mostly agreed.
|
|
//
|
|
// Instead, every model writes its findings to a shared directory
|
|
// (GADFLY_FINDINGS_OUT, one JSON file per model), and after the whole swarm
|
|
// finishes a single consolidation pass (GADFLY_CONSOLIDATE_DIR) clusters those
|
|
// findings by location, counts how many models independently flagged each one,
|
|
// and renders ONE comment: an agreement-ranked table up top (cross-model
|
|
// agreement is the strongest real-vs-false-positive signal we have), with each
|
|
// model's full review folded below for drill-down.
|
|
//
|
|
// This file owns: the per-model artifact (modelFindings), writing it
|
|
// (writeFindingsOut), reading the directory back, clustering, and rendering the
|
|
// consensus markdown (renderConsensus). It depends only on the stdlib.
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// consensusMarker is the HTML comment that leads the consolidated consensus
// comment. entrypoint.sh keys on it to update that single comment in place on
// re-runs, the same way run.sh marks each per-model comment.
const consensusMarker = "<!-- gadfly-consensus -->"
|
|
|
|
// modelFindings is the per-model artifact written to GADFLY_FINDINGS_OUT. It
|
|
// carries enough to rebuild the consolidated comment without re-running models:
|
|
// the structured findings (for clustering) plus the full rendered review (for
|
|
// the folded per-model drill-down).
|
|
type modelFindings struct {
|
|
Model string `json:"model"`
|
|
Provider string `json:"provider"`
|
|
Verdict string `json:"verdict"` // worst lens verdict, as a label
|
|
Errored bool `json:"errored"` // the model produced no usable review (every lens failed, or the run crashed)
|
|
Markdown string `json:"markdown"` // full rendered per-model review (findings block already stripped)
|
|
Findings []outFinding `json:"findings"`
|
|
}
|
|
|
|
// outFinding is a single finding as stored in the per-model artifact: the
// fields of a `finding`, flattened, together with the lens that raised it.
type outFinding struct {
	Lens       string `json:"lens"`       // lens that reported the finding
	File       string `json:"file"`       // file the finding points at
	Line       int    `json:"line"`       // line in File; values <= 0 are rendered without a line
	Severity   string `json:"severity"`   // severity word (see sevRank for the recognized ones)
	Confidence string `json:"confidence"` // confidence as reported for the finding
	Title      string `json:"title"`      // one-line summary shown in the consensus table
	Detail     string `json:"detail"`     // longer explanation
}
|
|
|
|
// collectFindings returns a lens result's findings with severity always filled
|
|
// in: per-finding when the structured block supplied it, else derived from the
|
|
// lens verdict (so heuristic-scraped findings still carry a canonical word). A
|
|
// clean or errored lens yields nothing. Shared by the telemetry emit and the
|
|
// per-model findings file so both agree on what a finding is.
|
|
func collectFindings(r specialistResult) []finding {
|
|
if r.errored || r.verdict == verdictClean {
|
|
return nil
|
|
}
|
|
fs := extractStructuredFindingsOrScrape(r)
|
|
lensSev := r.verdict.severity()
|
|
for i := range fs {
|
|
if fs[i].severity == "" {
|
|
fs[i].severity = lensSev
|
|
}
|
|
}
|
|
return fs
|
|
}
|
|
|
|
// writeFindingsOut writes this model's findings + rendered review to
|
|
// GADFLY_FINDINGS_OUT for the later consolidation pass. No-op unless the env is
|
|
// set. Best-effort: any error is logged to stderr and never affects the review
|
|
// (it runs after the markdown is already on stdout).
|
|
func writeFindingsOut(results []specialistResult) {
|
|
path := strings.TrimSpace(os.Getenv("GADFLY_FINDINGS_OUT"))
|
|
if path == "" {
|
|
return
|
|
}
|
|
mf := modelFindings{
|
|
Model: strings.TrimSpace(os.Getenv("GADFLY_MODEL")),
|
|
Provider: modelProvider(),
|
|
Verdict: worstVerdict(results).label(),
|
|
Errored: allErrored(results),
|
|
Markdown: renderConsolidated(results),
|
|
}
|
|
for _, r := range results {
|
|
for _, f := range collectFindings(r) {
|
|
mf.Findings = append(mf.Findings, outFinding{
|
|
Lens: r.spec.Name,
|
|
File: f.file,
|
|
Line: f.line,
|
|
Severity: f.severity,
|
|
Confidence: f.confidence,
|
|
Title: f.title,
|
|
Detail: f.detail,
|
|
})
|
|
}
|
|
}
|
|
data, err := json.Marshal(mf)
|
|
if err != nil {
|
|
fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err)
|
|
return
|
|
}
|
|
// Defensive: make sure the parent dir exists (entrypoint creates it, but a
|
|
// missing dir would otherwise silently drop this model from the consensus).
|
|
if dir := filepath.Dir(path); dir != "" {
|
|
_ = os.MkdirAll(dir, 0o755)
|
|
}
|
|
if err := os.WriteFile(path, data, 0o644); err != nil {
|
|
fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err)
|
|
}
|
|
}
|
|
|
|
// allErrored reports whether every lens of a review failed (so the model
|
|
// produced no usable findings). Such a model is recorded but excluded from the
|
|
// consensus agreement denominator — counting it would dilute every ratio with a
|
|
// model that never actually reviewed.
|
|
func allErrored(results []specialistResult) bool {
|
|
if len(results) == 0 {
|
|
return true
|
|
}
|
|
for _, r := range results {
|
|
if !r.errored {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// runConsolidate is the consolidation entry point (GADFLY_CONSOLIDATE_DIR set):
|
|
// read every per-model artifact in the directory, render the consensus comment
|
|
// to stdout. Errors are fatal to THIS process only — entrypoint.sh treats a
|
|
// failed consolidation as advisory and falls back to per-model comments.
|
|
func runConsolidate() error {
|
|
dir := strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR"))
|
|
if dir == "" {
|
|
return errors.New("GADFLY_CONSOLIDATE_DIR is empty")
|
|
}
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
return fmt.Errorf("read consolidate dir: %w", err)
|
|
}
|
|
var models []modelFindings
|
|
for _, e := range entries {
|
|
if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
|
|
continue
|
|
}
|
|
data, err := os.ReadFile(filepath.Join(dir, e.Name()))
|
|
if err != nil {
|
|
fmt.Fprintln(os.Stderr, "gadfly: read", e.Name(), err)
|
|
continue
|
|
}
|
|
var mf modelFindings
|
|
if err := json.Unmarshal(data, &mf); err != nil {
|
|
fmt.Fprintln(os.Stderr, "gadfly: parse", e.Name(), err)
|
|
continue
|
|
}
|
|
if strings.TrimSpace(mf.Model) == "" {
|
|
continue
|
|
}
|
|
models = append(models, mf)
|
|
}
|
|
if len(models) == 0 {
|
|
return errors.New("no model findings to consolidate")
|
|
}
|
|
// Lead with the marker so entrypoint.sh can upsert this comment in place
|
|
// (same pattern as run.sh's per-model marker); it appends the advisory footer.
|
|
fmt.Println(consensusMarker)
|
|
fmt.Println(renderConsensus(models))
|
|
return nil
|
|
}
|
|
|
|
// cluster collects the findings, possibly from several models, that are taken
// to describe one issue: they share a file and each lies within lineTolerance
// of the span the cluster covered when it joined. Because the span
// [line, maxLine] widens with every member, a chain of nearby findings keeps
// merging rather than splitting once it drifts away from the first line.
type cluster struct {
	file string

	// line is the representative (smallest) line; maxLine is the largest line
	// in the cluster, i.e. the upper edge of the span.
	line    int
	maxLine int

	severity string
	title    string

	// models and lenses are sets: the models that reported the issue and the
	// lenses it was reported under.
	models map[string]bool
	lenses map[string]bool
}
|
|
|
|
// findingRef is one model's finding (carrying which model reported it), used
|
|
// while grouping findings into clusters.
|
|
type findingRef struct {
|
|
f outFinding
|
|
model string
|
|
}
|
|
|
|
// lineTolerance is how far, in lines, a finding may sit outside a cluster's
// current span (same file) and still count as the same issue. Models often
// cite a line or two apart for one problem.
const lineTolerance = 3
|
|
|
|
// renderConsensus builds the single consolidated comment body from every model's
|
|
// findings. It does NOT emit the marker or advisory footer — entrypoint.sh wraps
|
|
// it (mirroring run.sh's per-model framing).
|
|
func renderConsensus(models []modelFindings) string {
|
|
// effective = models that actually produced a review. Errored models are
|
|
// shown (folded, below) but excluded from the agreement denominator so a
|
|
// failed model doesn't dilute every ratio.
|
|
effective := 0
|
|
for _, m := range models {
|
|
if !m.Errored {
|
|
effective++
|
|
}
|
|
}
|
|
errored := len(models) - effective
|
|
clusters := clusterFindings(models)
|
|
|
|
worst := verdictClean
|
|
for _, m := range models {
|
|
if v := parseVerdict(m.Verdict); v > worst {
|
|
worst = v
|
|
}
|
|
}
|
|
|
|
// Partition in one pass: "headline" findings (multi-model agreement, OR a
|
|
// lone CRITICAL) vs folded "single-model" lower-confidence findings. Also
|
|
// count multi-model agreements for the summary line.
|
|
var headline, folded []cluster
|
|
agreed := 0
|
|
for _, c := range clusters {
|
|
if len(c.models) >= 2 {
|
|
agreed++
|
|
}
|
|
if len(c.models) >= 2 || sevRank(c.severity) >= sevRank("critical") {
|
|
headline = append(headline, c)
|
|
} else {
|
|
folded = append(folded, c)
|
|
}
|
|
}
|
|
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "## 🪰 Gadfly review — consensus across %d model%s", effective, plural(effective))
|
|
if errored > 0 {
|
|
fmt.Fprintf(&b, " (%d failed)", errored)
|
|
}
|
|
b.WriteString("\n\n")
|
|
fmt.Fprintf(&b, "**Verdict: %s** · %d finding%s (%d with multi-model agreement)\n",
|
|
worst.label(), len(clusters), plural(len(clusters)), agreed)
|
|
|
|
if len(headline) > 0 {
|
|
b.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n")
|
|
for _, c := range headline {
|
|
fmt.Fprintf(&b, "| %s | %s | `%s` | %d/%d | %s |\n",
|
|
sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
|
|
len(c.models), effective, mdCell(lensList(c.lenses)))
|
|
}
|
|
} else if len(clusters) == 0 {
|
|
b.WriteString("\nNo material issues found by consensus.\n")
|
|
}
|
|
// else: only single-model findings — they're shown folded below, so don't
|
|
// claim "no material issues" (there are some, just none with consensus).
|
|
|
|
if len(folded) > 0 {
|
|
fmt.Fprintf(&b, "\n<details><summary>%d single-model finding%s (lower confidence)</summary>\n\n",
|
|
len(folded), plural(len(folded)))
|
|
b.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n")
|
|
for _, c := range folded {
|
|
fmt.Fprintf(&b, "| %s | %s | `%s` | %s | %s |\n",
|
|
sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
|
|
mdCell(oneModel(c.models)), mdCell(lensList(c.lenses)))
|
|
}
|
|
b.WriteString("\n</details>\n")
|
|
}
|
|
|
|
// Per-model full reviews, folded for drill-down (nothing is lost).
|
|
b.WriteString("\n<details><summary>Per-model detail</summary>\n")
|
|
for _, m := range models {
|
|
body := strings.TrimSpace(m.Markdown)
|
|
if body == "" {
|
|
body = "_(no output)_"
|
|
}
|
|
verdict := m.Verdict
|
|
if m.Errored {
|
|
verdict = "⚠️ reviewer failed"
|
|
}
|
|
fmt.Fprintf(&b, "\n<details><summary><b>%s</b> (%s) — %s</summary>\n\n%s\n\n</details>\n",
|
|
mdCell(m.Model), mdCell(m.Provider), verdict, body)
|
|
}
|
|
b.WriteString("\n</details>")
|
|
return b.String()
|
|
}
|
|
|
|
// clusterFindings groups every model's findings into cross-model clusters,
|
|
// sorted by agreement (desc), then severity (desc), then location.
|
|
func clusterFindings(models []modelFindings) []cluster {
|
|
// Group by file, then greedily merge by line proximity.
|
|
byFile := map[string][]findingRef{}
|
|
for _, m := range models {
|
|
for _, f := range m.Findings {
|
|
if strings.TrimSpace(f.File) == "" {
|
|
continue
|
|
}
|
|
byFile[f.File] = append(byFile[f.File], findingRef{f, m.Model})
|
|
}
|
|
}
|
|
|
|
var clusters []cluster
|
|
for file, items := range byFile {
|
|
sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line })
|
|
// Cluster within THIS file only (clusters never span files), so the inner
|
|
// scan is over same-file clusters, not every cluster seen so far.
|
|
var fileClusters []cluster
|
|
for _, it := range items {
|
|
placed := false
|
|
for ci := range fileClusters {
|
|
c := &fileClusters[ci]
|
|
// Join if the line falls within the cluster's span, widened by the
|
|
// tolerance on both edges — so the window slides as the span grows.
|
|
if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance {
|
|
mergeIntoCluster(c, it.f, it.model)
|
|
placed = true
|
|
break
|
|
}
|
|
}
|
|
if !placed {
|
|
c := cluster{
|
|
file: file,
|
|
line: it.f.Line,
|
|
maxLine: it.f.Line,
|
|
severity: it.f.Severity,
|
|
title: it.f.Title,
|
|
models: map[string]bool{},
|
|
lenses: map[string]bool{},
|
|
}
|
|
mergeIntoCluster(&c, it.f, it.model)
|
|
fileClusters = append(fileClusters, c)
|
|
}
|
|
}
|
|
clusters = append(clusters, fileClusters...)
|
|
}
|
|
|
|
sort.SliceStable(clusters, func(i, j int) bool {
|
|
if len(clusters[i].models) != len(clusters[j].models) {
|
|
return len(clusters[i].models) > len(clusters[j].models)
|
|
}
|
|
if sevRank(clusters[i].severity) != sevRank(clusters[j].severity) {
|
|
return sevRank(clusters[i].severity) > sevRank(clusters[j].severity)
|
|
}
|
|
if clusters[i].file != clusters[j].file {
|
|
return clusters[i].file < clusters[j].file
|
|
}
|
|
return clusters[i].line < clusters[j].line
|
|
})
|
|
return clusters
|
|
}
|
|
|
|
// mergeIntoCluster folds one finding into a cluster: union the model/lens sets,
|
|
// widen the [line,maxLine] span, and keep the highest-severity report's title.
|
|
func mergeIntoCluster(c *cluster, f outFinding, model string) {
|
|
if model != "" {
|
|
c.models[model] = true
|
|
}
|
|
if f.Lens != "" {
|
|
c.lenses[f.Lens] = true
|
|
}
|
|
if f.Line > 0 && (c.line == 0 || f.Line < c.line) {
|
|
c.line = f.Line
|
|
}
|
|
if f.Line > c.maxLine {
|
|
c.maxLine = f.Line
|
|
}
|
|
if sevRank(f.Severity) > sevRank(c.severity) {
|
|
c.severity = f.Severity
|
|
if strings.TrimSpace(f.Title) != "" {
|
|
c.title = f.Title
|
|
}
|
|
}
|
|
}
|
|
|
|
// sevRank maps a canonical severity word to a number for sorting and
// comparison: trivial=1, small=2, medium=3, high=4, critical=5. Matching
// ignores case and surrounding whitespace; anything else ranks 0.
func sevRank(s string) int {
	// Ascending order: a word's rank is its position plus one.
	ladder := [...]string{"trivial", "small", "medium", "high", "critical"}

	word := strings.ToLower(strings.TrimSpace(s))
	for i, name := range ladder {
		if word == name {
			return i + 1
		}
	}
	return 0
}
|
|
|
|
// sevIcon picks the at-a-glance severity badge shown in the consensus table.
// Critical and high share the red badge, medium is orange, small is yellow,
// and everything else (including trivial and unknown words) is white.
// Matching ignores case and surrounding whitespace.
func sevIcon(s string) string {
	level := strings.ToLower(strings.TrimSpace(s))
	if level == "critical" || level == "high" {
		return "🔴"
	}
	if level == "medium" {
		return "🟠"
	}
	if level == "small" {
		return "🟡"
	}
	return "⚪"
}
|
|
|
|
// location formats where a finding points: "file:line" when the line is
// positive, otherwise just the file.
func location(file string, line int) string {
	if line <= 0 {
		return file
	}
	return fmt.Sprintf("%s:%d", file, line)
}
|
|
|
|
// lensList renders the keys of a lens set as a comma-separated list in sorted
// order, so the output does not depend on map iteration order. Every key is
// listed, whatever its value.
func lensList(lenses map[string]bool) string {
	names := make([]string, len(lenses))
	next := 0
	for name := range lenses {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return strings.Join(names, ", ")
}
|
|
|
|
// oneModel names the model of a single-model cluster. For the expected case
// of exactly one entry it returns that entry. Should the set ever hold more
// than one, it returns the lexicographically smallest name rather than
// whichever key map iteration happens to yield first, so the rendered comment
// stays stable across runs. An empty or nil set yields "".
func oneModel(models map[string]bool) string {
	best, found := "", false
	for m := range models {
		if !found || m < best {
			best, found = m, true
		}
	}
	return best
}
|
|
|
|
// mdCell makes a string safe for a one-line markdown table cell. Inputs are
// model-influenced, so a malformed file path or title must not be able to
// break the table:
//
//   - every line ending (CRLF, a bare carriage return, or a newline) becomes
//     a single space, since any of them would end the table row;
//   - pipes are backslash-escaped, because they delimit columns;
//   - backticks become apostrophes: a stray one would break an inline-code
//     span, and a backslash cannot escape it inside code;
//   - surrounding whitespace is trimmed.
func mdCell(s string) string {
	// CRLF first, so it collapses to one space instead of two.
	s = strings.ReplaceAll(s, "\r\n", " ")
	s = strings.ReplaceAll(s, "\r", " ")
	s = strings.ReplaceAll(s, "\n", " ")
	s = strings.ReplaceAll(s, "|", "\\|")
	s = strings.ReplaceAll(s, "`", "'")
	return strings.TrimSpace(s)
}
|
|
|
|
// plural returns the suffix that pluralizes a noun for count n: nothing for
// exactly one, "s" for every other count (including zero and negatives).
func plural(n int) string {
	switch n {
	case 1:
		return ""
	default:
		return "s"
	}
}
|