gadfly/cmd/gadfly/consensus.go

package main

// Cross-model consensus consolidation. The swarm runs each model independently
// (entrypoint.sh fans them out across provider lanes); historically each model
// posted its OWN comment, so a reader faced N walls of prose that mostly agreed.
//
// Instead, every model writes its findings to a shared directory
// (GADFLY_FINDINGS_OUT, one JSON file per model), and after the whole swarm
// finishes a single consolidation pass (GADFLY_CONSOLIDATE_DIR) clusters those
// findings by location, counts how many models independently flagged each one,
// and renders ONE comment: an agreement-ranked table up top (cross-model
// agreement is the strongest real-vs-false-positive signal we have), with each
// model's full review folded below for drill-down.
//
// This file owns: the per-model artifact (modelFindings), writing it
// (writeFindingsOut), reading the directory back, clustering, and rendering the
// consensus markdown (renderConsensus). It depends only on the stdlib.

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// consensusMarker is the HTML comment runConsolidate prints as the first line
// of the consolidated comment. It tags that single comment so entrypoint.sh can
// find and upsert it in place across re-runs (mirrors run.sh's per-model
// marker).
const consensusMarker = "<!-- gadfly-consensus -->"

// modelFindings is the per-model artifact written to GADFLY_FINDINGS_OUT (by
// writeFindingsOut) and read back by runConsolidate. It carries enough to
// rebuild the consolidated comment without re-running models: the structured
// findings (for clustering) plus the full rendered review (for the folded
// per-model drill-down).
//
// Model is taken from the GADFLY_MODEL environment variable; runConsolidate
// skips any artifact whose Model is blank. Findings holds every lens's findings
// flattened into one list, each tagged with the lens that produced it.
type modelFindings struct {
	Model    string       `json:"model"`
	Provider string       `json:"provider"`
	Verdict  string       `json:"verdict"`  // worst lens verdict, as a label
	Errored  bool         `json:"errored"`  // the model produced no usable review (every lens failed, or the run crashed)
	Markdown string       `json:"markdown"` // full rendered per-model review (findings block already stripped)
	Findings []outFinding `json:"findings"`
}

// outFinding is one finding in the per-model artifact (a flattened `finding`
// plus its lens).
//
// Lens is the name of the lens that reported the finding. File and Line locate
// it; a Line of zero or less is rendered as the bare file with no line suffix
// (see location). Severity is compared through sevRank, so unrecognised words
// rank lowest. Title is what the consensus tables show; Detail is carried
// through to the cluster for the inline review.
type outFinding struct {
	Lens       string `json:"lens"`
	File       string `json:"file"`
	Line       int    `json:"line"`
	Severity   string `json:"severity"`
	Confidence string `json:"confidence"`
	Title      string `json:"title"`
	Detail     string `json:"detail"`
}

// collectFindings extracts the findings of a single lens result and guarantees
// each one carries a severity. A finding that arrived without its own severity
// inherits the one implied by the lens verdict, so heuristic-scraped findings
// still get a canonical word. Lenses that errored or came back clean contribute
// nothing (nil). Both the telemetry emit and the per-model findings file go
// through here so they agree on what counts as a finding.
func collectFindings(r specialistResult) []finding {
	if r.errored || r.verdict == verdictClean {
		return nil
	}
	found := extractStructuredFindingsOrScrape(r)
	fallback := r.verdict.severity()
	for i := range found {
		if found[i].severity != "" {
			continue // the structured block already supplied a severity
		}
		found[i].severity = fallback
	}
	return found
}

// writeFindingsOut writes this model's findings + rendered review as JSON to
// the path named by GADFLY_FINDINGS_OUT, for the later consolidation pass.
//
// It is a no-op unless the env var is set (non-blank after trimming). The write
// is best-effort: every failure — marshalling, creating the parent directory,
// writing the file — is logged to stderr and otherwise ignored, so it never
// affects the review itself (it runs after the markdown is already on stdout).
func writeFindingsOut(results []specialistResult) {
	path := strings.TrimSpace(os.Getenv("GADFLY_FINDINGS_OUT"))
	if path == "" {
		return
	}
	mf := modelFindings{
		Model:    strings.TrimSpace(os.Getenv("GADFLY_MODEL")),
		Provider: modelProvider(),
		Verdict:  worstVerdict(results).label(),
		Errored:  allErrored(results),
		Markdown: renderConsolidated(results),
	}
	// Flatten every lens's findings into one list, tagging each with its lens.
	for _, r := range results {
		for _, f := range collectFindings(r) {
			mf.Findings = append(mf.Findings, outFinding{
				Lens:       r.spec.Name,
				File:       f.file,
				Line:       f.line,
				Severity:   f.severity,
				Confidence: f.confidence,
				Title:      f.title,
				Detail:     f.detail,
			})
		}
	}
	data, err := json.Marshal(mf)
	if err != nil {
		fmt.Fprintln(os.Stderr, "gadfly: marshal findings out:", err)
		return
	}
	// Defensive: make sure the parent dir exists (entrypoint creates it, but a
	// missing dir would otherwise drop this model from the consensus).
	// filepath.Dir never returns "" (it yields "." for a bare file name), so no
	// guard is needed. A failure here is reported rather than discarded; the
	// write below is still attempted and will report its own error if any.
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		fmt.Fprintln(os.Stderr, "gadfly: create findings out dir:", err)
	}
	if err := os.WriteFile(path, data, 0o644); err != nil {
		fmt.Fprintln(os.Stderr, "gadfly: write findings out:", err)
	}
}

// allErrored reports whether the review produced no usable lens at all: it is
// true when every entry in results is marked errored, and also when results is
// empty. A model in that state is still recorded, but it is left out of the
// consensus agreement denominator — counting it would dilute every ratio with a
// model that never actually reviewed.
func allErrored(results []specialistResult) bool {
	for i := range results {
		if !results[i].errored {
			// One lens that ran is enough to make the review usable.
			return false
		}
	}
	// Also reached for an empty slice: no lenses means nothing usable.
	return true
}

// runConsolidate is the consolidation entry point, used when
// GADFLY_CONSOLIDATE_DIR is set. It loads every per-model *.json artifact from
// that directory, clusters the findings, and prints the consensus comment
// (marker line first) to stdout; it then hands the same clusters to the inline
// PR review.
//
// Unreadable or unparseable artifacts are logged to stderr and skipped, and
// artifacts with a blank model name are skipped silently. An error is returned
// when the env var is blank, the directory cannot be listed, or no artifact
// survives loading. Errors are fatal to THIS process only — entrypoint.sh treats
// a failed consolidation as advisory and falls back to per-model comments.
func runConsolidate() error {
	dir := strings.TrimSpace(os.Getenv("GADFLY_CONSOLIDATE_DIR"))
	if dir == "" {
		return errors.New("GADFLY_CONSOLIDATE_DIR is empty")
	}
	entries, err := os.ReadDir(dir)
	if err != nil {
		return fmt.Errorf("read consolidate dir: %w", err)
	}

	var loaded []modelFindings
	for _, entry := range entries {
		name := entry.Name()
		if entry.IsDir() || !strings.HasSuffix(name, ".json") {
			continue
		}
		raw, readErr := os.ReadFile(filepath.Join(dir, name))
		if readErr != nil {
			fmt.Fprintln(os.Stderr, "gadfly: read", name, readErr)
			continue
		}
		var artifact modelFindings
		if parseErr := json.Unmarshal(raw, &artifact); parseErr != nil {
			fmt.Fprintln(os.Stderr, "gadfly: parse", name, parseErr)
			continue
		}
		// An artifact that does not name its model cannot be attributed.
		if strings.TrimSpace(artifact.Model) != "" {
			loaded = append(loaded, artifact)
		}
	}
	if len(loaded) == 0 {
		return errors.New("no model findings to consolidate")
	}

	// One clustering pass feeds both the consensus comment and the
	// (best-effort) inline PR review, so the two views cannot drift apart.
	grouped := clusterFindings(loaded)

	// The marker goes first so entrypoint.sh can upsert this comment in place
	// (same pattern as run.sh's per-model marker); it appends the advisory footer.
	fmt.Println(consensusMarker)
	fmt.Println(renderConsensus(loaded, grouped))

	// Inline PR review (COMMENT state) anchoring findings to changed lines.
	// Best-effort: no-op without diff/API creds, never affects stdout/exit.
	postInlineReview(grouped)
	return nil
}

// cluster is a group of findings (across models) judged to be the same issue:
// same file, lines within lineTolerance of the cluster's current span. The span
// [line,maxLine] slides as members join, so a chain of nearby findings merges
// instead of splitting once it drifts past the first line.
//
// severity, title and detail follow the highest-ranked report merged so far
// (see mergeIntoCluster). models and lenses are sets — keyed by model name and
// lens name respectively — of everything that contributed to the cluster;
// len(models) is the agreement count used for ranking and for the headline
// table.
type cluster struct {
	file     string
	line     int // representative (smallest) line
	maxLine  int // largest line in the cluster — the span's upper edge
	severity string
	title    string
	detail   string // highest-severity report's detail; rendered in the inline review comment
	models   map[string]bool
	lenses   map[string]bool
}

// findingRef is one model's finding (carrying which model reported it), used
// while grouping findings into clusters. clusterFindings builds it positionally
// (finding first, then model name), so the field order is significant.
type findingRef struct {
	f     outFinding
	model string
}

// lineTolerance is the clustering slack, in lines: a finding in the same file
// whose line is within this many lines of either edge of a cluster's current
// span is treated as the same issue (models often cite a line or two apart).
const lineTolerance = 3

// renderConsensus assembles the body of the single consolidated comment from
// the per-model artifacts and their pre-computed clusters. The marker and the
// advisory footer are NOT part of the result — entrypoint.sh wraps it
// (mirroring run.sh's per-model framing).
//
// Layout, top to bottom: a heading with the number of models that reviewed
// (and how many failed), a verdict/summary line, the headline table, a folded
// table of single-model findings, and every model's full review folded for
// drill-down.
func renderConsensus(models []modelFindings, clusters []cluster) string {
	// Models that failed outright are still listed at the bottom, but only the
	// ones that produced a review form the agreement denominator, so a failed
	// model cannot dilute every ratio. The overall verdict is the worst one
	// reported by any artifact.
	failed := 0
	overall := verdictClean
	for _, m := range models {
		if m.Errored {
			failed++
		}
		if v := parseVerdict(m.Verdict); v > overall {
			overall = v
		}
	}
	reviewed := len(models) - failed

	// Headline rows are clusters two or more models agree on, plus any cluster
	// ranked critical even when only one model raised it. Everything else is
	// folded as lower confidence. Multi-model clusters are tallied on the way
	// for the summary line.
	criticalRank := sevRank("critical")
	var top, minor []cluster
	corroborated := 0
	for _, c := range clusters {
		multi := len(c.models) >= 2
		if multi {
			corroborated++
		}
		if !multi && sevRank(c.severity) < criticalRank {
			minor = append(minor, c)
			continue
		}
		top = append(top, c)
	}

	var out strings.Builder
	fmt.Fprintf(&out, "## 🪰 Gadfly review — consensus across %d model%s", reviewed, plural(reviewed))
	if failed > 0 {
		fmt.Fprintf(&out, " (%d failed)", failed)
	}
	out.WriteString("\n\n")
	fmt.Fprintf(&out, "**Verdict: %s** · %d finding%s (%d with multi-model agreement)\n",
		overall.label(), len(clusters), plural(len(clusters)), corroborated)

	switch {
	case len(top) > 0:
		out.WriteString("\n| | Finding | Where | Models | Lens |\n|--|--|--|--|--|\n")
		for _, c := range top {
			fmt.Fprintf(&out, "| %s | %s | `%s` | %d/%d | %s |\n",
				sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
				len(c.models), reviewed, mdCell(lensList(c.lenses)))
		}
	case len(clusters) == 0:
		out.WriteString("\nNo material issues found by consensus.\n")
	}
	// Remaining case: only single-model findings exist. They appear folded
	// below, so "no material issues" is deliberately not claimed.

	if n := len(minor); n > 0 {
		fmt.Fprintf(&out, "\n<details><summary>%d single-model finding%s (lower confidence)</summary>\n\n",
			n, plural(n))
		out.WriteString("| | Finding | Where | Model | Lens |\n|--|--|--|--|--|\n")
		for _, c := range minor {
			fmt.Fprintf(&out, "| %s | %s | `%s` | %s | %s |\n",
				sevIcon(c.severity), mdCell(c.title), mdCell(location(c.file, c.line)),
				mdCell(oneModel(c.models)), mdCell(lensList(c.lenses)))
		}
		out.WriteString("\n</details>\n")
	}

	// Each model's full review, folded for drill-down (nothing is lost).
	out.WriteString("\n<details><summary>Per-model detail</summary>\n")
	for _, m := range models {
		review := strings.TrimSpace(m.Markdown)
		if review == "" {
			review = "_(no output)_"
		}
		status := "⚠️ reviewer failed"
		if !m.Errored {
			status = m.Verdict
		}
		fmt.Fprintf(&out, "\n<details><summary><b>%s</b> (%s) — %s</summary>\n\n%s\n\n</details>\n",
			mdCell(m.Model), mdCell(m.Provider), status, review)
	}
	out.WriteString("\n</details>")
	return out.String()
}

// clusterFindings groups every model's findings into cross-model clusters,
// sorted by agreement (desc), then severity (desc), then location.
//
// Findings are bucketed by file path and, within a file, merged greedily by
// line proximity (see lineTolerance). Findings whose file is blank are dropped.
// The path is trimmed of surrounding whitespace before it is used as the
// bucket key, so reports that differ only by stray whitespace around the path
// land in the same cluster instead of producing look-alike rows; the trimmed
// path is also what the resulting cluster carries.
func clusterFindings(models []modelFindings) []cluster {
	// Group by file, then greedily merge by line proximity.
	byFile := map[string][]findingRef{}
	for _, m := range models {
		for _, f := range m.Findings {
			file := strings.TrimSpace(f.File)
			if file == "" {
				continue
			}
			byFile[file] = append(byFile[file], findingRef{f, m.Model})
		}
	}

	var clusters []cluster
	for file, items := range byFile {
		sort.SliceStable(items, func(i, j int) bool { return items[i].f.Line < items[j].f.Line })
		// Cluster within THIS file only (clusters never span files), so the inner
		// scan is over same-file clusters, not every cluster seen so far.
		var fileClusters []cluster
		for _, it := range items {
			placed := false
			for ci := range fileClusters {
				c := &fileClusters[ci]
				// Join if the line falls within the cluster's span, widened by the
				// tolerance on both edges — so the window slides as the span grows.
				if it.f.Line >= c.line-lineTolerance && it.f.Line <= c.maxLine+lineTolerance {
					mergeIntoCluster(c, it.f, it.model)
					placed = true
					break
				}
			}
			if !placed {
				c := cluster{
					file:     file,
					line:     it.f.Line,
					maxLine:  it.f.Line,
					severity: it.f.Severity,
					title:    it.f.Title,
					detail:   it.f.Detail,
					models:   map[string]bool{},
					lenses:   map[string]bool{},
				}
				mergeIntoCluster(&c, it.f, it.model)
				fileClusters = append(fileClusters, c)
			}
		}
		clusters = append(clusters, fileClusters...)
	}

	// byFile is iterated in random order; the sort below restores a stable,
	// meaningful ordering for rendering.
	sort.SliceStable(clusters, func(i, j int) bool {
		if len(clusters[i].models) != len(clusters[j].models) {
			return len(clusters[i].models) > len(clusters[j].models)
		}
		if sevRank(clusters[i].severity) != sevRank(clusters[j].severity) {
			return sevRank(clusters[i].severity) > sevRank(clusters[j].severity)
		}
		if clusters[i].file != clusters[j].file {
			return clusters[i].file < clusters[j].file
		}
		return clusters[i].line < clusters[j].line
	})
	return clusters
}

// mergeIntoCluster absorbs one finding into an existing cluster. It records the
// reporting model and lens (blank names are ignored), stretches the
// [line,maxLine] span to cover the finding, and updates the cluster's
// severity/title/detail so they follow the highest-severity report seen.
func mergeIntoCluster(c *cluster, f outFinding, model string) {
	if model != "" {
		c.models[model] = true
	}
	if f.Lens != "" {
		c.lenses[f.Lens] = true
	}

	// Lower edge: only a positive line may become the representative line, and
	// it replaces an unset (zero) one or any larger one.
	if f.Line > 0 {
		if c.line == 0 || f.Line < c.line {
			c.line = f.Line
		}
	}
	// Upper edge.
	if c.maxLine < f.Line {
		c.maxLine = f.Line
	}

	// A non-blank title/detail is adopted in two situations: the cluster has
	// none yet (backfill from any report, whatever its severity, so a
	// higher-severity-but-terse finding doesn't leave the cluster without
	// context), or this report strictly outranks the cluster's severity.
	outranks := sevRank(f.Severity) > sevRank(c.severity)
	if strings.TrimSpace(f.Title) != "" && (outranks || strings.TrimSpace(c.title) == "") {
		c.title = f.Title
	}
	if strings.TrimSpace(f.Detail) != "" && (outranks || strings.TrimSpace(c.detail) == "") {
		c.detail = f.Detail
	}
	if outranks {
		c.severity = f.Severity
	}
}

// sevRank maps a canonical severity word to a number for sorting/comparison:
// trivial=1, small=2, medium=3, high=4, critical=5. Matching ignores case and
// surrounding whitespace; any other input ranks 0.
func sevRank(s string) int {
	// Ascending order: a word's rank is its position here plus one.
	ladder := [...]string{"trivial", "small", "medium", "high", "critical"}
	word := strings.ToLower(strings.TrimSpace(s))
	for i, name := range ladder {
		if name == word {
			return i + 1
		}
	}
	return 0
}

// sevIcon picks the at-a-glance severity badge for the consensus table: red for
// critical and high, orange for medium, yellow for small, and white for
// anything else. Matching ignores case and surrounding whitespace.
func sevIcon(s string) string {
	word := strings.ToLower(strings.TrimSpace(s))
	if word == "critical" || word == "high" {
		return "🔴"
	}
	if word == "medium" {
		return "🟠"
	}
	if word == "small" {
		return "🟡"
	}
	return "⚪"
}

// location formats where a finding points: "file:line" when line is positive,
// otherwise just the file.
func location(file string, line int) string {
	if line <= 0 {
		return file
	}
	return fmt.Sprintf("%s:%d", file, line)
}

// lensList renders a lens set as a comma-separated list. Every key of the map
// is included (values are not consulted) and the names are sorted so the output
// does not depend on map iteration order.
func lensList(lenses map[string]bool) string {
	names := make([]string, len(lenses))
	next := 0
	for name := range lenses {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return strings.Join(names, ", ")
}

// oneModel returns the name to show for a single-model cluster. For a set with
// exactly one entry that is the entry itself; an empty (or nil) set yields "".
// Should the set ever hold several names, the lexicographically smallest is
// returned so the rendered comment stays deterministic — map iteration order
// is randomized in Go, so "first key seen" would vary between runs.
func oneModel(models map[string]bool) string {
	best, found := "", false
	for name := range models {
		if !found || name < best {
			best, found = name, true
		}
	}
	return best
}

// mdCell makes a string safe for a one-line markdown table cell: collapse line
// breaks (LF, CR and CRLF each become a single space), escape pipes (which
// delimit columns), and neutralize backticks (a stray one would break an
// inline-code span — a backslash can't escape it inside code, so replace with
// an apostrophe). Surrounding whitespace is trimmed. Inputs are
// model-influenced, so this keeps a malformed file path or title from breaking
// the table.
func mdCell(s string) string {
	// CRLF first so a Windows line ending becomes one space, not two; a bare CR
	// is a line ending in markdown too and would otherwise split the row.
	s = strings.ReplaceAll(s, "\r\n", " ")
	s = strings.ReplaceAll(s, "\r", " ")
	s = strings.ReplaceAll(s, "\n", " ")
	s = strings.ReplaceAll(s, "|", "\\|")
	s = strings.ReplaceAll(s, "`", "'")
	return strings.TrimSpace(s)
}

// plural returns the English plural suffix for a count: empty for exactly one,
// "s" for every other value (including zero and negatives).
func plural(n int) string {
	suffix := "s"
	if n == 1 {
		suffix = ""
	}
	return suffix
}