Files
executus/examples/reviewer/reviewer.go
T
steve a87e7d2c72
executus CI / test (pull_request) Failing after 3s
executus CI / test (push) Failing after 1m9s
fix: address verified gadfly P5 findings (canary robustness)
All 3 cloud models converged (all "minor" — example code, no blocking):
- Consolidate: a model whose every lens errored now reads "review incomplete",
  not a misleading "no issues found" (all 3 models). + test.
- Consolidate: swarm-cancelled (unattributed) cells now surface a "swarm
  cancelled — N cell(s) did not run" banner instead of vanishing (all 3). + test.
- main: io.ReadAll(os.Stdin) error is surfaced (all 3); a TTY stdin no longer
  hangs forever (TTY guard, minimax).
- providerOf: a bare tier name now keys its own PerKey bucket instead of all
  bare tiers collapsing onto "tier" (minimax, glm-5.2) — distinct tiers throttle
  independently.
- Review doc reworded (the closure, not fanout, carries per-cell errors).

Left as documented example-scope behavior: no per-cell timeout (caller supplies
ctx), unknown-severity → lowest rank (no crash).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 00:34:01 -04:00

205 lines
6.4 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Command reviewer is executus's light-tier CANARY: a gadfly-shaped adversarial
// PR reviewer built on the executus CORE ONLY — no batteries, no DB, no host.
// It proves the core is sufficient for a static-binary host like gadfly:
//
// - config.Env → env-driven model fleet + concurrency (GADFLY_*-style)
// - model.Configure/... → tier resolution + failover over majordomo
// - fanout.Run → the N-models × M-lenses swarm, with per-provider caps
// - model.GenerateWith[T] → structured findings per (model, lens)
// - consolidation → one report section per model, worst-verdict-led
//
// The whole thing imports only executus core packages, so a binary built from it
// keeps a go.sum free of gorm/redis/discordgo/sqlite — the light-tier invariant.
//
// See reviewer_test.go for the hermetic swarm test (majordomo's fake provider).
package main
import (
"context"
"fmt"
"sort"
"strings"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/executus/fanout"
"gitea.stevedudenhoeffer.com/steve/executus/model"
)
// Severity is the importance label attached to a Finding. Its rank (see
// severityRank) orders findings and drives a model's worst-verdict header.
type Severity string

// The severities a lens may report, from least to most serious.
const (
	SevTrivial  Severity = "trivial"
	SevSmall    Severity = "small"
	SevMedium   Severity = "medium"
	SevHigh     Severity = "high"
	SevCritical Severity = "critical"
)

// severityRank maps a severity to a sortable integer, from 4 (critical) down
// to 0 (trivial).
//
// Matching ignores letter case and surrounding whitespace: a model that answers
// "High" or " critical\n" is still ranked as high/critical instead of being
// silently demoted to the lowest rank, which would hide a blocking finding
// behind a "minor issues" verdict. Any value that is still unrecognised after
// normalisation (including the empty string) gets rank 0 rather than an error.
func severityRank(s Severity) int {
	switch Severity(strings.ToLower(strings.TrimSpace(string(s)))) {
	case SevCritical:
		return 4
	case SevHigh:
		return 3
	case SevMedium:
		return 2
	case SevSmall:
		return 1
	default:
		// SevTrivial and every unknown label share the lowest rank.
		return 0
	}
}
// Finding is one issue a lens reports. It is the structured-output schema the
// model must satisfy (majordomo derives the JSON schema from this struct).
// Consolidate prints each Finding as one "- [severity] title — detail" bullet.
type Finding struct {
// Severity is constrained by the jsonschema tag to the five Sev* values;
// Consolidate ranks and sorts findings by it.
Severity Severity `json:"severity" jsonschema:"enum=trivial,enum=small,enum=medium,enum=high,enum=critical"`
// Title is the short headline printed right after the severity.
Title string `json:"title"`
// Detail is the explanation printed after the title.
Detail string `json:"detail"`
}
// lensReport is the per-(model,lens) structured response: the type Review
// passes to model.GenerateWith as the shape of each cell's reply.
type lensReport struct {
// Findings lists the issues reported for one cell; Review copies it into
// LensResult.Findings.
Findings []Finding `json:"findings"`
}
// Lens is one review dimension (security / correctness / …). Review runs every
// lens against every model.
type Lens struct {
// Name labels the lens in LensResult.Lens and, when Focus is set, in the
// system prompt.
Name string
Focus string // appended to the base system prompt when non-empty
}
// NamedModel is a resolved model plus the label + provider used for fan-out
// keying (per-provider concurrency) and reporting.
type NamedModel struct {
Name string // display label (the tier/spec the host configured); becomes LensResult.Model
Provider string // fan-out key for PerKey concurrency (e.g. "ollama-cloud"); Review's default opts.Key returns it
Model llm.Model // the model handle Review passes to model.GenerateWith
}
// LensResult is one swarm cell's outcome. Review leaves Model and Lens empty
// when the swarm itself reported an error for the cell; Consolidate counts
// such results as cells that did not run.
type LensResult struct {
// Model is the NamedModel.Name of the cell's model.
Model string
// Lens is the Lens.Name of the cell's lens.
Lens string
// Findings is what the model reported; Consolidate ignores it when Err is set.
Findings []Finding
// Err is the model-call error for this cell, or the swarm-level error.
Err error
}
// baseSystemPrompt is the system prompt every swarm cell starts from; Review
// appends the lens name and focus to it when the lens has a Focus.
const baseSystemPrompt = "You are an adversarial code reviewer. Review the diff for real, verifiable problems only — no style nits. Return ONLY JSON matching the schema. Report nothing if you find nothing."
// Review fans the diff out over every (model × lens) pair and collects one
// LensResult per result that fanout.Run hands back.
//
// Concurrency limits come from opts. When opts.Key is unset, cells are keyed by
// their model's Provider so per-key limits apply per backend (the
// GADFLY_PROVIDER_CONCURRENCY analogue).
//
// A failing model call does not abort the swarm: the per-cell closure always
// returns a nil error and stores the failure in LensResult.Err instead. A
// result for which fanout.Run itself reports an error (e.g. ctx cancelled) is
// returned as a LensResult carrying only Err, with no Model or Lens.
func Review(ctx context.Context, models []NamedModel, lenses []Lens, diff string, opts fanout.Options[cell]) []LensResult {
	// opts is a by-value copy, so defaulting Key here never touches the caller's options.
	if opts.Key == nil {
		opts.Key = func(c cell) string { return c.model.Provider }
	}

	// Build the full cross product up front: models outer, lenses inner.
	swarm := make([]cell, 0, len(models)*len(lenses))
	for i := range models {
		for j := range lenses {
			swarm = append(swarm, cell{model: models[i], lens: lenses[j]})
		}
	}

	// runCell performs one structured model call and always reports a value.
	runCell := func(ctx context.Context, c cell) (LensResult, error) {
		prompt := baseSystemPrompt
		if focus := c.lens.Focus; focus != "" {
			prompt += "\n\nLens — " + c.lens.Name + ": " + focus
		}
		// A fresh message slice per cell, exactly as many cells run concurrently.
		turns := []llm.Message{{Role: llm.RoleUser, Parts: []llm.Part{llm.Text("Diff under review:\n" + diff)}}}
		report, callErr := model.GenerateWith[lensReport](ctx, c.model.Model, prompt, turns)
		return LensResult{
			Model:    c.model.Name,
			Lens:     c.lens.Name,
			Findings: report.Findings,
			Err:      callErr,
		}, nil
	}

	results := fanout.Run(ctx, swarm, opts, runCell)

	collected := make([]LensResult, 0, len(results))
	for _, res := range results {
		lr := res.Value
		if res.Err != nil {
			// Swarm-level failure: there is no cell value, only the error.
			lr = LensResult{Err: res.Err}
		}
		collected = append(collected, lr)
	}
	return collected
}
// cell is one (model, lens) swarm task: the unit of work Review hands to
// fanout.Run and the argument type of the fan-out key function.
type cell struct {
model NamedModel // the model to call; its Provider is the default fan-out key
lens Lens // the review dimension applied to this call
}
// Consolidate turns the swarm's results into a single text report with one
// "## model — verdict" section per model, sections ordered by model name,
// mirroring gadfly's one-comment-per-model output.
//
// Results without a model name are not attributed to any section; those that
// carry an error are counted and announced in a leading "swarm cancelled"
// banner. Within a section, findings from lenses that succeeded are listed
// from most to least severe (ties keep their input order), and the header
// notes how many lenses errored.
//
// Verdicts, in priority order:
//   - "review incomplete"     — every lens for the model errored
//   - "blocking issues found" — worst finding ranks high or above
//   - "minor issues"          — at least one finding, none ranking high
//   - "no issues found"       — no findings
func Consolidate(results []LensResult) string {
	var (
		names     []string
		perModel  = make(map[string][]LensResult)
		unstarted int // errored results with no model attribution
	)
	for _, res := range results {
		switch {
		case res.Model != "":
			if _, seen := perModel[res.Model]; !seen {
				names = append(names, res.Model)
			}
			perModel[res.Model] = append(perModel[res.Model], res)
		case res.Err != nil:
			unstarted++
		}
	}
	sort.Strings(names)

	var report strings.Builder
	if unstarted > 0 {
		fmt.Fprintf(&report, "> ⚠ swarm cancelled — %d cell(s) did not run; results below are partial.\n\n", unstarted)
	}

	for _, name := range names {
		var (
			findings []Finding
			failed   int
			top      = -1 // highest rank seen; -1 means no findings at all
		)
		lenses := perModel[name]
		for _, res := range lenses {
			if res.Err != nil {
				failed++
				continue
			}
			for _, f := range res.Findings {
				findings = append(findings, f)
				if rank := severityRank(f.Severity); rank > top {
					top = rank
				}
			}
		}

		var verdict string
		switch {
		case failed > 0 && failed == len(lenses):
			// No lens produced data, so "no issues found" would be misleading.
			verdict = "review incomplete"
		case top >= severityRank(SevHigh):
			verdict = "blocking issues found"
		case top >= 0:
			verdict = "minor issues"
		default:
			verdict = "no issues found"
		}

		fmt.Fprintf(&report, "## %s — %s", name, verdict)
		if failed > 0 {
			fmt.Fprintf(&report, " (⚠ %d lens(es) errored)", failed)
		}
		report.WriteString("\n")

		// Most severe first; the stable sort keeps equal ranks in arrival order.
		sort.SliceStable(findings, func(a, b int) bool {
			return severityRank(findings[b].Severity) < severityRank(findings[a].Severity)
		})
		for _, f := range findings {
			fmt.Fprintf(&report, "- [%s] %s — %s\n", f.Severity, f.Title, f.Detail)
		}
		report.WriteString("\n")
	}
	return report.String()
}