feat: specialist suite — configurable + custom review lenses (one consolidated comment)
Build & push image / build-and-push (push) Successful in 8s

Replace the single generic review with a suite of focused specialists, each its
own review+recheck pass, merged into ONE comment (a collapsible section per lens,
led by the worst verdict; the optional `improvements` lens never escalates it).

- cmd/gadfly/specialists.go: built-in lenses + default suite (security, correctness,
  maintainability, performance, error-handling) + opt-in (tests, docs, conventions,
  improvements). Selection via GADFLY_SPECIALISTS (csv/"all"); custom defs via
  GADFLY_SPECIALIST_<NAME> env and a repo .gadfly.yml (specialists + define).
  Precedence: built-ins < file < env. Unknown names error but don't sink the run.
- cmd/gadfly/consolidate.go: verdict parse + one-comment render.
- main.go: loop specialists; per-lens failure is an inline notice, never fatal.
  Default timeout bumped to 600s (suite runs sequentially).
- base system prompt trimmed to persona+tools+discipline+output; lens-specific
  focus is appended per specialist (semantic re-derivation discipline kept in base).
- entrypoint default models -> single model (suite already gives breadth; cost ~=
  specialists × models × 2). Adds gopkg.in/yaml.v3.
- docs/examples: README "Specialists" section, examples/.gadfly.yml, stub var,
  CLAUDE.md architecture/config. Dynamic `auto` selection is the planned next step.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Steve Dudenhoeffer
2026-06-25 19:23:05 -04:00
parent 676c9d4f07
commit 7809d1b93d
13 changed files with 581 additions and 38 deletions
+94
View File
@@ -0,0 +1,94 @@
package main
import (
"fmt"
"strings"
)
// verdict is the severity a specialist (or the whole review) lands on.
type verdict int
const (
verdictUnknown verdict = iota // couldn't parse; treated below Minor for display, not blocking
verdictClean // "No material issues found"
verdictMinor // "Minor issues"
verdictBlocking // "Blocking issues found"
)
func (v verdict) label() string {
switch v {
case verdictClean:
return "No material issues found"
case verdictMinor:
return "Minor issues"
case verdictBlocking:
return "Blocking issues found"
default:
return "Reviewed"
}
}
// parseVerdict extracts a specialist's self-reported verdict from its output.
// The base prompt tells each lens to lead with one of the three phrases.
func parseVerdict(out string) verdict {
l := strings.ToLower(out)
switch {
case strings.Contains(l, "blocking issues found"):
return verdictBlocking
case strings.Contains(l, "no material issues"):
return verdictClean
case strings.Contains(l, "minor issues"):
return verdictMinor
default:
return verdictUnknown
}
}
// specialistResult pairs a specialist with its rendered review and verdict.
type specialistResult struct {
spec Specialist
out string
verdict verdict
}
// worstVerdict returns the most severe verdict across results. The optional
// "improvements" lens never escalates the overall verdict (it's advisory).
func worstVerdict(results []specialistResult) verdict {
worst := verdictClean
for _, r := range results {
if r.spec.Name == "improvements" {
continue
}
if r.verdict > worst {
worst = r.verdict
}
}
return worst
}
// renderConsolidated assembles the single comment body: an overall verdict line
// followed by one verbatim section per specialist. run.sh wraps this with the
// "🪰 Gadfly review — <model>" header and the advisory footer.
func renderConsolidated(results []specialistResult) string {
var b strings.Builder
fmt.Fprintf(&b, "**Verdict: %s** — %d reviewers: %s\n",
worstVerdict(results).label(), len(results), strings.Join(specialistNames(results), ", "))
for _, r := range results {
body := strings.TrimSpace(r.out)
if body == "" {
body = "_(no output)_"
}
fmt.Fprintf(&b, "\n<details><summary><b>%s</b> — %s</summary>\n\n%s\n\n</details>\n",
r.spec.Title, r.verdict.label(), body)
}
return strings.TrimRight(b.String(), "\n")
}
func specialistNames(results []specialistResult) []string {
names := make([]string, 0, len(results))
for _, r := range results {
names = append(names, r.spec.Name)
}
return names
}
+36 -15
View File
@@ -65,8 +65,10 @@ import (
)
const (
defaultMaxSteps = 24
defaultTimeoutSecs = 300
defaultMaxSteps = 24
// defaultTimeoutSecs is the overall deadline shared by ALL specialist passes
// in one invocation; the default suite runs several lenses sequentially.
defaultTimeoutSecs = 600
defaultMaxDiffChars = 60000
// defaultWrapUpReserve is how many steps before the cap the agent is told
// to stop investigating and write its final answer. Reserving a margin is
@@ -130,34 +132,53 @@ func run() error {
return fmt.Errorf("resolve model: %w", err)
}
specialists, serrs := resolveSpecialists(repoDir)
for _, e := range serrs {
fmt.Fprintln(os.Stderr, "gadfly:", e)
}
if len(specialists) == 0 {
return errors.New("no specialists resolved to run")
}
timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
// Pass 1 — review: produce the draft.
draft, err := runAgent(ctx, mdl, fsTools, string(systemBytes), buildTask(diff),
envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
if err != nil {
return fmt.Errorf("review pass: %w", err)
base := string(systemBytes)
task := buildTask(diff)
results := make([]specialistResult, 0, len(specialists))
for _, sp := range specialists {
out := reviewWithSpecialist(ctx, mdl, fsTools, base, sp, task, diff)
results = append(results, specialistResult{spec: sp, out: out, verdict: parseVerdict(out)})
}
fmt.Println(renderConsolidated(results))
return nil
}
// reviewWithSpecialist runs one lens end-to-end: a review pass under the
// specialist's composed prompt, then the shared adversarial recheck pass. A
// failed pass is rendered as an inline notice (advisory — one lens failing
// never sinks the others or the job).
func reviewWithSpecialist(ctx context.Context, mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) string {
draft, err := runAgent(ctx, mdl, fsTools, composeSpecialistPrompt(base, sp), task,
envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
if err != nil {
fmt.Fprintf(os.Stderr, "gadfly: specialist %q review pass failed: %v\n", sp.Name, err)
return fmt.Sprintf("⚠️ This reviewer failed to complete: %v", err)
}
// Pass 2 — recheck: adversarially re-verify the draft's findings and drop
// the unconfirmed ones. Skipped for a clean draft (nothing to verify) or
// when disabled. A recheck failure is non-fatal — we emit the unverified
// draft rather than losing the review entirely.
final := draft
if shouldRecheck(draft) {
rechecked, rerr := runAgent(ctx, mdl, fsTools, recheckSystemPrompt, buildRecheckTask(draft, diff),
envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps))
if rerr != nil {
fmt.Fprintln(os.Stderr, "gadfly: recheck pass failed; emitting unverified draft:", rerr)
fmt.Fprintf(os.Stderr, "gadfly: specialist %q recheck failed; emitting unverified draft: %v\n", sp.Name, rerr)
} else {
final = rechecked
}
}
fmt.Println(final)
return nil
return final
}
// runAgent runs one agent pass (its own fresh toolbox over the sandbox) and
+219
View File
@@ -0,0 +1,219 @@
package main
import (
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"gopkg.in/yaml.v3"
)
// Specialist is one review lens: a focused instruction appended to the base
// system prompt, run as its own review (+recheck) pass and rendered as its own
// section in the consolidated comment.
type Specialist struct {
Name string // lowercase id, used in GADFLY_SPECIALISTS and section anchors
Title string // human label for the section header
Focus string // lens-specific instruction appended to the base prompt
}
// builtinSpecialists are the lenses Gadfly ships with. The default suite (below)
// is the "biggies"; the rest are opt-in by name via GADFLY_SPECIALISTS, a
// .gadfly.yml `specialists:` list, or `all`.
var builtinSpecialists = map[string]Specialist{
"security": {"security", "🔒 Security", "" +
"Authn/authz gaps, injection (SQL/command/path/template), SSRF, unsafe deserialization, " +
"secret/credential leakage, missing input validation, unsafe handling of untrusted data, " +
"and insecure defaults. Trace tainted input to where it's used."},
"correctness": {"correctness", "✅ Correctness", "" +
"Logic bugs and incorrect behavior introduced by the change. Pay special attention to " +
"SEMANTIC / domain correctness: do NOT trust a constant, conversion factor, formula, unit, " +
"or threshold because it looks reasonable — independently RE-DERIVE the expected value from " +
"first principles (units, dimensions, boundary values) and compare. A plausible-looking magic " +
"number is exactly where real bugs hide."},
"maintainability": {"maintainability", "🧹 Code cleanliness & maintainability", "" +
"Readability and structure: dead/duplicated code, confusing names, overly long or deeply " +
"nested functions, leaky abstractions, copy-paste that should be shared, and changes that " +
"don't follow the patterns the surrounding code already uses. Prefer concrete, low-churn fixes."},
"performance": {"performance", "⚡ Performance", "" +
"Efficiency regressions: N+1 queries, unnecessary allocations or copies, work inside hot loops, " +
"unbounded growth, missing pagination/limits, blocking calls on hot paths, and avoidable " +
"quadratic behavior. Only flag impact you can justify, not micro-optimizations."},
"error-handling": {"error-handling", "🧯 Error handling & edge cases", "" +
"Ignored or swallowed errors, missing cleanup/rollback/`defer`, panics on bad input, and " +
"unhandled edge cases: nil/null, empty collections, zero/negative, integer overflow, and " +
"boundary (off-by-one) conditions. Check the unhappy paths the diff introduces."},
// --- opt-in (not in the default suite) ---
"tests": {"tests", "🧪 Tests", "" +
"Whether the change is adequately tested: new logic without tests, changed behavior whose " +
"tests weren't updated, missing edge-case/error-path coverage, and brittle or assertion-free tests."},
"docs": {"docs", "📝 Docs", "" +
"Documentation drift: public APIs/flags/config changed without updating doc comments, README, " +
"or examples; stale comments now contradicting the code; missing docs for new exported surface."},
"conventions": {"conventions", "📐 Conventions", "" +
"Adherence to THIS repo's own conventions. Discover them — read any README / CONTRIBUTING / " +
"CLAUDE.md / AGENTS.md / lint config the repo ships, and hold the change to the documented and " +
"surrounding-code patterns."},
"improvements": {"improvements", "💡 Improvements (optional)", "" +
"OPTIONAL, non-blocking suggestions only. Surface at most 12 genuinely high-value blind spots " +
"or simplifications the author likely wants to know about. Be quiet: if nothing rises to that " +
"bar, reply exactly 'No suggestions.' Never nitpick, never pad. These never affect the verdict."},
}
// defaultSuite is used when nothing selects specialists explicitly.
var defaultSuite = []string{"security", "correctness", "maintainability", "performance", "error-handling"}
// fileConfig is the optional .gadfly.yml schema.
type fileConfig struct {
// Specialists selects which run (names). Empty => the default suite.
Specialists []string `yaml:"specialists"`
// Define adds or overrides specialist definitions.
Define []struct {
Name string `yaml:"name"`
Title string `yaml:"title"`
Focus string `yaml:"focus"`
} `yaml:"define"`
}
// resolveSpecialists builds the active specialist list from (in increasing
// precedence) the built-ins, a repo .gadfly.yml, and GADFLY_SPECIALIST_<NAME>
// env definitions. The active SELECTION is GADFLY_SPECIALISTS (csv or "all"),
// else the file's `specialists:`, else the default suite. Unknown names are
// reported as errors and skipped; valid ones still run.
func resolveSpecialists(repoDir string) ([]Specialist, []error) {
var errs []error
// 1. registry seeded with built-ins.
registry := make(map[string]Specialist, len(builtinSpecialists))
for k, v := range builtinSpecialists {
registry[k] = v
}
// 2. config file: definitions + (maybe) the selection.
var fileSelection []string
if cfg, ok, err := loadFileConfig(repoDir); err != nil {
errs = append(errs, err)
} else if ok {
for _, d := range cfg.Define {
name := strings.ToLower(strings.TrimSpace(d.Name))
if name == "" || strings.TrimSpace(d.Focus) == "" {
errs = append(errs, fmt.Errorf(".gadfly.yml: specialist define needs non-empty name and focus"))
continue
}
registry[name] = Specialist{Name: name, Title: titleOr(d.Title, name), Focus: d.Focus}
}
fileSelection = cfg.Specialists
}
// 3. env definitions: GADFLY_SPECIALIST_<NAME>="focus".
for _, kv := range os.Environ() {
key, val, _ := strings.Cut(kv, "=")
if !strings.HasPrefix(key, "GADFLY_SPECIALIST_") || len(key) == len("GADFLY_SPECIALIST_") {
continue
}
name := strings.ToLower(strings.TrimPrefix(key, "GADFLY_SPECIALIST_"))
if strings.TrimSpace(val) == "" {
errs = append(errs, fmt.Errorf("%s: empty specialist focus", key))
continue
}
existing, ok := registry[name]
title := titleOr("", name)
if ok {
title = existing.Title
}
registry[name] = Specialist{Name: name, Title: title, Focus: val}
}
// 4. selection: env > file > default suite.
var names []string
switch {
case strings.TrimSpace(os.Getenv("GADFLY_SPECIALISTS")) != "":
names = splitCSV(os.Getenv("GADFLY_SPECIALISTS"))
case len(fileSelection) > 0:
names = fileSelection
default:
names = defaultSuite
}
if len(names) == 1 && strings.EqualFold(names[0], "all") {
names = sortedKeys(registry)
}
// 5. resolve names -> specialists.
var out []Specialist
seen := map[string]bool{}
for _, raw := range names {
name := strings.ToLower(strings.TrimSpace(raw))
if name == "" || seen[name] {
continue
}
seen[name] = true
sp, ok := registry[name]
if !ok {
errs = append(errs, fmt.Errorf("unknown specialist %q (built-ins: %s; or define it via .gadfly.yml / GADFLY_SPECIALIST_%s)",
name, strings.Join(sortedKeys(builtinSpecialists), ", "), strings.ToUpper(name)))
continue
}
out = append(out, sp)
}
return out, errs
}
// composeSpecialistPrompt appends a specialist's lens to the base system prompt.
func composeSpecialistPrompt(base string, sp Specialist) string {
return strings.TrimRight(base, "\n") +
"\n\n## Your review lens: " + sp.Title + "\n" +
"Focus your review on this lens; other reviewers cover the rest. " + sp.Focus
}
func loadFileConfig(repoDir string) (fileConfig, bool, error) {
candidates := []string{}
if p := strings.TrimSpace(os.Getenv("GADFLY_CONFIG")); p != "" {
if !filepath.IsAbs(p) {
p = filepath.Join(repoDir, p)
}
candidates = append(candidates, p)
} else {
candidates = append(candidates, filepath.Join(repoDir, ".gadfly.yml"), filepath.Join(repoDir, ".gadfly.yaml"))
}
for _, p := range candidates {
data, err := os.ReadFile(p)
if err != nil {
continue // not found / unreadable: try next
}
var cfg fileConfig
if err := yaml.Unmarshal(data, &cfg); err != nil {
return fileConfig{}, false, fmt.Errorf("parse %s: %w", filepath.Base(p), err)
}
return cfg, true, nil
}
return fileConfig{}, false, nil
}
func titleOr(title, name string) string {
if strings.TrimSpace(title) != "" {
return title
}
return name
}
func splitCSV(s string) []string {
var out []string
for _, p := range strings.Split(s, ",") {
if t := strings.TrimSpace(p); t != "" {
out = append(out, t)
}
}
return out
}
func sortedKeys(m map[string]Specialist) []string {
keys := make([]string, 0, len(m))
for k := range m {
keys = append(keys, k)
}
sort.Strings(keys)
return keys
}
+118
View File
@@ -0,0 +1,118 @@
package main
import (
"os"
"path/filepath"
"testing"
)
func names(specs []Specialist) []string {
out := make([]string, len(specs))
for i, s := range specs {
out[i] = s.Name
}
return out
}
func eq(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
func TestResolveSpecialists_DefaultSuite(t *testing.T) {
t.Setenv("GADFLY_SPECIALISTS", "")
specs, errs := resolveSpecialists(t.TempDir())
if len(errs) != 0 {
t.Fatalf("unexpected errors: %v", errs)
}
if !eq(names(specs), defaultSuite) {
t.Errorf("default = %v, want %v", names(specs), defaultSuite)
}
}
func TestResolveSpecialists_EnvSelection(t *testing.T) {
t.Setenv("GADFLY_SPECIALISTS", "security, tests")
specs, errs := resolveSpecialists(t.TempDir())
if len(errs) != 0 {
t.Fatalf("unexpected errors: %v", errs)
}
if !eq(names(specs), []string{"security", "tests"}) {
t.Errorf("got %v", names(specs))
}
}
func TestResolveSpecialists_UnknownNameErrors(t *testing.T) {
t.Setenv("GADFLY_SPECIALISTS", "security,bogus")
specs, errs := resolveSpecialists(t.TempDir())
if len(errs) == 0 {
t.Fatal("expected an error for unknown specialist")
}
if !eq(names(specs), []string{"security"}) {
t.Errorf("valid ones should still resolve, got %v", names(specs))
}
}
func TestResolveSpecialists_EnvCustomDefinition(t *testing.T) {
t.Setenv("GADFLY_SPECIALIST_MIGRATIONS", "Review DB migrations for destructive ops.")
t.Setenv("GADFLY_SPECIALISTS", "migrations")
specs, errs := resolveSpecialists(t.TempDir())
if len(errs) != 0 {
t.Fatalf("unexpected errors: %v", errs)
}
if len(specs) != 1 || specs[0].Name != "migrations" || specs[0].Focus == "" {
t.Fatalf("custom specialist not registered: %+v", specs)
}
}
func TestResolveSpecialists_FileConfig(t *testing.T) {
dir := t.TempDir()
cfg := `specialists: [security, migrations]
define:
- name: migrations
title: "DB migrations"
focus: "Review schema migrations for destructive or unindexed changes."
`
if err := os.WriteFile(filepath.Join(dir, ".gadfly.yml"), []byte(cfg), 0o644); err != nil {
t.Fatal(err)
}
t.Setenv("GADFLY_SPECIALISTS", "") // let the file drive selection
specs, errs := resolveSpecialists(dir)
if len(errs) != 0 {
t.Fatalf("unexpected errors: %v", errs)
}
if !eq(names(specs), []string{"security", "migrations"}) {
t.Errorf("got %v", names(specs))
}
}
func TestParseVerdictAndWorst(t *testing.T) {
cases := map[string]verdict{
"VERDICT: No material issues found.": verdictClean,
"Minor issues\n- nit": verdictMinor,
"**Blocking issues found**": verdictBlocking,
"something unparseable": verdictUnknown,
}
for in, want := range cases {
if got := parseVerdict(in); got != want {
t.Errorf("parseVerdict(%q) = %v, want %v", in, got, want)
}
}
results := []specialistResult{
{spec: Specialist{Name: "security"}, verdict: verdictMinor},
{spec: Specialist{Name: "correctness"}, verdict: verdictBlocking},
{spec: Specialist{Name: "improvements"}, verdict: verdictBlocking}, // must not count
}
if w := worstVerdict(results[:2]); w != verdictBlocking {
t.Errorf("worst = %v, want blocking", w)
}
if w := worstVerdict([]specialistResult{results[0], results[2]}); w != verdictMinor {
t.Errorf("improvements should not escalate; worst = %v, want minor", w)
}
}