From ea9475da5451337a4364ab418513f2cef42f911d Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Sat, 27 Jun 2026 00:22:02 -0400
Subject: [PATCH] =?UTF-8?q?P5:=20light-tier=20canary=20=E2=80=94=20gadfly-?=
 =?UTF-8?q?shaped=20reviewer=20on=20executus=20core?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

examples/reviewer proves the core is sufficient for a static-binary light host
(gadfly's shape) with NO batteries:
- config.Env + model.Configure  -> env-driven model fleet + tier overrides
- model.ParseModelForContext    -> tier resolution + failover
- fanout.Run (PerKey caps)      -> N models x M lenses swarm, per-provider bound
- model.GenerateWith[T]         -> structured findings per (model, lens) cell
- Consolidate                   -> one verdict-led report section per model

Hermetic test runs the full 2x3 swarm against majordomo's fake provider and
asserts the consolidated verdicts. A go list -deps CI check asserts the canary
imports ZERO batteries (the light-tier invariant) — gadfly's go.sum stays free
of gorm/redis/discordgo/sqlite. README + docs updated.

Consolidate heads a model whose every lens errored with "review failed" (not
"no issues found") and counts errored results that carry no model label, so a
failed or cancelled swarm never reads as an all-clear. main reports a stdin
read failure instead of ignoring it.

This is the canary; migrating the LIVE gadfly repo onto executus core is a
follow-up (kept separate to not destabilize the active reviewer).

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .gitea/workflows/ci.yml            |  11 ++
 CLAUDE.md                          |   2 +-
 README.md                          |   3 +
 examples/reviewer/README.md        |  38 ++++++
 examples/reviewer/main.go          | 108 +++++++++++++++
 examples/reviewer/reviewer.go      | 210 +++++++++++++++++++++++++++++
 examples/reviewer/reviewer_test.go | 114 ++++++++++++++++
 7 files changed, 485 insertions(+), 1 deletion(-)
 create mode 100644 examples/reviewer/README.md
 create mode 100644 examples/reviewer/main.go
 create mode 100644 examples/reviewer/reviewer.go
 create mode 100644 examples/reviewer/reviewer_test.go

diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml
index c2b3e2c..bb32bb3 100644
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -104,6 +104,17 @@ jobs:
           fi
           echo "OK: core go.sum is free of host/DB dependencies."
 
+      - name: Light-tier canary imports no battery
+        run: |
+          # examples/reviewer is gadfly's shape on the CORE only. If it ever
+          # pulls in a battery (audit/budget/persona/skill/critic/schedule/
+          # checkpoint/contrib), the light path has regressed.
+          LEAK=$(go list -deps ./examples/reviewer/... | grep -E 'executus/(audit|budget|persona|skill|critic|schedule|checkpoint|contrib)' || true)
+          if [ -n "$LEAK" ]; then
+            echo "ERROR: light-tier canary pulled in a battery:"; echo "$LEAK"; exit 1
+          fi
+          echo "OK: examples/reviewer is core-only."
+
       - name: contrib/store (nested SQLite module — isolated from core)
         run: |
           # contrib/store is a SEPARATE module carrying modernc.org/sqlite; the
diff --git a/CLAUDE.md b/CLAUDE.md
index 7afcf4c..3da73ce 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -115,7 +115,7 @@ repackaging.
 
 P0 module + zero-coupling moves + core seams (this) → P1 tool registry + model →
 P2 run kernel + Ports inversion → P3 generic tools + defaults → P4 persona/skill
-redesign + batteries + SQLite store → P5 gadfly on core (light-tier canary) → P6
+redesign + batteries + SQLite store → P5 gadfly-on-core canary (examples/reviewer ✓) → P6
 rewire mort + tag v0.1.0.
 The mort-side rewrite reuses mort's existing `mort_*_adapters.go` wall as the
 host adapter layer.
diff --git a/README.md b/README.md
index 80b53d8..3cf04d5 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,9 @@ bot) — mort and gadfly are the first two consumers (heavy and light). See
 - `config/`, `deliver/`, `identity/` — host seams (config / output / identity),
   each with a shipped default.
 - `dispatchguard/`, `pendingattach/` — run-safety primitives.
+- `examples/reviewer` — a **gadfly-shaped PR reviewer on the core only** (env-config
+  model fleet → `fanout` N×M swarm → `model.GenerateWith[T]` structured findings →
+  consolidation), the light-tier canary; CI asserts it pulls in no battery.
 
 ## Design
 
diff --git a/examples/reviewer/README.md b/examples/reviewer/README.md
new file mode 100644
index 0000000..395606e
--- /dev/null
+++ b/examples/reviewer/README.md
@@ -0,0 +1,38 @@
+# examples/reviewer — the light-tier canary
+
+A **gadfly-shaped adversarial PR reviewer built on the executus core only** — no
+batteries, no database, no host adapters. It exists to prove that the core is
+sufficient for a static-binary light host (gadfly's shape), and that such a host
+keeps a `go.sum` free of `gorm`/`redis`/`discordgo`/`sqlite`.
+
+What it exercises, all from core:
+
+| Concern | executus core piece |
+|---|---|
+| Env-driven model fleet + tier overrides | `config.Env` + `model.Configure` |
+| Tier resolution + failover | `model.ParseModelForContext` |
+| N models × M lenses swarm | `fanout.Run` (with `PerKey` per-provider caps) |
+| Structured findings per cell | `model.GenerateWith[T]` |
+| One report section per model, worst-verdict-led | `Consolidate` (local) |
+
+## Run
+
+```sh
+REVIEWER_MODELS=fast,thinking \
+ANTHROPIC_API_KEY=sk-... \
+go run ./examples/reviewer -diff "$(git diff HEAD~1)"
+```
+
+Config (all optional, `REVIEWER_`-prefixed env):
+
+- `REVIEWER_MODELS` — csv of tier names / `provider/model` specs (default `fast`)
+- `REVIEWER_MODEL_TIER_<TIER>` — override a tier's resolved spec
+- `REVIEWER_MAX_CONCURRENT` — total in-flight swarm cells (default 6)
+- `REVIEWER_PROVIDER_CONCURRENCY` — per-provider cap (default 3)
+
+## Test
+
+`reviewer_test.go` runs the whole swarm against majordomo's fake provider
+(hermetic, no network) and asserts the consolidated verdicts. A `go list -deps`
+check in CI confirms the package pulls in no battery and no DB driver — the
+light-tier invariant.
diff --git a/examples/reviewer/main.go b/examples/reviewer/main.go
new file mode 100644
index 0000000..e6f82d6
--- /dev/null
+++ b/examples/reviewer/main.go
@@ -0,0 +1,108 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/config"
+	"gitea.stevedudenhoeffer.com/steve/executus/fanout"
+	"gitea.stevedudenhoeffer.com/steve/executus/model"
+)
+
+// DefaultLenses is the canary's review suite (mirrors gadfly's default).
+var DefaultLenses = []Lens{
+	{Name: "security", Focus: "auth, injection, secret leakage, unsafe deserialization, SSRF."},
+	{Name: "correctness", Focus: "logic errors, broken invariants, off-by-one, contract violations."},
+	{Name: "error-handling", Focus: "swallowed errors, missing timeouts, races, unhandled edge cases."},
+}
+
+// Reviewer is configured entirely from the environment (the GADFLY_*-style light
+// host): REVIEWER_MODELS (csv of tier/spec), REVIEWER_MODEL_TIER_<TIER> overrides,
+// REVIEWER_MAX_CONCURRENT, REVIEWER_PROVIDER_CONCURRENCY. The diff is read from
+// -diff or stdin.
+//
+//	REVIEWER_MODELS=fast,thinking ANTHROPIC_API_KEY=... go run ./examples/reviewer < my.diff
+func main() {
+	cfg := config.Env("REVIEWER_")
+
+	// Tier table from env, with code defaults.
+	model.Configure(cfg, map[string]string{
+		"fast":     "anthropic/claude-haiku-4-5",
+		"thinking": "anthropic/claude-opus-4-8",
+	}, 0)
+
+	fleet := splitCSV(cfg.String("models", "fast"))
+	maxConc := cfg.Int("max_concurrent", 6)
+	perProvider := cfg.Int("provider_concurrency", 3)
+
+	diffFlag := flag.String("diff", "", "diff text to review; reads stdin when empty")
+	flag.Parse()
+	diff := *diffFlag
+	if strings.TrimSpace(diff) == "" {
+		b, err := io.ReadAll(os.Stdin)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "reviewer: read stdin: %v\n", err)
+			os.Exit(1)
+		}
+		diff = string(b)
+	}
+	if strings.TrimSpace(diff) == "" {
+		fmt.Fprintln(os.Stderr, "reviewer: no diff (pass -diff or pipe one on stdin)")
+		os.Exit(2)
+	}
+
+	ctx := context.Background()
+	var models []NamedModel
+	for _, spec := range fleet {
+		_, m, err := model.ParseModelForContext(ctx, spec)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "reviewer: resolve model %q: %v\n", spec, err)
+			os.Exit(1)
+		}
+		models = append(models, NamedModel{Name: spec, Provider: providerOf(spec), Model: m})
+	}
+
+	results := Review(ctx, models, DefaultLenses, diff, fanout.Options[cell]{
+		MaxConcurrent: maxConc,
+		PerKey:        perKeyCaps(models, perProvider),
+	})
+	fmt.Print(Consolidate(results))
+}
+
+// splitCSV splits s on commas, trims each piece, and drops empty pieces; it
+// returns nil when nothing remains.
+func splitCSV(s string) []string {
+	var out []string
+	for _, p := range strings.Split(s, ",") {
+		if p = strings.TrimSpace(p); p != "" {
+			out = append(out, p)
+		}
+	}
+	return out
+}
+
+// providerOf returns a model spec's provider (the first path segment, e.g.
+// "anthropic/claude-…" → "anthropic"; a bare tier name → "tier"). Bare tier
+// names therefore all share the one "tier" fan-out key, whatever they resolve to.
+func providerOf(spec string) string {
+	if i := strings.IndexByte(spec, '/'); i > 0 {
+		return spec[:i]
+	}
+	return "tier"
+}
+
+// perKeyCaps builds the PerKey map: each distinct provider capped at perProvider.
+func perKeyCaps(models []NamedModel, perProvider int) map[string]int {
+	if perProvider <= 0 {
+		return nil
+	}
+	caps := map[string]int{}
+	for _, m := range models {
+		caps[m.Provider] = perProvider
+	}
+	return caps
+}
diff --git a/examples/reviewer/reviewer.go b/examples/reviewer/reviewer.go
new file mode 100644
index 0000000..75f86b7
--- /dev/null
+++ b/examples/reviewer/reviewer.go
@@ -0,0 +1,210 @@
+// Command reviewer is executus's light-tier CANARY: a gadfly-shaped adversarial
+// PR reviewer built on the executus CORE ONLY — no batteries, no DB, no host.
+// It proves the core is sufficient for a static-binary host like gadfly:
+//
+//   - config.Env            → env-driven model fleet + concurrency (GADFLY_*-style)
+//   - model.Configure/...   → tier resolution + failover over majordomo
+//   - fanout.Run            → the N-models × M-lenses swarm, with per-provider caps
+//   - model.GenerateWith[T] → structured findings per (model, lens)
+//   - consolidation         → one report section per model, worst-verdict-led
+//
+// The whole thing imports only executus core packages, so a binary built from it
+// keeps a go.sum free of gorm/redis/discordgo/sqlite — the light-tier invariant.
+//
+// See reviewer_test.go for the hermetic swarm test (majordomo's fake provider).
+package main
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/fanout"
+	"gitea.stevedudenhoeffer.com/steve/executus/model"
+)
+
+// Severity orders findings; the rank drives a model's worst-verdict header.
+type Severity string
+
+const (
+	SevTrivial  Severity = "trivial"
+	SevSmall    Severity = "small"
+	SevMedium   Severity = "medium"
+	SevHigh     Severity = "high"
+	SevCritical Severity = "critical"
+)
+
+// severityRank maps a Severity to its ordering rank (trivial 0 … critical 4);
+// an unrecognised severity ranks 0, the same as trivial.
+func severityRank(s Severity) int {
+	switch s {
+	case SevCritical:
+		return 4
+	case SevHigh:
+		return 3
+	case SevMedium:
+		return 2
+	case SevSmall:
+		return 1
+	default:
+		return 0
+	}
+}
+
+// Finding is one issue a lens reports. It is the structured-output schema the
+// model must satisfy (majordomo derives the JSON schema from this struct).
+type Finding struct {
+	Severity Severity `json:"severity" jsonschema:"enum=trivial,enum=small,enum=medium,enum=high,enum=critical"`
+	Title    string   `json:"title"`
+	Detail   string   `json:"detail"`
+}
+
+// lensReport is the per-(model,lens) structured response.
+type lensReport struct {
+	Findings []Finding `json:"findings"`
+}
+
+// Lens is one review dimension (security / correctness / …).
+type Lens struct {
+	Name  string
+	Focus string // appended to the base system prompt
+}
+
+// NamedModel is a resolved model plus the label + provider used for fan-out
+// keying (per-provider concurrency) and reporting.
+type NamedModel struct {
+	Name     string // display label (the tier/spec the host configured)
+	Provider string // fan-out key for PerKey concurrency (e.g. "ollama-cloud")
+	Model    llm.Model
+}
+
+// LensResult is one swarm cell's outcome.
+type LensResult struct {
+	Model    string
+	Lens     string
+	Findings []Finding
+	Err      error
+}
+
+const baseSystemPrompt = "You are an adversarial code reviewer. Review the diff for real, verifiable problems only — no style nits. Return ONLY JSON matching the schema. Report nothing if you find nothing."
+
+// Review runs every (model × lens) cell of the swarm concurrently, bounded by
+// opts (total + per-provider caps), and returns one LensResult per cell. A cell
+// whose model call fails carries the error in LensResult.Err — one bad cell
+// never aborts the swarm (fanout captures per-item errors). A result fanout
+// itself marks as failed is returned with only Err set (no Model or Lens).
+func Review(ctx context.Context, models []NamedModel, lenses []Lens, diff string, opts fanout.Options[cell]) []LensResult {
+	cells := make([]cell, 0, len(models)*len(lenses))
+	for _, m := range models {
+		for _, l := range lenses {
+			cells = append(cells, cell{model: m, lens: l})
+		}
+	}
+	// Key each cell by its provider so PerKey throttles per backend (the
+	// GADFLY_PROVIDER_CONCURRENCY analogue).
+	if opts.Key == nil {
+		opts.Key = func(c cell) string { return c.model.Provider }
+	}
+	results := fanout.Run(ctx, cells, opts, func(ctx context.Context, c cell) (LensResult, error) {
+		sys := baseSystemPrompt
+		if c.lens.Focus != "" {
+			sys += "\n\nLens — " + c.lens.Name + ": " + c.lens.Focus
+		}
+		msgs := []llm.Message{{Role: llm.RoleUser, Parts: []llm.Part{llm.Text("Diff under review:\n" + diff)}}}
+		rep, err := model.GenerateWith[lensReport](ctx, c.model.Model, sys, msgs)
+		lr := LensResult{Model: c.model.Name, Lens: c.lens.Name, Findings: rep.Findings, Err: err}
+		// Return the value either way (err embedded) so every cell reports.
+		return lr, nil
+	})
+	out := make([]LensResult, 0, len(results))
+	for _, r := range results {
+		if r.Err != nil { // a swarm-level error (ctx cancel) with no value
+			out = append(out, LensResult{Err: r.Err})
+			continue
+		}
+		out = append(out, r.Value)
+	}
+	return out
+}
+
+// cell is one (model, lens) swarm task.
+type cell struct {
+	model NamedModel
+	lens  Lens
+}
+
+// Consolidate renders the swarm's results into one report: a section per model,
+// each led by that model's worst finding severity, mirroring gadfly's
+// one-comment-per-model output.
+//
+// A model whose every lens errored is headed "review failed" rather than "no
+// issues found", so a total failure never reads as an all-clear. Results that
+// carry an error but no model label (what Review emits for a swarm-level
+// failure) are counted in a trailing warning line instead of being dropped.
+func Consolidate(results []LensResult) string {
+	byModel := map[string][]LensResult{}
+	var order []string
+	unattributed := 0 // errored results with no model label to file them under
+	for _, r := range results {
+		if r.Model == "" {
+			if r.Err != nil {
+				unattributed++
+			}
+			continue
+		}
+		if _, ok := byModel[r.Model]; !ok {
+			order = append(order, r.Model)
+		}
+		byModel[r.Model] = append(byModel[r.Model], r)
+	}
+	sort.Strings(order)
+
+	var b strings.Builder
+	for _, m := range order {
+		rs := byModel[m]
+		var all []Finding
+		worst := -1
+		errored := 0
+		for _, r := range rs {
+			if r.Err != nil {
+				errored++
+				continue
+			}
+			all = append(all, r.Findings...)
+			for _, f := range r.Findings {
+				if severityRank(f.Severity) > worst {
+					worst = severityRank(f.Severity)
+				}
+			}
+		}
+		verdict := "no issues found"
+		switch {
+		case worst >= severityRank(SevHigh):
+			verdict = "blocking issues found"
+		case worst >= 0:
+			verdict = "minor issues"
+		case errored == len(rs):
+			// No lens succeeded: there is nothing to base an all-clear on.
+			verdict = "review failed"
+		}
+		fmt.Fprintf(&b, "## %s — %s", m, verdict)
+		if errored > 0 {
+			fmt.Fprintf(&b, " (⚠ %d lens(es) errored)", errored)
+		}
+		b.WriteString("\n")
+		sort.SliceStable(all, func(i, j int) bool {
+			return severityRank(all[i].Severity) > severityRank(all[j].Severity)
+		})
+		for _, f := range all {
+			fmt.Fprintf(&b, "- [%s] %s — %s\n", f.Severity, f.Title, f.Detail)
+		}
+		b.WriteString("\n")
+	}
+	if unattributed > 0 {
+		fmt.Fprintf(&b, "⚠ %d swarm cell(s) errored with no model label\n", unattributed)
+	}
+	return b.String()
+}
diff --git a/examples/reviewer/reviewer_test.go b/examples/reviewer/reviewer_test.go
new file mode 100644
index 0000000..c8adf28
--- /dev/null
+++ b/examples/reviewer/reviewer_test.go
@@ -0,0 +1,114 @@
+package main
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/fanout"
+)
+
+// TestReviewSwarm proves the light-tier path end-to-end against the fake
+// provider: a 2-model × 3-lens swarm runs, structured findings parse, and
+// consolidation produces one verdict-led section per model — no batteries, no
+// network.
+func TestReviewSwarm(t *testing.T) {
+	fp := fake.New("fakeprov")
+
+	// Model "hot" reports a high-severity finding on every lens; "cold" reports
+	// nothing. Each model is called once per lens (3×), so enqueue 3 each.
+	hot := `{"findings":[{"severity":"high","title":"SQL injection","detail":"unsanitized id in query"}]}`
+	cold := `{"findings":[]}`
+	for i := 0; i < 3; i++ {
+		fp.Enqueue("hot", fake.Reply(hot))
+		fp.Enqueue("cold", fake.Reply(cold))
+	}
+	hotM, err := fp.Model("hot")
+	if err != nil {
+		t.Fatal(err)
+	}
+	coldM, err := fp.Model("cold")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	models := []NamedModel{
+		{Name: "hot", Provider: "fakeprov", Model: hotM},
+		{Name: "cold", Provider: "fakeprov", Model: coldM},
+	}
+	lenses := []Lens{{Name: "security"}, {Name: "correctness"}, {Name: "error-handling"}}
+
+	results := Review(context.Background(), models, lenses, "some diff",
+		fanout.Options[cell]{MaxConcurrent: 6, PerKey: map[string]int{"fakeprov": 3}})
+
+	// 2 models × 3 lenses = 6 cells, all successful.
+	if len(results) != 6 {
+		t.Fatalf("got %d cells, want 6", len(results))
+	}
+	var hotFindings, coldFindings, errs int
+	for _, r := range results {
+		if r.Err != nil {
+			errs++
+			continue
+		}
+		switch r.Model {
+		case "hot":
+			hotFindings += len(r.Findings)
+		case "cold":
+			coldFindings += len(r.Findings)
+		}
+	}
+	if errs != 0 {
+		t.Errorf("expected no cell errors, got %d", errs)
+	}
+	if hotFindings != 3 { // one per lens
+		t.Errorf("hot model findings = %d, want 3", hotFindings)
+	}
+	if coldFindings != 0 {
+		t.Errorf("cold model findings = %d, want 0", coldFindings)
+	}
+
+	report := Consolidate(results)
+	if !strings.Contains(report, "hot — blocking issues found") {
+		t.Errorf("hot section should lead with a blocking verdict:\n%s", report)
+	}
+	if !strings.Contains(report, "cold — no issues found") {
+		t.Errorf("cold section should report no issues:\n%s", report)
+	}
+	if !strings.Contains(report, "SQL injection") {
+		t.Errorf("report should surface the finding:\n%s", report)
+	}
+}
+
+// TestConsolidateVerdicts checks the worst-severity-led header logic.
+func TestConsolidateVerdicts(t *testing.T) {
+	got := Consolidate([]LensResult{
+		{Model: "m", Lens: "a", Findings: []Finding{{Severity: SevSmall, Title: "x"}}},
+		{Model: "m", Lens: "b", Findings: []Finding{{Severity: SevMedium, Title: "y"}}},
+	})
+	if !strings.Contains(got, "m — minor issues") {
+		t.Errorf("medium-max should be 'minor issues', got:\n%s", got)
+	}
+	// An errored lens is surfaced in the header.
+	got = Consolidate([]LensResult{
+		{Model: "m", Lens: "a", Findings: []Finding{{Severity: SevCritical, Title: "boom"}}},
+		{Model: "m", Lens: "b", Err: context.DeadlineExceeded},
+	})
+	if !strings.Contains(got, "blocking issues found") || !strings.Contains(got, "errored") {
+		t.Errorf("critical + errored lens header wrong:\n%s", got)
+	}
+	// A model whose every lens errored must not read as an all-clear.
+	got = Consolidate([]LensResult{
+		{Model: "m", Lens: "a", Err: context.DeadlineExceeded},
+	})
+	if !strings.Contains(got, "m — review failed") || strings.Contains(got, "no issues found") {
+		t.Errorf("all-errored model header wrong:\n%s", got)
+	}
+	// An errored result with no model label is still surfaced.
+	got = Consolidate([]LensResult{{Err: context.Canceled}})
+	if !strings.Contains(got, "1 swarm cell(s) errored") {
+		t.Errorf("unattributed error should be reported:\n%s", got)
+	}
+}