diff --git a/.gitea/workflows/adversarial-review.yml b/.gitea/workflows/adversarial-review.yml index 09c7378..7d1c4e3 100644 --- a/.gitea/workflows/adversarial-review.yml +++ b/.gitea/workflows/adversarial-review.yml @@ -45,7 +45,7 @@ jobs: # every PR with the 3-lens suite — the slow local lane dominates wall time. timeout-minutes: 90 steps: - - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-d7f364d + - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-c3d09d3 env: GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }} GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} diff --git a/CLAUDE.md b/CLAUDE.md index 8d49c16..b689971 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,6 +30,7 @@ verifies each one against the actual code, and posts its findings as a comment. ``` cmd/gadfly/ the reviewer binary — pure producer of review markdown (stdout) main.go orchestration: loop specialists, each a review pass + adversarial recheck + engine.go reviewEngine abstraction: majordomo agent loop vs claude-code CLI shell-out specialists.go specialist lenses: built-ins, default suite, env + .gadfly.yml resolution auto.go dynamic `auto` selection: a selector model picks lenses per-diff (may invent) delegate.go worker-tier delegate_investigation tool (cheap sub-agent does legwork) diff --git a/Dockerfile b/Dockerfile index b2ae511..cc7f356 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,12 @@ RUN --mount=type=cache,target=/go/pkg/mod \ go build -trimpath -ldflags="-s -w" -o /out/gadfly ./cmd/gadfly FROM alpine:3.20 -RUN apk add --no-cache bash git curl jq ca-certificates +RUN apk add --no-cache bash git curl jq ca-certificates nodejs npm +# Bundle the Claude Code CLI so the `claude-code` review engine works out of the +# box (GADFLY_MODELS=claude-code or claude-code/). This adds Node + the +# CLI to the image (notably larger); ollama-only users pay the size but nothing +# else. Auth is provided at runtime via CLAUDE_CODE_OAUTH_TOKEN / ANTHROPIC_API_KEY. +RUN npm install -g @anthropic-ai/claude-code && npm cache clean --force COPY --from=build /out/gadfly /usr/local/bin/gadfly COPY scripts /app/scripts COPY entrypoint.sh /entrypoint.sh diff --git a/README.md b/README.md index 78ffaba..bb07973 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,39 @@ majordomo failover chain / alias) is used verbatim. > and exercise the exact same code an OpenAI/OpenRouter endpoint would hit, for free. If you > try a cloud provider and it works (or doesn't), please open an issue. +### Claude Code engine (`claude-code`) + +Besides the majordomo model loop, Gadfly can review through the **[Claude Code](https://claude.com/claude-code) +CLI**: for each lens it shells out to `claude -p` *inside the checked-out repo*, so Claude Code +uses its **own** read tools (Read/Grep/Glob) to verify findings against real code, then Gadfly +parses the result and runs the same verdict-parse → recheck → consolidate → emit pipeline. The +CLI is bundled in the image (Node + `@anthropic-ai/claude-code`). + +Select it as a model id — bare `claude-code` (CLI default model) or `claude-code/` (the +suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`): + +```yaml +GADFLY_MODELS: "claude-code/sonnet,claude-code/opus" +``` + +Auth is read from the environment: the default is a **Pro/Max subscription** via +`CLAUDE_CODE_OAUTH_TOKEN` (from `claude setup-token`; no `--bare`), falling back to +`ANTHROPIC_API_KEY`. Don't set both. Tuning knobs (all optional): + +| Env | Default | Meaning | +|-----|---------|---------| +| `GADFLY_CLAUDE_MODEL` | *(from the spec suffix)* | overrides the `--model` value | +| `GADFLY_CLAUDE_PERMISSION_MODE` | `plan` | `--permission-mode` (read-only `plan` keeps it from editing) | +| `GADFLY_CLAUDE_ALLOWED_TOOLS` | *(unset)* | `--allowedTools` value, passed verbatim (e.g. `Read,Grep,Glob`) | +| `GADFLY_CLAUDE_EXTRA_ARGS` | *(unset)* | extra CLI args appended verbatim (e.g. `--max-turns 30`) | +| `GADFLY_CLAUDE_BIN` | `claude` | CLI binary path | + +> **Untested, like the cloud providers.** This wires the CLI in and is exercised by its unit +> tests, but a live subscription-auth run hasn't been validated end-to-end here — and using +> subscription auth in automated CI is a gray area in Anthropic's terms. `auto` specialist +> selection and the `delegate_investigation` worker are majordomo-only and are skipped with this +> engine (Claude Code does its own legwork). + ### Endpoint aliases via env vars For multiple named backends (e.g. a couple of Ollama boxes on your LAN), register them by @@ -264,6 +297,7 @@ The reviewer binary reads these (the stub/entrypoint set sane defaults): | `GADFLY_PROVIDER` | `ollama-cloud` | provider prefix for a bare model id | | `GADFLY_BASE_URL` | — | override endpoint (OpenAI/Ollama-compatible servers) | | `GADFLY_API_KEY` | — | provider key; falls back to the provider's standard env | +| `claude-code` model id | — | route a model through the bundled Claude Code CLI (`claude-code` / `claude-code/`); see [Claude Code engine](#claude-code-engine-claude-code) for its `GADFLY_CLAUDE_*` knobs | | `GADFLY_SPECIALISTS` | default suite | csv of lenses, `all`, or `auto` (dynamic selection) | | `GADFLY_SELECTOR_MODEL` | review model | model that picks lenses in `auto` mode | | `GADFLY_WORKER_MODEL` | — | cheap model for `delegate_investigation`; unset = no delegation | diff --git a/cmd/gadfly/engine.go b/cmd/gadfly/engine.go new file mode 100644 index 0000000..160a9d7 --- /dev/null +++ b/cmd/gadfly/engine.go @@ -0,0 +1,165 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "strings" + + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" +) + +// reviewEngine runs a single agent pass against the checked-out repo and returns +// the model's text answer. It is the one primitive both review passes use — the +// draft review and the adversarial recheck — so the rest of the pipeline +// (specialist composition, recheck orchestration, consolidation, emit) is +// engine-agnostic. Two implementations: +// +// - majordomoEngine: the original path — a majordomo tool-using agent loop +// (read_file/grep/… over a sandboxed repoFS). +// - claudeCodeEngine: shells out to the `claude` CLI in print mode, which +// brings its OWN repo tools; gadfly just feeds it the prompt and reads back +// the final text. +// +// maxSteps is the tool-step budget for engines that have one (majordomo); the +// claude-code engine manages its own loop and ignores it. +type reviewEngine interface { + runPass(ctx context.Context, system, task string, maxSteps int) (string, error) +} + +// majordomoEngine drives the in-process majordomo agent over the repo sandbox. +type majordomoEngine struct { + mdl llm.Model + fsTools *repoFS +} + +func (e *majordomoEngine) runPass(ctx context.Context, system, task string, maxSteps int) (string, error) { + return runAgent(ctx, e.mdl, e.fsTools, system, task, maxSteps) +} + +// claudeCodeEngine reviews by shelling out to the `claude` CLI (Claude Code) in +// non-interactive print mode. Claude Code reads the checked-out tree with its +// own read tools (so it verifies findings against real code, like the agentic +// majordomo path), and we parse its final answer out of `--output-format json`. +// +// Auth is inherited from the environment: the default backend is a Pro/Max +// subscription via CLAUDE_CODE_OAUTH_TOKEN (no `--bare`). See README. +type claudeCodeEngine struct { + bin string // CLI binary (GADFLY_CLAUDE_BIN, default "claude") + model string // --model value ("" = CLI default) + repoDir string // cwd for the CLI, so its tools read the checked-out tree + permissionMode string // --permission-mode (default "plan": read-only, no edits) + allowedTools string // --allowedTools value, passed verbatim ("" = omit) + extraArgs []string // appended verbatim (GADFLY_CLAUDE_EXTRA_ARGS) +} + +// isClaudeCodeSpec reports whether a GADFLY_MODEL spec selects the claude-code +// engine: the bare id "claude-code" or a "claude-code/" form. +func isClaudeCodeSpec(model string) bool { + m := strings.TrimSpace(model) + return m == "claude-code" || strings.HasPrefix(m, "claude-code/") +} + +// newClaudeCodeEngine builds the engine from the GADFLY_MODEL spec and the +// optional GADFLY_CLAUDE_* overrides. The model after the slash in +// "claude-code/" becomes --model (e.g. "claude-code/sonnet" → "sonnet"); +// GADFLY_CLAUDE_MODEL overrides it. It does not verify the CLI is installed — +// a missing binary surfaces as a normal pass error (advisory, never fatal). +func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine { + model := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL")) + if model == "" { + if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok { + model = strings.TrimSpace(after) + } + } + return &claudeCodeEngine{ + bin: envOr("GADFLY_CLAUDE_BIN", "claude"), + model: model, + repoDir: repoDir, + permissionMode: envOr("GADFLY_CLAUDE_PERMISSION_MODE", "plan"), + allowedTools: strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_ALLOWED_TOOLS")), + extraArgs: strings.Fields(os.Getenv("GADFLY_CLAUDE_EXTRA_ARGS")), + } +} + +// args assembles the `claude` argv for one pass. Factored out (and pure) so it +// can be unit-tested without invoking the CLI. The system prompt is layered on +// top of Claude Code's own via --append-system-prompt; the task is the -p +// prompt. +func (e *claudeCodeEngine) args(system, task string) []string { + a := []string{"-p", task, "--output-format", "json", "--append-system-prompt", system} + if e.model != "" { + a = append(a, "--model", e.model) + } + if e.permissionMode != "" { + a = append(a, "--permission-mode", e.permissionMode) + } + if e.allowedTools != "" { + a = append(a, "--allowedTools", e.allowedTools) + } + return append(a, e.extraArgs...) +} + +// claudeResult is the subset of `claude --output-format json` we read. +type claudeResult struct { + Result string `json:"result"` + IsError bool `json:"is_error"` + Subtype string `json:"subtype"` +} + +func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ int) (string, error) { + cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...) + cmd.Dir = e.repoDir + cmd.Env = os.Environ() // inherits CLAUDE_CODE_OAUTH_TOKEN / ANTHROPIC_API_KEY + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + runErr := cmd.Run() + + // Prefer a structured answer; the CLI prints JSON even on some failures. + var res claudeResult + if jerr := json.Unmarshal(bytes.TrimSpace(stdout.Bytes()), &res); jerr == nil { + if res.IsError { + return "", fmt.Errorf("claude reported error (%s): %s", res.Subtype, truncateForErr(res.Result)) + } + if out := strings.TrimSpace(res.Result); out != "" { + return out, nil + } + } + + if runErr != nil { + detail := truncateForErr(stderr.String()) + if detail == "" { + detail = truncateForErr(stdout.String()) + } + return "", fmt.Errorf("claude -p failed: %v: %s", runErr, detail) + } + // Ran cleanly but we couldn't pull a result out of the JSON: fall back to + // raw stdout so a format change degrades to "use the text" instead of empty. + if raw := strings.TrimSpace(stdout.String()); raw != "" { + return raw, nil + } + return "", fmt.Errorf("claude -p produced no parseable output") +} + +// truncateForErr caps CLI error detail so a stderr dump can't bloat the comment. +func truncateForErr(s string) string { + s = strings.TrimSpace(s) + const max = 800 + if len(s) > max { + return s[:max] + "…" + } + return s +} + +// envOr returns the env var value or a default when unset/blank. +func envOr(name, def string) string { + if v := strings.TrimSpace(os.Getenv(name)); v != "" { + return v + } + return def +} diff --git a/cmd/gadfly/engine_test.go b/cmd/gadfly/engine_test.go new file mode 100644 index 0000000..69dce18 --- /dev/null +++ b/cmd/gadfly/engine_test.go @@ -0,0 +1,115 @@ +package main + +import ( + "slices" + "strings" + "testing" +) + +func TestIsClaudeCodeSpec(t *testing.T) { + cases := map[string]bool{ + "claude-code": true, + "claude-code/sonnet": true, + "claude-code/opus": true, + "claude-code/claude-opus-4-8": true, + " claude-code ": true, // trimmed + "qwen3-coder:480b-cloud": false, + "claude-code-extra": false, // not the bare id, not a "/" form + "sonnet": false, + "": false, + } + for spec, want := range cases { + if got := isClaudeCodeSpec(spec); got != want { + t.Errorf("isClaudeCodeSpec(%q) = %v, want %v", spec, got, want) + } + } +} + +func TestNewClaudeCodeEngineModel(t *testing.T) { + // model derived from the spec's "/" suffix + t.Setenv("GADFLY_CLAUDE_MODEL", "") + if e := newClaudeCodeEngine("claude-code/sonnet", "/repo"); e.model != "sonnet" { + t.Errorf("model = %q, want sonnet", e.model) + } + // bare spec → CLI default (no --model) + if e := newClaudeCodeEngine("claude-code", "/repo"); e.model != "" { + t.Errorf("model = %q, want empty for bare spec", e.model) + } + // GADFLY_CLAUDE_MODEL overrides the spec suffix + t.Setenv("GADFLY_CLAUDE_MODEL", "opus") + if e := newClaudeCodeEngine("claude-code/sonnet", "/repo"); e.model != "opus" { + t.Errorf("model = %q, want opus (env override)", e.model) + } +} + +func TestClaudeCodeEngineDefaults(t *testing.T) { + t.Setenv("GADFLY_CLAUDE_BIN", "") + t.Setenv("GADFLY_CLAUDE_PERMISSION_MODE", "") + t.Setenv("GADFLY_CLAUDE_ALLOWED_TOOLS", "") + t.Setenv("GADFLY_CLAUDE_EXTRA_ARGS", "") + e := newClaudeCodeEngine("claude-code", "/repo") + if e.bin != "claude" { + t.Errorf("bin = %q, want claude", e.bin) + } + if e.permissionMode != "plan" { + t.Errorf("permissionMode = %q, want plan", e.permissionMode) + } + if e.repoDir != "/repo" { + t.Errorf("repoDir = %q, want /repo", e.repoDir) + } +} + +// argAfter returns the value following flag in args, or "" if absent. +func argAfter(args []string, flag string) string { + if i := slices.Index(args, flag); i >= 0 && i+1 < len(args) { + return args[i+1] + } + return "" +} + +func TestClaudeCodeArgs(t *testing.T) { + t.Setenv("GADFLY_CLAUDE_MODEL", "") + t.Setenv("GADFLY_CLAUDE_PERMISSION_MODE", "") + t.Setenv("GADFLY_CLAUDE_ALLOWED_TOOLS", "Read,Grep,Glob") + t.Setenv("GADFLY_CLAUDE_EXTRA_ARGS", "--max-turns 30") + e := newClaudeCodeEngine("claude-code/sonnet", "/repo") + args := e.args("SYS-PROMPT", "TASK-PROMPT") + + // task is the -p value; json output; system appended; model + policy present. + if argAfter(args, "-p") != "TASK-PROMPT" { + t.Errorf("-p = %q, want TASK-PROMPT", argAfter(args, "-p")) + } + if argAfter(args, "--output-format") != "json" { + t.Errorf("--output-format = %q, want json", argAfter(args, "--output-format")) + } + if argAfter(args, "--append-system-prompt") != "SYS-PROMPT" { + t.Errorf("--append-system-prompt = %q, want SYS-PROMPT", argAfter(args, "--append-system-prompt")) + } + if argAfter(args, "--model") != "sonnet" { + t.Errorf("--model = %q, want sonnet", argAfter(args, "--model")) + } + if argAfter(args, "--permission-mode") != "plan" { + t.Errorf("--permission-mode = %q, want plan", argAfter(args, "--permission-mode")) + } + if argAfter(args, "--allowedTools") != "Read,Grep,Glob" { + t.Errorf("--allowedTools = %q, want Read,Grep,Glob", argAfter(args, "--allowedTools")) + } + // extra args appended verbatim (split on whitespace) + if !strings.Contains(strings.Join(args, " "), "--max-turns 30") { + t.Errorf("extra args not appended: %v", args) + } +} + +func TestClaudeCodeArgsBareModelOmitsFlag(t *testing.T) { + t.Setenv("GADFLY_CLAUDE_MODEL", "") + t.Setenv("GADFLY_CLAUDE_ALLOWED_TOOLS", "") // omit when blank + t.Setenv("GADFLY_CLAUDE_EXTRA_ARGS", "") + e := newClaudeCodeEngine("claude-code", "/repo") + args := e.args("s", "t") + if slices.Contains(args, "--model") { + t.Errorf("--model should be omitted for a bare claude-code spec: %v", args) + } + if slices.Contains(args, "--allowedTools") { + t.Errorf("--allowedTools should be omitted when blank: %v", args) + } +} diff --git a/cmd/gadfly/lens_concurrency_test.go b/cmd/gadfly/lens_concurrency_test.go index 2cfc577..f91c008 100644 --- a/cmd/gadfly/lens_concurrency_test.go +++ b/cmd/gadfly/lens_concurrency_test.go @@ -104,7 +104,7 @@ func TestRunSpecialists_FansOut(t *testing.T) { } specs := threeLenses() - results := runSpecialists(mdl, fs, "sys", specs, "task", "diff") + results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff") if got := peak(); got != 3 { t.Errorf("peak concurrent lenses = %d, want 3", got) @@ -124,7 +124,7 @@ func TestRunSpecialists_SequentialByDefault(t *testing.T) { } specs := threeLenses() - results := runSpecialists(mdl, fs, "sys", specs, "task", "diff") + results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff") if got := peak(); got != 1 { t.Errorf("peak concurrent lenses = %d, want 1 (sequential by default)", got) @@ -146,7 +146,7 @@ func TestRunSpecialists_PerProviderFanOut(t *testing.T) { } specs := threeLenses() - results := runSpecialists(mdl, fs, "sys", specs, "task", "diff") + results := runSpecialists(&majordomoEngine{mdl: mdl, fsTools: fs}, "sys", specs, "task", "diff") if got := peak(); got != 3 { t.Errorf("peak concurrent lenses = %d, want 3 (m1 per-provider override)", got) diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index 3718c56..6f29944 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -149,17 +149,27 @@ func run() error { return err } - mdl, err := resolveModel() - if err != nil { - return fmt.Errorf("resolve model: %w", err) - } - - // Optional cheap worker for delegate_investigation. Non-fatal: a bad worker - // spec just disables delegation rather than sinking the review. - if worker, werr := resolveWorkerModel(); werr != nil { - fmt.Fprintln(os.Stderr, "gadfly: worker model disabled:", werr) - } else if worker != nil { - fsTools.worker = worker + // Resolve the review engine. The claude-code engine shells out to the + // `claude` CLI (its own repo tools); every other spec is a majordomo model. + // auto-selection and the delegate worker are majordomo-only — with + // claude-code they're skipped (Claude Code does its own legwork). + ccSpec := isClaudeCodeSpec(os.Getenv("GADFLY_MODEL")) + var eng reviewEngine + if ccSpec { + eng = newClaudeCodeEngine(os.Getenv("GADFLY_MODEL"), fsTools.root) + } else { + mdl, merr := resolveModel() + if merr != nil { + return fmt.Errorf("resolve model: %w", merr) + } + // Optional cheap worker for delegate_investigation. Non-fatal: a bad + // worker spec just disables delegation rather than sinking the review. + if worker, werr := resolveWorkerModel(); werr != nil { + fmt.Fprintln(os.Stderr, "gadfly: worker model disabled:", werr) + } else if worker != nil { + fsTools.worker = worker + } + eng = &majordomoEngine{mdl: mdl, fsTools: fsTools} } specialists, registry, auto, serrs := resolveSpecialists(repoDir) @@ -168,20 +178,26 @@ func run() error { } // Dynamic selection: a (cheap) model picks the lenses this diff needs. + // Majordomo-only — the selector is an llm.Model. if auto { - selector, serr := resolveSelectorModel(mdl) - if serr != nil { - return fmt.Errorf("resolve selector model: %w", serr) - } - selCtx, cancel := context.WithTimeout(context.Background(), autoSelectTimeout) - picked, aerr := autoSelectSpecialists(selCtx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry) - cancel() - if aerr != nil { - fmt.Fprintln(os.Stderr, "gadfly: auto-select failed; falling back to the default suite:", aerr) + if ccSpec { + fmt.Fprintln(os.Stderr, "gadfly: auto-select is not supported with the claude-code engine; using the default suite") specialists = suiteFromRegistry(registry, defaultSuite) } else { - specialists = picked - fmt.Fprintln(os.Stderr, "gadfly: auto-selected specialists:", specialistNamesOf(specialists)) + selector, serr := resolveSelectorModel(eng.(*majordomoEngine).mdl) + if serr != nil { + return fmt.Errorf("resolve selector model: %w", serr) + } + selCtx, cancel := context.WithTimeout(context.Background(), autoSelectTimeout) + picked, aerr := autoSelectSpecialists(selCtx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry) + cancel() + if aerr != nil { + fmt.Fprintln(os.Stderr, "gadfly: auto-select failed; falling back to the default suite:", aerr) + specialists = suiteFromRegistry(registry, defaultSuite) + } else { + specialists = picked + fmt.Fprintln(os.Stderr, "gadfly: auto-selected specialists:", specialistNamesOf(specialists)) + } } } @@ -191,7 +207,7 @@ func run() error { base := string(systemBytes) task := buildTask(diff) - results := runSpecialists(mdl, fsTools, base, specialists, task, diff) + results := runSpecialists(eng, base, specialists, task, diff) fmt.Println(renderConsolidated(results)) @@ -215,7 +231,7 @@ func run() error { // per-provider model concurrency, so total concurrent backend requests ≈ // (models at once) × (lenses at once). To fan lenses out without oversubscribing // the backend, run models one at a time (provider lane cap 1) and raise this. -func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []Specialist, task, diff string) []specialistResult { +func runSpecialists(eng reviewEngine, base string, specialists []Specialist, task, diff string) []specialistResult { results := make([]specialistResult, len(specialists)) // Optional live status board: publishes this model's per-lens progress to a @@ -244,7 +260,7 @@ func runSpecialists(mdl llm.Model, fsTools *repoFS, base string, specialists []S } }() sw.set(sp.Name, lensRunning, "", false) - out, errored := reviewWithSpecialist(mdl, fsTools, base, sp, task, diff) + out, errored := reviewWithSpecialist(eng, base, sp, task, diff) v := parseVerdict(out) results[i] = specialistResult{spec: sp, out: out, verdict: v, errored: errored} sw.set(sp.Name, lensFinished, v.label(), errored) @@ -290,12 +306,12 @@ func providerOverride(envName, provider string) (int, bool) { // specialist's composed prompt, then the shared adversarial recheck pass. The // returned bool is true when the review pass failed (rendered as an inline // notice — advisory; one lens failing never sinks the others or the job). -func reviewWithSpecialist(mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) (string, bool) { +func reviewWithSpecialist(eng reviewEngine, base string, sp Specialist, task, diff string) (string, bool) { timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - draft, err := runAgent(ctx, mdl, fsTools, composeSpecialistPrompt(base, sp), task, + draft, err := eng.runPass(ctx, composeSpecialistPrompt(base, sp), task, envInt("GADFLY_MAX_STEPS", defaultMaxSteps)) if err != nil { fmt.Fprintf(os.Stderr, "gadfly: specialist %q review pass failed: %v\n", sp.Name, err) @@ -304,7 +320,7 @@ func reviewWithSpecialist(mdl llm.Model, fsTools *repoFS, base string, sp Specia final := draft if shouldRecheck(draft) { - rechecked, rerr := runAgent(ctx, mdl, fsTools, recheckSystemPrompt, buildRecheckTask(draft, diff), + rechecked, rerr := eng.runPass(ctx, recheckSystemPrompt, buildRecheckTask(draft, diff), envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps)) if rerr != nil { fmt.Fprintf(os.Stderr, "gadfly: specialist %q recheck failed; emitting unverified draft: %v\n", sp.Name, rerr) diff --git a/entrypoint.sh b/entrypoint.sh index 23649b3..e15972c 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -36,6 +36,9 @@ # e.g. "ollama" local, "openai", "anthropic", "google") # GADFLY_BASE_URL override backend endpoint (OpenAI/Ollama-compatible servers) # GADFLY_API_KEY provider key (else provider's standard env: OPENAI_API_KEY, …) +# CLAUDE_CODE_OAUTH_TOKEN auth for the claude-code engine (GADFLY_MODELS entry +# "claude-code"/"claude-code/"); Pro/Max subscription +# token from `claude setup-token`. Else ANTHROPIC_API_KEY. # GADFLY_TRIGGER_PHRASE comment phrase that triggers a re-review (default "@gadfly review") # GADFLY_ALLOWED_USERS comma-separated usernames allowed to comment-trigger; # empty => fall back to "is a repo collaborator" diff --git a/examples/README.md b/examples/README.md index 4879543..55589d5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -10,6 +10,7 @@ set the secrets/vars it references. Gadfly is advisory only — it never blocks | [`local-ollama.yml`](local-ollama.yml) | a **local/LAN Ollama** daemon | nothing (or `GADFLY_BASE_URL` for a remote host) | | [`openai-compatible.yml`](openai-compatible.yml) | any **OpenAI-compatible** endpoint (local Ollama `/v1`, gateway, vLLM, OpenRouter…) | `GADFLY_BASE_URL` (+ a key for most gateways) | | [`endpoint-aliases.yml`](endpoint-aliases.yml) | **several named backends** at once (one comment each) | repo vars `GADFLY_ENDPOINT_` | +| [`claude-code.yml`](claude-code.yml) | the bundled **Claude Code CLI** engine (`claude-code/`) | secret `CLAUDE_CODE_OAUTH_TOKEN` (or `ANTHROPIC_API_KEY`) | | [`.gadfly.yml`](.gadfly.yml) | **per-repo specialist config** (not a workflow — goes at your repo root) | — | Common to all: diff --git a/examples/claude-code.yml b/examples/claude-code.yml new file mode 100644 index 0000000..6cf965a --- /dev/null +++ b/examples/claude-code.yml @@ -0,0 +1,71 @@ +# Gadfly reviewing via the Claude Code CLI engine. +# Copy to .gitea/workflows/adversarial-review.yml in your repo. +# +# Instead of a majordomo model, each lens shells out to the bundled `claude` CLI +# inside the checked-out repo (it uses its own Read/Grep/Glob tools to verify +# findings), then Gadfly runs its usual verdict + recheck + consolidate pipeline. +# +# Auth: a Pro/Max subscription token from `claude setup-token` (no --bare), +# stored as the CLAUDE_CODE_OAUTH_TOKEN secret. Falls back to ANTHROPIC_API_KEY +# if you'd rather pay per-token — set only ONE. +# +# Heads-up: this engine is wired but not yet validated end-to-end here, and using +# subscription auth in automated CI is a gray area in Anthropic's terms — read +# the README's "Claude Code engine" note before relying on it. + +name: Adversarial Review (Gadfly) + +on: + pull_request: + types: [opened, reopened, ready_for_review] + issue_comment: + types: [created] + workflow_dispatch: + inputs: + pr_number: { description: "PR number to review", required: true } + +permissions: + contents: read + issues: write + pull-requests: write + +concurrency: + group: gadfly-${{ github.event.issue.number || github.event.pull_request.number || github.event.inputs.pr_number }} + cancel-in-progress: true + +jobs: + review: + # Security: only trusted users may trigger a secret-bearing run via a PR + # comment. Replace the username(s) below with your maintainers — keep them in + # sync with GADFLY_ALLOWED_USERS (the in-container belt-and-suspenders check). + if: >- + github.event_name != 'issue_comment' + || github.actor == 'your-username' + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:latest + env: + GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }} + GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} + # --- Claude Code engine --- + # Pro/Max subscription token (preferred). Or set ANTHROPIC_API_KEY + # instead for per-token billing — but never both. + CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + # ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + # bare "claude-code" uses the CLI default model; "claude-code/" + # sets --model (sonnet/opus/haiku, or a full id). One comment per entry. + GADFLY_MODELS: "claude-code/sonnet" + # Optional CLI tuning (defaults are read-only-safe): + # GADFLY_CLAUDE_PERMISSION_MODE: plan # read-only; never edits + # GADFLY_CLAUDE_ALLOWED_TOOLS: "Read,Grep,Glob" + # GADFLY_CLAUDE_EXTRA_ARGS: "--max-turns 30" + GADFLY_ALLOWED_USERS: "your-username" + # --- event context (leave as-is) --- + EVENT_NAME: ${{ github.event_name }} + PR: ${{ github.event.pull_request.number || github.event.issue.number || github.event.inputs.pr_number }} + PR_BRANCH: ${{ github.head_ref }} + IS_DRAFT: ${{ github.event.pull_request.draft }} + COMMENT_BODY: ${{ github.event.comment.body }} + COMMENT_ID: ${{ github.event.comment.id }} + ACTOR: ${{ github.actor }} diff --git a/scripts/run.sh b/scripts/run.sh index 0394f9e..8e9b396 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -23,6 +23,12 @@ # GADFLY_REPO_DIR (checked-out repo; default: this script's repo) # antigravity: `agy` on PATH with credentials already seeded (~/.gemini) # +# claude-code engine: when MODEL is "claude-code" or "claude-code/" the +# binary shells out to the bundled `claude` CLI instead of a majordomo model. +# Its auth (CLAUDE_CODE_OAUTH_TOKEN, else ANTHROPIC_API_KEY) and GADFLY_CLAUDE_* +# tuning are read straight from the inherited environment — same as the other +# provider keys (OPENAI_API_KEY, …) — so no extra wiring is needed here. +# # Optional: # MAX_DIFF_CHARS diff truncation cap for the prompt (default 60000) # GADFLY_STATUS_FILE per-model JSON path for the live status board (set by