From c342bdb9052780061125b4cc0a8b4c5fd110dce9 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sat, 27 Jun 2026 22:39:14 +0000 Subject: [PATCH] feat: add claude-code/opus reviewer + max-thinking spec support (#5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds claude-code/opus to gadfly's dogfood swarm (both sonnet and opus run end-to-end), bumps the image pin to :sha-80d8f53 so the clean-lens telemetry fix is live, and adds engine support for a "claude-code/:max" extended-thinking spec (MAX_THINKING_TOKENS, best-effort). Validated: only 13 findings on this clean PR vs 43 on the comparable #4 — the telemetry fix works. Folded in the swarm's two real findings: a runPass env-injection test and keeping MAX_THINKING_TOKENS in claudeEnv. Follow-up enables claude-code/opus:max once this image builds. Co-Authored-By: Claude Opus 4.8 (1M context) Co-authored-by: Steve Dudenhoeffer Co-committed-by: Steve Dudenhoeffer --- .gitea/workflows/adversarial-review.yml | 18 +++---- README.md | 10 +++- cmd/gadfly/engine.go | 55 ++++++++++++++++++---- cmd/gadfly/engine_test.go | 62 +++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 20 deletions(-) diff --git a/.gitea/workflows/adversarial-review.yml b/.gitea/workflows/adversarial-review.yml index b58834e..6b913d9 100644 --- a/.gitea/workflows/adversarial-review.yml +++ b/.gitea/workflows/adversarial-review.yml @@ -45,7 +45,7 @@ jobs: # with the 3-lens suite. All cloud now, so runs are fast. timeout-minutes: 90 steps: - - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-86f12c1 + - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-80d8f53 env: GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }} GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} @@ -54,14 +54,14 @@ jobs: # below): Pro/Max subscription token. Dogfoods the Phase-1 engine on # gadfly's own PRs as a competitor alongside the Ollama models. CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - # Fleet: 6 cloud (3 at a time) + Claude Code (sonnet) — one consolidated - # comment each, all cloud now. The local Macs (m1/m5) and the weaker - # cloud models (gemma4, gpt-oss:120b, kimi-k2.7-code) were dropped as - # low-signal for gadfly's own PRs. claude-code/sonnet runs the Phase-1 - # engine as a competitor in its own lane (needs CLAUDE_CODE_OAUTH_TOKEN). - GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,qwen3-coder:480b-cloud,claude-code/sonnet" - # cloud runs 3 at once; claude-code one at a time; both lanes parallel. - GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,claude-code=1" + # Fleet: 6 cloud (3 at a time) + Claude Code (sonnet + opus) — one + # consolidated comment each, all cloud now. The local Macs (m1/m5) and + # the weaker cloud models (gemma4, gpt-oss:120b, kimi-k2.7-code) were + # dropped as low-signal. The claude-code/* entries run the Phase-1 + # engine as competitors in their own lane (need CLAUDE_CODE_OAUTH_TOKEN). + GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,qwen3-coder:480b-cloud,claude-code/sonnet,claude-code/opus" + # cloud runs 3 at once; claude-code 2 at a time; both lanes parallel. + GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,claude-code=2" # 3 cloud models x 3 lenses = 9 concurrent ollama-cloud queries (under the 10 budget). GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3" # Default => the 3-lens suite (security, correctness, error-handling). diff --git a/README.md b/README.md index 77870e1..348c222 100644 --- a/README.md +++ b/README.md @@ -89,12 +89,18 @@ parses the result and runs the same verdict-parse → recheck → consolidate CLI is bundled in the image (Node + `@anthropic-ai/claude-code`). Select it as a model id — bare `claude-code` (CLI default model) or `claude-code/` (the -suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`): +suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`). An optional +`:` suffix forces an extended-thinking budget for that reviewer — `:max` (the high +"ultrathink" tier) or `:` for a specific token budget — so you can run the same model at two +thinking depths as separate reviewers: ```yaml -GADFLY_MODELS: "claude-code/sonnet,claude-code/opus" +GADFLY_MODELS: "claude-code/sonnet,claude-code/opus,claude-code/opus:max" ``` +The thinking budget is applied via the `MAX_THINKING_TOKENS` env on the CLI subprocess; it's +best-effort (a no-op if the installed CLI build doesn't honor it). + Auth is read from the environment: the default is a **Pro/Max subscription** via `CLAUDE_CODE_OAUTH_TOKEN` (from `claude setup-token`; no `--bare`), falling back to `ANTHROPIC_API_KEY`. Don't set both. Tuning knobs (all optional): diff --git a/cmd/gadfly/engine.go b/cmd/gadfly/engine.go index 41aa7c8..7fe4c09 100644 --- a/cmd/gadfly/engine.go +++ b/cmd/gadfly/engine.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "os/exec" + "strconv" "strings" "syscall" "unicode/utf8" @@ -56,8 +57,14 @@ type claudeCodeEngine struct { permissionMode string // --permission-mode (default "plan": read-only, no edits) allowedTools string // --allowedTools value, passed verbatim ("" = omit) extraArgs []string // appended verbatim (GADFLY_CLAUDE_EXTRA_ARGS) + thinkingTokens int // MAX_THINKING_TOKENS for the subprocess; 0 = leave default } +// maxThinkingTokens is the extended-thinking budget used for a "claude-code/:max" +// spec — Claude Code's high "ultrathink" tier. Set as MAX_THINKING_TOKENS on the +// subprocess; harmless (a no-op) if the CLI build doesn't honor it. +const maxThinkingTokens = 31999 + // isClaudeCodeSpec reports whether a GADFLY_MODEL spec selects the claude-code // engine: the bare id "claude-code" or a "claude-code/" form. func isClaudeCodeSpec(model string) bool { @@ -66,17 +73,30 @@ func isClaudeCodeSpec(model string) bool { } // newClaudeCodeEngine builds the engine from the GADFLY_MODEL spec and the -// optional GADFLY_CLAUDE_* overrides. The model after the slash in -// "claude-code/" becomes --model (e.g. "claude-code/sonnet" → "sonnet"); -// GADFLY_CLAUDE_MODEL overrides it. It does not verify the CLI is installed — -// a missing binary surfaces as a normal pass error (advisory, never fatal). +// optional GADFLY_CLAUDE_* overrides. The part after the slash in +// "claude-code/[:]" becomes --model (e.g. "claude-code/sonnet" +// → "sonnet"), with an optional thinking suffix: ":max" forces the high +// extended-thinking budget and ":" sets a specific MAX_THINKING_TOKENS. +// GADFLY_CLAUDE_MODEL overrides the model id (the thinking suffix still applies). +// It does not verify the CLI is installed — a missing binary surfaces as a normal +// pass error (advisory, never fatal). func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine { - model := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL")) - if model == "" { - if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok { - model = strings.TrimSpace(after) + var model string + thinking := 0 + if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok { + after = strings.TrimSpace(after) + // Optional ":" suffix. Claude model aliases/ids contain no ":", + // so a colon unambiguously separates the thinking tier here. + if m, t, hasColon := strings.Cut(after, ":"); hasColon { + model = strings.TrimSpace(m) + thinking = parseThinking(strings.TrimSpace(t)) + } else { + model = after } } + if env := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL")); env != "" { + model = env + } return &claudeCodeEngine{ bin: envOr("GADFLY_CLAUDE_BIN", "claude"), model: model, @@ -84,9 +104,22 @@ func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine { permissionMode: envOr("GADFLY_CLAUDE_PERMISSION_MODE", "plan"), allowedTools: strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_ALLOWED_TOOLS")), extraArgs: strings.Fields(os.Getenv("GADFLY_CLAUDE_EXTRA_ARGS")), + thinkingTokens: thinking, } } +// parseThinking maps a spec thinking suffix to a MAX_THINKING_TOKENS budget: +// "max" → maxThinkingTokens, a positive integer → itself, anything else → 0 (off). +func parseThinking(s string) int { + if strings.EqualFold(s, "max") { + return maxThinkingTokens + } + if n, err := strconv.Atoi(s); err == nil && n > 0 { + return n + } + return 0 +} + // args assembles the `claude` argv for one pass. Factored out (and pure) so it // can be unit-tested without invoking the CLI. The system prompt is layered on // top of Claude Code's own via --append-system-prompt; the task is the -p @@ -116,6 +149,10 @@ func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ i cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...) cmd.Dir = e.repoDir cmd.Env = claudeEnv() // minimal env — don't hand GITEA_TOKEN et al. to the CLI + if e.thinkingTokens > 0 { + // Force an extended-thinking budget for this run (a "...:max" spec). + cmd.Env = append(cmd.Env, "MAX_THINKING_TOKENS="+strconv.Itoa(e.thinkingTokens)) + } // Put the CLI and the Node children it spawns in their own process group and // kill the WHOLE group on context cancel, so a timed-out lens can't leave // orphaned claude/node processes behind in the container. @@ -184,7 +221,7 @@ func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ i func claudeEnv() []string { keep := func(k string) bool { switch k { - case "PATH", "HOME", "USER", "LOGNAME", "TMPDIR", "LANG", "TERM", "SHELL": + case "PATH", "HOME", "USER", "LOGNAME", "TMPDIR", "LANG", "TERM", "SHELL", "MAX_THINKING_TOKENS": return true } return strings.HasPrefix(k, "LC_") || diff --git a/cmd/gadfly/engine_test.go b/cmd/gadfly/engine_test.go index 538093d..5acc756 100644 --- a/cmd/gadfly/engine_test.go +++ b/cmd/gadfly/engine_test.go @@ -217,3 +217,65 @@ func TestRunPassNonZeroNoJSON(t *testing.T) { t.Fatalf("non-zero exit should error with detail, got %v", err) } } + +func TestClaudeCodeThinking(t *testing.T) { + t.Setenv("GADFLY_CLAUDE_MODEL", "") + cases := []struct { + spec string + wantModel string + wantThink int + }{ + {"claude-code/opus", "opus", 0}, + {"claude-code/opus:max", "opus", maxThinkingTokens}, + {"claude-code/sonnet:20000", "sonnet", 20000}, + {"claude-code/opus:bogus", "opus", 0}, // unrecognized suffix -> off + {"claude-code", "", 0}, + } + for _, c := range cases { + e := newClaudeCodeEngine(c.spec, "/repo") + if e.model != c.wantModel || e.thinkingTokens != c.wantThink { + t.Errorf("newClaudeCodeEngine(%q) = (model %q, think %d), want (%q, %d)", + c.spec, e.model, e.thinkingTokens, c.wantModel, c.wantThink) + } + } +} + +func TestClaudeCodeThinkingEnvOverrideKeepsSuffix(t *testing.T) { + // GADFLY_CLAUDE_MODEL overrides the model id, but the :max thinking from the + // spec still applies. + t.Setenv("GADFLY_CLAUDE_MODEL", "claude-opus-4-8") + e := newClaudeCodeEngine("claude-code/opus:max", "/repo") + if e.model != "claude-opus-4-8" { + t.Errorf("model = %q, want claude-opus-4-8 (env override)", e.model) + } + if e.thinkingTokens != maxThinkingTokens { + t.Errorf("thinkingTokens = %d, want %d (suffix still applies)", e.thinkingTokens, maxThinkingTokens) + } +} + +// TestRunPassInjectsThinkingTokens verifies the engine actually puts +// MAX_THINKING_TOKENS into the subprocess env for a ":max" spec (and not for a +// plain spec). The stub echoes the value it received back as the result. +func TestRunPassInjectsThinkingTokens(t *testing.T) { + t.Setenv("GADFLY_CLAUDE_MODEL", "") + t.Setenv("MAX_THINKING_TOKENS", "") // not in the test's own env + dir := t.TempDir() + stub := dir + "/claude-stub.sh" + // Report the env var the CLI was launched with. + script := "#!/bin/sh\nprintf '{\"result\":\"MTT=%s\",\"is_error\":false}' \"${MAX_THINKING_TOKENS:-unset}\"\n" + if err := os.WriteFile(stub, []byte(script), 0o755); err != nil { + t.Fatal(err) + } + + maxEng := newClaudeCodeEngine("claude-code/opus:max", dir) + maxEng.bin = stub + if out, err := maxEng.runPass(context.Background(), "s", "t", 0); err != nil || out != "MTT=31999" { + t.Fatalf(":max run: got (%q, %v), want (MTT=31999, nil)", out, err) + } + + plainEng := newClaudeCodeEngine("claude-code/opus", dir) + plainEng.bin = stub + if out, err := plainEng.runPass(context.Background(), "s", "t", 0); err != nil || out != "MTT=unset" { + t.Fatalf("plain run: got (%q, %v), want (MTT=unset, nil)", out, err) + } +}