feat: add claude-code/opus reviewer + max-thinking spec support (#5)
Build & push image / build-and-push (push) Successful in 15s

Adds claude-code/opus to gadfly's dogfood swarm (both sonnet and opus run
end-to-end), bumps the image pin to :sha-80d8f53 so the clean-lens
telemetry fix is live, and adds engine support for a
"claude-code/<model>:max" extended-thinking spec (MAX_THINKING_TOKENS,
best-effort). Validated: only 13 findings on this clean PR vs 43 on the
comparable #4 — the telemetry fix works.

Folded in the swarm's two real findings: a runPass env-injection test and
keeping MAX_THINKING_TOKENS in claudeEnv. Follow-up enables
claude-code/opus:max once this image builds.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Co-authored-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
Co-committed-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
This commit was merged in pull request #5.
This commit is contained in:
2026-06-27 22:39:14 +00:00
committed by steve
parent 80d8f53f63
commit c342bdb905
4 changed files with 125 additions and 20 deletions
+9 -9
View File
@@ -45,7 +45,7 @@ jobs:
# with the 3-lens suite. All cloud now, so runs are fast. # with the 3-lens suite. All cloud now, so runs are fast.
timeout-minutes: 90 timeout-minutes: 90
steps: steps:
- uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-86f12c1 - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-80d8f53
env: env:
GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }} GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }}
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
@@ -54,14 +54,14 @@ jobs:
# below): Pro/Max subscription token. Dogfoods the Phase-1 engine on # below): Pro/Max subscription token. Dogfoods the Phase-1 engine on
# gadfly's own PRs as a competitor alongside the Ollama models. # gadfly's own PRs as a competitor alongside the Ollama models.
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
# Fleet: 6 cloud (3 at a time) + Claude Code (sonnet) — one consolidated # Fleet: 6 cloud (3 at a time) + Claude Code (sonnet + opus) — one
# comment each, all cloud now. The local Macs (m1/m5) and the weaker # consolidated comment each, all cloud now. The local Macs (m1/m5) and
# cloud models (gemma4, gpt-oss:120b, kimi-k2.7-code) were dropped as # the weaker cloud models (gemma4, gpt-oss:120b, kimi-k2.7-code) were
# low-signal for gadfly's own PRs. claude-code/sonnet runs the Phase-1 # dropped as low-signal. The claude-code/* entries run the Phase-1
# engine as a competitor in its own lane (needs CLAUDE_CODE_OAUTH_TOKEN). # engine as competitors in their own lane (need CLAUDE_CODE_OAUTH_TOKEN).
GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,qwen3-coder:480b-cloud,claude-code/sonnet" GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,qwen3-coder:480b-cloud,claude-code/sonnet,claude-code/opus"
# cloud runs 3 at once; claude-code one at a time; both lanes parallel. # cloud runs 3 at once; claude-code 2 at a time; both lanes parallel.
GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,claude-code=1" GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,claude-code=2"
# 3 cloud models x 3 lenses = 9 concurrent ollama-cloud queries (under the 10 budget). # 3 cloud models x 3 lenses = 9 concurrent ollama-cloud queries (under the 10 budget).
GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3" GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3"
# Default => the 3-lens suite (security, correctness, error-handling). # Default => the 3-lens suite (security, correctness, error-handling).
+8 -2
View File
@@ -89,12 +89,18 @@ parses the result and runs the same verdict-parse → recheck → consolidate
CLI is bundled in the image (Node + `@anthropic-ai/claude-code`). CLI is bundled in the image (Node + `@anthropic-ai/claude-code`).
Select it as a model id — bare `claude-code` (CLI default model) or `claude-code/<model>` (the Select it as a model id — bare `claude-code` (CLI default model) or `claude-code/<model>` (the
suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`): suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`). An optional
`:<thinking>` suffix forces an extended-thinking budget for that reviewer — `:max` (the high
"ultrathink" tier) or `:<n>` for a specific token budget — so you can run the same model at two
thinking depths as separate reviewers:
```yaml ```yaml
GADFLY_MODELS: "claude-code/sonnet,claude-code/opus" GADFLY_MODELS: "claude-code/sonnet,claude-code/opus,claude-code/opus:max"
``` ```
The thinking budget is applied via the `MAX_THINKING_TOKENS` env on the CLI subprocess; it's
best-effort (a no-op if the installed CLI build doesn't honor it).
Auth is read from the environment: the default is a **Pro/Max subscription** via Auth is read from the environment: the default is a **Pro/Max subscription** via
`CLAUDE_CODE_OAUTH_TOKEN` (from `claude setup-token`; no `--bare`), falling back to `CLAUDE_CODE_OAUTH_TOKEN` (from `claude setup-token`; no `--bare`), falling back to
`ANTHROPIC_API_KEY`. Don't set both. Tuning knobs (all optional): `ANTHROPIC_API_KEY`. Don't set both. Tuning knobs (all optional):
+46 -9
View File
@@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"os" "os"
"os/exec" "os/exec"
"strconv"
"strings" "strings"
"syscall" "syscall"
"unicode/utf8" "unicode/utf8"
@@ -56,8 +57,14 @@ type claudeCodeEngine struct {
permissionMode string // --permission-mode (default "plan": read-only, no edits) permissionMode string // --permission-mode (default "plan": read-only, no edits)
allowedTools string // --allowedTools value, passed verbatim ("" = omit) allowedTools string // --allowedTools value, passed verbatim ("" = omit)
extraArgs []string // appended verbatim (GADFLY_CLAUDE_EXTRA_ARGS) extraArgs []string // appended verbatim (GADFLY_CLAUDE_EXTRA_ARGS)
thinkingTokens int // MAX_THINKING_TOKENS for the subprocess; 0 = leave default
} }
// maxThinkingTokens is the extended-thinking budget used for a "claude-code/<model>:max"
// spec — Claude Code's high "ultrathink" tier. Set as MAX_THINKING_TOKENS on the
// subprocess; harmless (a no-op) if the CLI build doesn't honor it.
const maxThinkingTokens = 31999
// isClaudeCodeSpec reports whether a GADFLY_MODEL spec selects the claude-code // isClaudeCodeSpec reports whether a GADFLY_MODEL spec selects the claude-code
// engine: the bare id "claude-code" or a "claude-code/<model>" form. // engine: the bare id "claude-code" or a "claude-code/<model>" form.
func isClaudeCodeSpec(model string) bool { func isClaudeCodeSpec(model string) bool {
@@ -66,17 +73,30 @@ func isClaudeCodeSpec(model string) bool {
} }
// newClaudeCodeEngine builds the engine from the GADFLY_MODEL spec and the // newClaudeCodeEngine builds the engine from the GADFLY_MODEL spec and the
// optional GADFLY_CLAUDE_* overrides. The model after the slash in // optional GADFLY_CLAUDE_* overrides. The part after the slash in
// "claude-code/<model>" becomes --model (e.g. "claude-code/sonnet" → "sonnet"); // "claude-code/<model>[:<thinking>]" becomes --model (e.g. "claude-code/sonnet"
// GADFLY_CLAUDE_MODEL overrides it. It does not verify the CLI is installed — // → "sonnet"), with an optional thinking suffix: ":max" forces the high
// a missing binary surfaces as a normal pass error (advisory, never fatal). // extended-thinking budget and ":<n>" sets a specific MAX_THINKING_TOKENS.
// GADFLY_CLAUDE_MODEL overrides the model id (the thinking suffix still applies).
// It does not verify the CLI is installed — a missing binary surfaces as a normal
// pass error (advisory, never fatal).
func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine { func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine {
model := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL")) var model string
if model == "" { thinking := 0
if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok { if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok {
model = strings.TrimSpace(after) after = strings.TrimSpace(after)
// Optional ":<thinking>" suffix. Claude model aliases/ids contain no ":",
// so a colon unambiguously separates the thinking tier here.
if m, t, hasColon := strings.Cut(after, ":"); hasColon {
model = strings.TrimSpace(m)
thinking = parseThinking(strings.TrimSpace(t))
} else {
model = after
} }
} }
if env := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL")); env != "" {
model = env
}
return &claudeCodeEngine{ return &claudeCodeEngine{
bin: envOr("GADFLY_CLAUDE_BIN", "claude"), bin: envOr("GADFLY_CLAUDE_BIN", "claude"),
model: model, model: model,
@@ -84,9 +104,22 @@ func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine {
permissionMode: envOr("GADFLY_CLAUDE_PERMISSION_MODE", "plan"), permissionMode: envOr("GADFLY_CLAUDE_PERMISSION_MODE", "plan"),
allowedTools: strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_ALLOWED_TOOLS")), allowedTools: strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_ALLOWED_TOOLS")),
extraArgs: strings.Fields(os.Getenv("GADFLY_CLAUDE_EXTRA_ARGS")), extraArgs: strings.Fields(os.Getenv("GADFLY_CLAUDE_EXTRA_ARGS")),
thinkingTokens: thinking,
} }
} }
// parseThinking maps a spec thinking suffix to a MAX_THINKING_TOKENS budget:
// "max" → maxThinkingTokens, a positive integer → itself, anything else → 0 (off).
func parseThinking(s string) int {
if strings.EqualFold(s, "max") {
return maxThinkingTokens
}
if n, err := strconv.Atoi(s); err == nil && n > 0 {
return n
}
return 0
}
// args assembles the `claude` argv for one pass. Factored out (and pure) so it // args assembles the `claude` argv for one pass. Factored out (and pure) so it
// can be unit-tested without invoking the CLI. The system prompt is layered on // can be unit-tested without invoking the CLI. The system prompt is layered on
// top of Claude Code's own via --append-system-prompt; the task is the -p // top of Claude Code's own via --append-system-prompt; the task is the -p
@@ -116,6 +149,10 @@ func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ i
cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...) cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...)
cmd.Dir = e.repoDir cmd.Dir = e.repoDir
cmd.Env = claudeEnv() // minimal env — don't hand GITEA_TOKEN et al. to the CLI cmd.Env = claudeEnv() // minimal env — don't hand GITEA_TOKEN et al. to the CLI
if e.thinkingTokens > 0 {
// Force an extended-thinking budget for this run (a "...:max" spec).
cmd.Env = append(cmd.Env, "MAX_THINKING_TOKENS="+strconv.Itoa(e.thinkingTokens))
}
// Put the CLI and the Node children it spawns in their own process group and // Put the CLI and the Node children it spawns in their own process group and
// kill the WHOLE group on context cancel, so a timed-out lens can't leave // kill the WHOLE group on context cancel, so a timed-out lens can't leave
// orphaned claude/node processes behind in the container. // orphaned claude/node processes behind in the container.
@@ -184,7 +221,7 @@ func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ i
func claudeEnv() []string { func claudeEnv() []string {
keep := func(k string) bool { keep := func(k string) bool {
switch k { switch k {
case "PATH", "HOME", "USER", "LOGNAME", "TMPDIR", "LANG", "TERM", "SHELL": case "PATH", "HOME", "USER", "LOGNAME", "TMPDIR", "LANG", "TERM", "SHELL", "MAX_THINKING_TOKENS":
return true return true
} }
return strings.HasPrefix(k, "LC_") || return strings.HasPrefix(k, "LC_") ||
+62
View File
@@ -217,3 +217,65 @@ func TestRunPassNonZeroNoJSON(t *testing.T) {
t.Fatalf("non-zero exit should error with detail, got %v", err) t.Fatalf("non-zero exit should error with detail, got %v", err)
} }
} }
func TestClaudeCodeThinking(t *testing.T) {
t.Setenv("GADFLY_CLAUDE_MODEL", "")
cases := []struct {
spec string
wantModel string
wantThink int
}{
{"claude-code/opus", "opus", 0},
{"claude-code/opus:max", "opus", maxThinkingTokens},
{"claude-code/sonnet:20000", "sonnet", 20000},
{"claude-code/opus:bogus", "opus", 0}, // unrecognized suffix -> off
{"claude-code", "", 0},
}
for _, c := range cases {
e := newClaudeCodeEngine(c.spec, "/repo")
if e.model != c.wantModel || e.thinkingTokens != c.wantThink {
t.Errorf("newClaudeCodeEngine(%q) = (model %q, think %d), want (%q, %d)",
c.spec, e.model, e.thinkingTokens, c.wantModel, c.wantThink)
}
}
}
func TestClaudeCodeThinkingEnvOverrideKeepsSuffix(t *testing.T) {
// GADFLY_CLAUDE_MODEL overrides the model id, but the :max thinking from the
// spec still applies.
t.Setenv("GADFLY_CLAUDE_MODEL", "claude-opus-4-8")
e := newClaudeCodeEngine("claude-code/opus:max", "/repo")
if e.model != "claude-opus-4-8" {
t.Errorf("model = %q, want claude-opus-4-8 (env override)", e.model)
}
if e.thinkingTokens != maxThinkingTokens {
t.Errorf("thinkingTokens = %d, want %d (suffix still applies)", e.thinkingTokens, maxThinkingTokens)
}
}
// TestRunPassInjectsThinkingTokens verifies the engine actually puts
// MAX_THINKING_TOKENS into the subprocess env for a ":max" spec (and not for a
// plain spec). The stub echoes the value it received back as the result.
func TestRunPassInjectsThinkingTokens(t *testing.T) {
t.Setenv("GADFLY_CLAUDE_MODEL", "")
t.Setenv("MAX_THINKING_TOKENS", "") // not in the test's own env
dir := t.TempDir()
stub := dir + "/claude-stub.sh"
// Report the env var the CLI was launched with.
script := "#!/bin/sh\nprintf '{\"result\":\"MTT=%s\",\"is_error\":false}' \"${MAX_THINKING_TOKENS:-unset}\"\n"
if err := os.WriteFile(stub, []byte(script), 0o755); err != nil {
t.Fatal(err)
}
maxEng := newClaudeCodeEngine("claude-code/opus:max", dir)
maxEng.bin = stub
if out, err := maxEng.runPass(context.Background(), "s", "t", 0); err != nil || out != "MTT=31999" {
t.Fatalf(":max run: got (%q, %v), want (MTT=31999, nil)", out, err)
}
plainEng := newClaudeCodeEngine("claude-code/opus", dir)
plainEng.bin = stub
if out, err := plainEng.runPass(context.Background(), "s", "t", 0); err != nil || out != "MTT=unset" {
t.Fatalf("plain run: got (%q, %v), want (MTT=unset, nil)", out, err)
}
}