From af2d3a29389fd323af88cf30a526e57ed2d934c3 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
Date: Sat, 27 Jun 2026 18:23:23 -0400
Subject: [PATCH] feat: claude-code opus reviewer + max-thinking spec support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per Steve: add Claude Code opus to gadfly's own swarm, and prep a
max-thinking variant.

- Dogfood workflow: add claude-code/opus alongside claude-code/sonnet
  (claude-code lane bumped to 2 so they run in parallel), and bump the
  image pin to :sha-80d8f53 so the clean-lens telemetry fix from #4 is
  actually live in dogfood reviews.
- Engine: a "claude-code/<model>:<thinking>" spec now sets an extended-
  thinking budget for that run via MAX_THINKING_TOKENS on the subprocess
  — ":max" (high ultrathink tier, 31999 tokens) or ":<n>" (a positive
  integer). Best-effort: a no-op if the CLI build ignores it, and an
  unrecognized suffix just leaves thinking at the CLI default; it never
  errors. This ships the capability so a follow-up can enable
  claude-code/opus:max once this image builds (the currently-pinned
  image predates the parse and would mis-route it).
- README documents the :thinking suffix; new tests cover the spec parse.

gofmt clean, go vet quiet, go test -race green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitea/workflows/adversarial-review.yml | 18 ++++-----
 README.md                               | 10 ++++-
 cmd/gadfly/engine.go                    | 53 +++++++++++++++++++++----
 cmd/gadfly/engine_test.go               | 35 ++++++++++++++++
 4 files changed, 97 insertions(+), 19 deletions(-)
diff --git a/.gitea/workflows/adversarial-review.yml b/.gitea/workflows/adversarial-review.yml
index b58834e..6b913d9 100644
--- a/.gitea/workflows/adversarial-review.yml
+++ b/.gitea/workflows/adversarial-review.yml
@@ -45,7 +45,7 @@ jobs:
     # with the 3-lens suite. All cloud now, so runs are fast.
     timeout-minutes: 90
     steps:
-      - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-86f12c1
+      - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:sha-80d8f53
         env:
           GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }}
           GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
@@ -54,14 +54,14 @@ jobs:
           # below): Pro/Max subscription token. Dogfoods the Phase-1 engine on
           # gadfly's own PRs as a competitor alongside the Ollama models.
           CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-          # Fleet: 6 cloud (3 at a time) + Claude Code (sonnet) — one consolidated
-          # comment each, all cloud now. The local Macs (m1/m5) and the weaker
-          # cloud models (gemma4, gpt-oss:120b, kimi-k2.7-code) were dropped as
-          # low-signal for gadfly's own PRs. claude-code/sonnet runs the Phase-1
-          # engine as a competitor in its own lane (needs CLAUDE_CODE_OAUTH_TOKEN).
-          GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,qwen3-coder:480b-cloud,claude-code/sonnet"
-          # cloud runs 3 at once; claude-code one at a time; both lanes parallel.
-          GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,claude-code=1"
+          # Fleet: 6 cloud (3 at a time) + Claude Code (sonnet + opus) — one
+          # consolidated comment each, all cloud now. The local Macs (m1/m5) and
+          # the weaker cloud models (gemma4, gpt-oss:120b, kimi-k2.7-code) were
+          # dropped as low-signal. The claude-code/* entries run the Phase-1
+          # engine as competitors in their own lane (need CLAUDE_CODE_OAUTH_TOKEN).
+          GADFLY_MODELS: "minimax-m3:cloud,glm-5.2:cloud,glm-5.1:cloud,deepseek-v4-pro:cloud,nemotron-3-super:cloud,qwen3-coder:480b-cloud,claude-code/sonnet,claude-code/opus"
+          # cloud runs 3 at once; claude-code 2 at a time; both lanes parallel.
+          GADFLY_PROVIDER_CONCURRENCY: "ollama-cloud=3,claude-code=2"
           # 3 cloud models x 3 lenses = 9 concurrent ollama-cloud queries (under the 10 budget).
           GADFLY_PROVIDER_LENS_CONCURRENCY: "ollama-cloud=3"
           # Default => the 3-lens suite (security, correctness, error-handling).
diff --git a/README.md b/README.md
index 77870e1..348c222 100644
--- a/README.md
+++ b/README.md
@@ -89,12 +89,18 @@ parses the result and runs the same verdict-parse → recheck → consolidate 
 CLI is bundled in the image (Node + `@anthropic-ai/claude-code`).
 
 Select it as a model id — bare `claude-code` (CLI default model) or `claude-code/<model>` (the
-suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`):
+suffix becomes `--model`, e.g. `claude-code/sonnet`, `claude-code/opus`). An optional
+`:<thinking>` suffix forces an extended-thinking budget for that reviewer — `:max` (the high
+"ultrathink" tier) or `:<n>` for a specific token budget — so you can run the same model at two
+thinking depths as separate reviewers:
 
 ```yaml
-GADFLY_MODELS: "claude-code/sonnet,claude-code/opus"
+GADFLY_MODELS: "claude-code/sonnet,claude-code/opus,claude-code/opus:max"
 ```
 
+The thinking budget is passed as the `MAX_THINKING_TOKENS` env var on the CLI subprocess; it's
+best-effort (a no-op if the CLI build doesn't honor it), and an unrecognized suffix is ignored.
+
 Auth is read from the environment: the default is a **Pro/Max subscription** via
 `CLAUDE_CODE_OAUTH_TOKEN` (from `claude setup-token`; no `--bare`), falling back to
 `ANTHROPIC_API_KEY`. Don't set both. Tuning knobs (all optional):
diff --git a/cmd/gadfly/engine.go b/cmd/gadfly/engine.go
index 41aa7c8..4c6d8b0 100644
--- a/cmd/gadfly/engine.go
+++ b/cmd/gadfly/engine.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"strconv"
 	"strings"
 	"syscall"
 	"unicode/utf8"
@@ -56,8 +57,14 @@ type claudeCodeEngine struct {
 	permissionMode string   // --permission-mode (default "plan": read-only, no edits)
 	allowedTools   string   // --allowedTools value, passed verbatim ("" = omit)
 	extraArgs      []string // appended verbatim (GADFLY_CLAUDE_EXTRA_ARGS)
+	thinkingTokens int      // MAX_THINKING_TOKENS for the subprocess; 0 = leave default
 }
 
+// maxThinkingTokens is the extended-thinking budget a "claude-code/<model>:max" spec
+// resolves to — Claude Code's high "ultrathink" tier. It is exported to the subprocess
+// as MAX_THINKING_TOKENS; harmless (a no-op) if the CLI build doesn't honor it.
+const maxThinkingTokens = 31999
+
 // isClaudeCodeSpec reports whether a GADFLY_MODEL spec selects the claude-code
 // engine: the bare id "claude-code" or a "claude-code/<model>" form.
 func isClaudeCodeSpec(model string) bool {
@@ -66,17 +73,30 @@ func isClaudeCodeSpec(model string) bool {
 }
 
 // newClaudeCodeEngine builds the engine from the GADFLY_MODEL spec and the
-// optional GADFLY_CLAUDE_* overrides. The model after the slash in
-// "claude-code/<model>" becomes --model (e.g. "claude-code/sonnet" → "sonnet");
-// GADFLY_CLAUDE_MODEL overrides it. It does not verify the CLI is installed —
-// a missing binary surfaces as a normal pass error (advisory, never fatal).
+// optional GADFLY_CLAUDE_* overrides. The part after the slash in
+// "claude-code/<model>[:<thinking>]" becomes --model (e.g. "claude-code/sonnet"
+// → "sonnet"), with an optional thinking suffix: ":max" forces the high
+// extended-thinking budget and ":<n>" sets a specific MAX_THINKING_TOKENS.
+// GADFLY_CLAUDE_MODEL overrides the model id (the thinking suffix still applies).
+// It does not verify the CLI is installed — a missing binary surfaces as a normal
+// pass error (advisory, never fatal).
 func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine {
-	model := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL"))
-	if model == "" {
-		if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok {
-			model = strings.TrimSpace(after)
+	var model string
+	thinking := 0
+	if _, after, ok := strings.Cut(strings.TrimSpace(spec), "/"); ok {
+		after = strings.TrimSpace(after)
+		// Optional ":<thinking>" suffix, split at the first ":". Anthropic aliases/ids have
+		// no ":"; ids that do (e.g. Bedrock "...-v1:0") must come via GADFLY_CLAUDE_MODEL.
+		if m, t, hasColon := strings.Cut(after, ":"); hasColon {
+			model = strings.TrimSpace(m)
+			thinking = parseThinking(strings.TrimSpace(t))
+		} else {
+			model = after
 		}
 	}
+	if env := strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_MODEL")); env != "" {
+		model = env
+	}
 	return &claudeCodeEngine{
 		bin:            envOr("GADFLY_CLAUDE_BIN", "claude"),
 		model:          model,
@@ -84,9 +104,22 @@ func newClaudeCodeEngine(spec, repoDir string) *claudeCodeEngine {
 		permissionMode: envOr("GADFLY_CLAUDE_PERMISSION_MODE", "plan"),
 		allowedTools:   strings.TrimSpace(os.Getenv("GADFLY_CLAUDE_ALLOWED_TOOLS")),
 		extraArgs:      strings.Fields(os.Getenv("GADFLY_CLAUDE_EXTRA_ARGS")),
+		thinkingTokens: thinking,
 	}
 }
 
+// parseThinking maps a spec thinking suffix to a MAX_THINKING_TOKENS budget: "max"
+// (any case) → maxThinkingTokens, a positive integer → itself, anything else → 0 (off).
+func parseThinking(s string) int {
+	if strings.EqualFold(s, "max") {
+		return maxThinkingTokens
+	}
+	if n, err := strconv.Atoi(s); err == nil && n > 0 {
+		return n
+	}
+	return 0 // zero, negative, or non-numeric: runPass then leaves MAX_THINKING_TOKENS unset
+}
+
 // args assembles the `claude` argv for one pass. Factored out (and pure) so it
 // can be unit-tested without invoking the CLI. The system prompt is layered on
 // top of Claude Code's own via --append-system-prompt; the task is the -p
@@ -116,6 +149,10 @@ func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ i
 	cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...)
 	cmd.Dir = e.repoDir
 	cmd.Env = claudeEnv() // minimal env — don't hand GITEA_TOKEN et al. to the CLI
+	if e.thinkingTokens > 0 {
+		// Set the extended-thinking budget for this run (a "...:max" or "...:<n>" spec).
+		cmd.Env = append(cmd.Env, "MAX_THINKING_TOKENS="+strconv.Itoa(e.thinkingTokens))
+	}
 	// Put the CLI and the Node children it spawns in their own process group and
 	// kill the WHOLE group on context cancel, so a timed-out lens can't leave
 	// orphaned claude/node processes behind in the container.
diff --git a/cmd/gadfly/engine_test.go b/cmd/gadfly/engine_test.go
index 538093d..c4b974e 100644
--- a/cmd/gadfly/engine_test.go
+++ b/cmd/gadfly/engine_test.go
@@ -217,3 +217,38 @@ func TestRunPassNonZeroNoJSON(t *testing.T) {
 		t.Fatalf("non-zero exit should error with detail, got %v", err)
 	}
 }
+
+func TestClaudeCodeThinking(t *testing.T) {
+	t.Setenv("GADFLY_CLAUDE_MODEL", "") // empty = no override, so the spec alone picks the model
+	cases := []struct {
+		spec      string
+		wantModel string
+		wantThink int
+	}{
+		{"claude-code/opus", "opus", 0},
+		{"claude-code/opus:max", "opus", maxThinkingTokens},
+		{"claude-code/sonnet:20000", "sonnet", 20000},
+		{"claude-code/opus:bogus", "opus", 0}, // unrecognized suffix -> thinking off, model still parsed
+		{"claude-code", "", 0},
+	}
+	for _, c := range cases {
+		e := newClaudeCodeEngine(c.spec, "/repo")
+		if e.model != c.wantModel || e.thinkingTokens != c.wantThink {
+			t.Errorf("newClaudeCodeEngine(%q) = (model %q, think %d), want (%q, %d)",
+				c.spec, e.model, e.thinkingTokens, c.wantModel, c.wantThink)
+		}
+	}
+}
+
+func TestClaudeCodeThinkingEnvOverrideKeepsSuffix(t *testing.T) {
+	// GADFLY_CLAUDE_MODEL replaces only the model id parsed from the spec; the
+	// spec's ":max" thinking suffix is parsed separately and must still apply.
+	t.Setenv("GADFLY_CLAUDE_MODEL", "claude-opus-4-8")
+	e := newClaudeCodeEngine("claude-code/opus:max", "/repo")
+	if e.model != "claude-opus-4-8" {
+		t.Errorf("model = %q, want claude-opus-4-8 (env override)", e.model)
+	}
+	if e.thinkingTokens != maxThinkingTokens {
+		t.Errorf("thinkingTokens = %d, want %d (suffix still applies)", e.thinkingTokens, maxThinkingTokens)
+	}
+}