diff --git a/README.md b/README.md index bb07973..5d6bcfe 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,18 @@ Auth is read from the environment: the default is a **Pro/Max subscription** via | `GADFLY_CLAUDE_MODEL` | *(from the spec suffix)* | overrides the `--model` value | | `GADFLY_CLAUDE_PERMISSION_MODE` | `plan` | `--permission-mode` (read-only `plan` keeps it from editing) | | `GADFLY_CLAUDE_ALLOWED_TOOLS` | *(unset)* | `--allowedTools` value, passed verbatim (e.g. `Read,Grep,Glob`) | -| `GADFLY_CLAUDE_EXTRA_ARGS` | *(unset)* | extra CLI args appended verbatim (e.g. `--max-turns 30`) | +| `GADFLY_CLAUDE_EXTRA_ARGS` | *(unset)* | extra CLI args, **whitespace-split** (no shell quoting) and appended after the defaults (e.g. `--max-turns 30`) | | `GADFLY_CLAUDE_BIN` | `claude` | CLI binary path | +> These are **operator** knobs (workflow env), not PR-author input. Because +> `GADFLY_CLAUDE_EXTRA_ARGS` is appended *after* the defaults, it can override the +> read-only `--permission-mode plan` (e.g. passing `--permission-mode acceptEdits`), +> so keep it read-only unless you mean otherwise. It's whitespace-split, so values +> can't contain spaces — use `GADFLY_CLAUDE_ALLOWED_TOOLS` / `_PERMISSION_MODE` / +> `_MODEL` for those. The subprocess runs with a **minimal environment** (its auth +> token + `PATH`/`HOME`/locale/`GADFLY_CLAUDE_*`), not the runner's full env, so the +> Gitea token and provider keys aren't handed to the CLI. + > **Untested, like the cloud providers.** This wires the CLI in and is exercised by its unit > tests, but a live subscription-auth run hasn't been validated end-to-end here — and using > subscription auth in automated CI is a gray area in Anthropic's terms. `auto` specialist diff --git a/cmd/gadfly/engine.go b/cmd/gadfly/engine.go index 160a9d7..41aa7c8 100644 --- a/cmd/gadfly/engine.go +++ b/cmd/gadfly/engine.go @@ -8,6 +8,8 @@ import ( "os" "os/exec" "strings" + "syscall" + "unicode/utf8" llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" ) @@ -113,47 +115,107 @@ type claudeResult struct { func (e *claudeCodeEngine) runPass(ctx context.Context, system, task string, _ int) (string, error) { cmd := exec.CommandContext(ctx, e.bin, e.args(system, task)...) cmd.Dir = e.repoDir - cmd.Env = os.Environ() // inherits CLAUDE_CODE_OAUTH_TOKEN / ANTHROPIC_API_KEY + cmd.Env = claudeEnv() // minimal env — don't hand GITEA_TOKEN et al. to the CLI + // Put the CLI and the Node children it spawns in their own process group and + // kill the WHOLE group on context cancel, so a timed-out lens can't leave + // orphaned claude/node processes behind in the container. + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + cmd.Cancel = func() error { + if cmd.Process != nil { + _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + } + return nil + } var stdout, stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr runErr := cmd.Run() - // Prefer a structured answer; the CLI prints JSON even on some failures. + // A cancelled/timed-out run must surface as an error, never as whatever + // partial bytes the CLI flushed before it was killed. + if ctx.Err() != nil { + return "", fmt.Errorf("claude -p %v", ctx.Err()) + } + var res claudeResult - if jerr := json.Unmarshal(bytes.TrimSpace(stdout.Bytes()), &res); jerr == nil { + parsed := json.Unmarshal(bytes.TrimSpace(stdout.Bytes()), &res) == nil + + // Clean exit: trust the parsed JSON answer, and ONLY it — never fall back to + // the raw JSON envelope when the result is empty. + if runErr == nil && parsed { if res.IsError { return "", fmt.Errorf("claude reported error (%s): %s", res.Subtype, truncateForErr(res.Result)) } if out := strings.TrimSpace(res.Result); out != "" { return out, nil } + return "", fmt.Errorf("claude -p returned an empty result") } if runErr != nil { + // Prefer the CLI's own structured error message when it gave one. + if parsed && res.IsError && strings.TrimSpace(res.Result) != "" { + return "", fmt.Errorf("claude reported error (%s): %s", res.Subtype, truncateForErr(res.Result)) + } detail := truncateForErr(stderr.String()) if detail == "" { detail = truncateForErr(stdout.String()) } - return "", fmt.Errorf("claude -p failed: %v: %s", runErr, detail) + if detail != "" { + return "", fmt.Errorf("claude -p failed: %v: %s", runErr, detail) + } + return "", fmt.Errorf("claude -p failed: %v", runErr) } - // Ran cleanly but we couldn't pull a result out of the JSON: fall back to - // raw stdout so a format change degrades to "use the text" instead of empty. + + // Clean exit but stdout wasn't the expected JSON envelope: degrade to the raw + // text so a CLI format change still yields a review instead of nothing. if raw := strings.TrimSpace(stdout.String()); raw != "" { return raw, nil } return "", fmt.Errorf("claude -p produced no parseable output") } -// truncateForErr caps CLI error detail so a stderr dump can't bloat the comment. +// claudeEnv builds a minimal environment for the `claude` subprocess: only what +// the CLI needs (PATH/HOME, its auth tokens, locale, Node/XDG/GADFLY_CLAUDE_* +// knobs), deliberately dropping the rest of the runner's secrets — GITEA_TOKEN, +// GADFLY_FINDINGS_TOKEN, provider keys — so they never reach the third-party +// CLI. Defense in depth: the parent already holds them, but the CLI has no need. +func claudeEnv() []string { + keep := func(k string) bool { + switch k { + case "PATH", "HOME", "USER", "LOGNAME", "TMPDIR", "LANG", "TERM", "SHELL": + return true + } + return strings.HasPrefix(k, "LC_") || + strings.HasPrefix(k, "CLAUDE_") || + strings.HasPrefix(k, "ANTHROPIC_") || + strings.HasPrefix(k, "GADFLY_CLAUDE_") || + strings.HasPrefix(k, "NODE_") || + strings.HasPrefix(k, "XDG_") + } + var env []string + for _, kv := range os.Environ() { + if k, _, ok := strings.Cut(kv, "="); ok && keep(k) { + env = append(env, kv) + } + } + return env +} + +// truncateForErr caps CLI error detail so a stderr dump can't bloat the comment, +// cutting on a rune boundary so it never emits invalid UTF-8. func truncateForErr(s string) string { s = strings.TrimSpace(s) const max = 800 - if len(s) > max { - return s[:max] + "…" + if len(s) <= max { + return s } - return s + cut := max + for cut > 0 && !utf8.RuneStart(s[cut]) { + cut-- + } + return s[:cut] + "…" } // envOr returns the env var value or a default when unset/blank. diff --git a/cmd/gadfly/engine_test.go b/cmd/gadfly/engine_test.go index 69dce18..538093d 100644 --- a/cmd/gadfly/engine_test.go +++ b/cmd/gadfly/engine_test.go @@ -1,9 +1,12 @@ package main import ( + "context" + "os" "slices" "strings" "testing" + "unicode/utf8" ) func TestIsClaudeCodeSpec(t *testing.T) { @@ -113,3 +116,104 @@ func TestClaudeCodeArgsBareModelOmitsFlag(t *testing.T) { t.Errorf("--allowedTools should be omitted when blank: %v", args) } } + +func TestClaudeEnvFilters(t *testing.T) { + t.Setenv("GITEA_TOKEN", "secret-gitea") + t.Setenv("OLLAMA_API_KEY", "secret-ollama") + t.Setenv("GADFLY_API_KEY", "secret-gadfly") + t.Setenv("GADFLY_FINDINGS_TOKEN", "secret-findings") + t.Setenv("CLAUDE_CODE_OAUTH_TOKEN", "keep-claude") + t.Setenv("ANTHROPIC_API_KEY", "keep-anthropic") + t.Setenv("GADFLY_CLAUDE_MODEL", "keep-knob") + + env := claudeEnv() + has := func(k string) bool { + for _, kv := range env { + if strings.HasPrefix(kv, k+"=") { + return true + } + } + return false + } + // kept: the CLI's auth + its own knobs + PATH + for _, k := range []string{"CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY", "GADFLY_CLAUDE_MODEL", "PATH"} { + if !has(k) { + t.Errorf("claudeEnv dropped %s, but it should be kept", k) + } + } + // dropped: the runner's secrets the CLI doesn't need + for _, k := range []string{"GITEA_TOKEN", "OLLAMA_API_KEY", "GADFLY_API_KEY", "GADFLY_FINDINGS_TOKEN"} { + if has(k) { + t.Errorf("claudeEnv leaked %s into the subprocess env", k) + } + } +} + +func TestTruncateForErrRuneSafe(t *testing.T) { + // 900 multibyte runes (3 bytes each) -> well over the 800-byte cap; the cut + // must land on a rune boundary so the result stays valid UTF-8. + s := strings.Repeat("€", 900) + got := truncateForErr(s) + if !utf8.ValidString(got) { + t.Fatalf("truncateForErr produced invalid UTF-8") + } + if !strings.HasSuffix(got, "…") { + t.Fatalf("truncateForErr should append an ellipsis when truncating") + } + // short strings pass through untouched + if truncateForErr(" hi ") != "hi" { + t.Fatalf("truncateForErr should trim and pass short strings through") + } +} + +// stubClaude writes an executable shell stub that prints body and exits code, +// and returns an engine pointed at it. +func stubClaude(t *testing.T, body string, code int) *claudeCodeEngine { + t.Helper() + dir := t.TempDir() + path := dir + "/claude-stub.sh" + script := "#!/bin/sh\nprintf '%s' " + shSingleQuote(body) + "\nexit " + itoa(code) + "\n" + if err := os.WriteFile(path, []byte(script), 0o755); err != nil { + t.Fatal(err) + } + return &claudeCodeEngine{bin: path, repoDir: dir} +} + +func shSingleQuote(s string) string { return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" } +func itoa(i int) string { return string(rune('0' + i)) } // single-digit exit codes only + +func TestRunPassCleanResult(t *testing.T) { + e := stubClaude(t, `{"result":"REVIEW TEXT","is_error":false}`, 0) + out, err := e.runPass(context.Background(), "sys", "task", 0) + if err != nil || out != "REVIEW TEXT" { + t.Fatalf("clean result: got (%q, %v), want (REVIEW TEXT, nil)", out, err) + } +} + +func TestRunPassEmptyResultIsError(t *testing.T) { + // JSON parses, exit 0, but result empty: must NOT return the raw JSON blob. + e := stubClaude(t, `{"result":"","is_error":false}`, 0) + out, err := e.runPass(context.Background(), "sys", "task", 0) + if err == nil { + t.Fatalf("empty result should be an error, got out=%q", out) + } + if strings.Contains(out, "{") { + t.Fatalf("empty result must not leak raw JSON, got %q", out) + } +} + +func TestRunPassIsErrorFlag(t *testing.T) { + e := stubClaude(t, `{"result":"boom","is_error":true,"subtype":"error_max_turns"}`, 0) + _, err := e.runPass(context.Background(), "sys", "task", 0) + if err == nil || !strings.Contains(err.Error(), "claude reported error") { + t.Fatalf("is_error should surface as an error, got %v", err) + } +} + +func TestRunPassNonZeroNoJSON(t *testing.T) { + e := stubClaude(t, "fatal: auth failed", 1) + _, err := e.runPass(context.Background(), "sys", "task", 0) + if err == nil || !strings.Contains(err.Error(), "claude -p failed") { + t.Fatalf("non-zero exit should error with detail, got %v", err) + } +} diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index 6f29944..20f3564 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -431,7 +431,7 @@ func buildTask(diff string) string { truncNote := "" if maxDiff > 0 && len(diff) > maxDiff { diff = diff[:maxDiff] - truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; call get_diff for the full text.]", maxDiff) + truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; read the changed files (or call get_diff, if available) for the full text.]", maxDiff) } var b strings.Builder @@ -441,7 +441,7 @@ func buildTask(diff string) string { if strings.TrimSpace(body) != "" { fmt.Fprintf(&b, "PR description:\n%s\n\n", body) } - b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your tools (read_file, grep, find_files) to verify it against the actual checked-out code — do not rely on the diff alone.\n\n") + b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your repository read tools to verify it against the actual checked-out code — do not rely on the diff alone.\n\n") fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote) return b.String() } diff --git a/cmd/gadfly/recheck.go b/cmd/gadfly/recheck.go index 1043914..0255744 100644 --- a/cmd/gadfly/recheck.go +++ b/cmd/gadfly/recheck.go @@ -16,15 +16,14 @@ const defaultRecheckMaxSteps = 16 // against the real code before letting it survive — the antidote to a // single-pass reviewer that reads a couple of files, mis-connects them, and // posts a confident but wrong "blocking" verdict. -const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review of the -"mort" project (a large Go Discord bot). You are given a DRAFT review produced -by another model. Your job is NOT to write a new review — it is to confirm or -reject each finding in the draft against the ACTUAL code, then output the -corrected review. +const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review. You are +given a DRAFT review produced by another model. Your job is NOT to write a new +review — it is to confirm or reject each finding in the draft against the ACTUAL +code, then output the corrected review. -You have the same read-only repository tools as the original reviewer: -- read_file(path[, start_line, limit]), list_dir([path]), grep(pattern[, path, - max_results]), find_files(name[, max_results]), get_diff(). +You have read-only access to the checked-out repository — use your tools to read +files and search the code to independently verify each finding against the real +source. For EVERY finding in the draft: 1. Independently reproduce the reasoning by reading the actual files with your @@ -84,7 +83,7 @@ func buildRecheckTask(draft, diff string) string { truncNote := "" if maxDiff > 0 && len(diff) > maxDiff { diff = diff[:maxDiff] - truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff for the full text.]", maxDiff) + truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; read the changed files (or call get_diff, if available) for the full text.]", maxDiff) } var b strings.Builder