commit c0d0152a34c95bbafd46773a2a1f546215d93f18 Author: Steve Dudenhoeffer Date: Thu Jun 25 18:42:20 2026 -0400 Gadfly: agentic adversarial PR reviewer (initial extraction) Standalone, Docker-packaged extraction of the agentic PR reviewer that runs in Gitea Actions: reads the checked-out repo with read-only tools (read_file/grep/ find_files/get_diff), verifies findings before reporting, two-pass review + adversarial recheck, posts one labeled comment per model. Advisory only. - cmd/gadfly: reviewer binary (majordomo + Ollama Cloud), zero deps beyond stdlib + majordomo - entrypoint.sh: container brains — trigger gating, PR clone, model loop (logic out of YAML) - Dockerfile: multi-stage; build-time module token never reaches the final image - .gitea/workflows/build-image.yml: tag v* → build & push image - examples/: ~15-line consumer stub - system prompt genericized + hardened to re-derive constants/formulas (semantic bugs) Vibe-coded with Claude Code; see README disclosure. Advisory, never blocks merge. Co-Authored-By: Claude Opus 4.8 (1M context) diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..987bf84 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +.git +*.orig.* +*.orig +README.md +examples +testdata diff --git a/.gitea/workflows/build-image.yml b/.gitea/workflows/build-image.yml new file mode 100644 index 0000000..1925442 --- /dev/null +++ b/.gitea/workflows/build-image.yml @@ -0,0 +1,47 @@ +name: Build & push image + +# Builds the Gadfly reviewer container and pushes it to the Gitea container +# registry. Tag a release (v1, v1.2.0, …) to publish that version + :latest. +# +# Required repo secrets: +# REGISTRY_USER / REGISTRY_PASSWORD Gitea creds with registry push + read +# access to the private majordomo module. + +on: + push: + tags: ["v*"] + workflow_dispatch: {} + +env: + IMAGE: gitea.stevedudenhoeffer.com/steve/gadfly + +jobs: + image: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Log in to the registry + run: | + echo "${{ secrets.REGISTRY_PASSWORD }}" \ + | docker login gitea.stevedudenhoeffer.com -u "${{ secrets.REGISTRY_USER }}" --password-stdin + + - name: Resolve tags + id: tags + run: | + if [ "${{ github.ref_type }}" = "tag" ]; then + echo "version=${{ github.ref_name }}" >> "$GITHUB_OUTPUT" + else + echo "version=dev-$(echo ${{ github.sha }} | cut -c1-8)" >> "$GITHUB_OUTPUT" + fi + + - name: Build & push + run: | + docker build \ + --build-arg GIT_USER="${{ secrets.REGISTRY_USER }}" \ + --build-arg GIT_TOKEN="${{ secrets.REGISTRY_PASSWORD }}" \ + -t "${IMAGE}:${{ steps.tags.outputs.version }}" \ + -t "${IMAGE}:latest" \ + . + docker push "${IMAGE}:${{ steps.tags.outputs.version }}" + docker push "${IMAGE}:latest" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e71ad75 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/gadfly +/out +*.orig diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fd0834c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +# syntax=docker/dockerfile:1 +# +# Multi-stage so the private-module access token used to fetch the majordomo +# dependency lives ONLY in the build stage and never lands in the final image. + +FROM golang:1.26 AS build +ARG GIT_HOST=gitea.stevedudenhoeffer.com +ARG GIT_USER= +ARG GIT_TOKEN= +ENV CGO_ENABLED=0 \ + GOFLAGS=-mod=mod \ + GOSUMDB=off +ENV GOPRIVATE=${GIT_HOST}/* GONOSUMDB=${GIT_HOST}/* +WORKDIR /src +# Private Go module access (majordomo). Token is confined to this stage. +RUN if [ -n "$GIT_TOKEN" ]; then \ + git config --global url."https://${GIT_USER}:${GIT_TOKEN}@${GIT_HOST}/".insteadOf "https://${GIT_HOST}/"; \ + fi +COPY go.mod go.sum ./ +RUN go mod download +COPY . . +RUN go build -trimpath -ldflags="-s -w" -o /out/gadfly ./cmd/gadfly + +FROM alpine:3.20 +RUN apk add --no-cache bash git curl jq ca-certificates +COPY --from=build /out/gadfly /usr/local/bin/gadfly +COPY scripts /app/scripts +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh /app/scripts/run.sh /usr/local/bin/gadfly +ENTRYPOINT ["/entrypoint.sh"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6ab7069 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Steve Dudenhoeffer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..15153ed --- /dev/null +++ b/README.md @@ -0,0 +1,101 @@ +# 🪰 Gadfly + +**An AI gadfly for your pull requests.** Gadfly is an *adversarial* code reviewer that +runs in Gitea Actions: on every PR it reads your actual repository, hunts for real +problems, verifies them against the code, and posts its findings as a comment. It does not +praise your code. A gadfly does not let things slide. + +> ### 🤖 Heads up: this is a vibe-coded project +> Gadfly was built almost entirely by an AI agent (Claude Code), prompts and all — the +> reviewer's "brain" is a language model, and so was most of the author. It works and it's +> tested, but treat it accordingly: **it is advisory only, it never blocks a merge, and you +> should still review its reviews.** Issues and PRs welcome; expect the occasional +> AI-flavored rough edge. + +## What makes it different + +Most LLM "review my diff" bots read the diff in isolation and hallucinate problems they +can't actually see — a "missing import" that's three lines above the hunk, a "broken +caller" in a file they never opened. Gadfly is **agentic**: the model has read-only tools +over the checked-out repo and is *required* to use them before reporting anything. + +- **Tools:** `read_file`, `list_dir`, `grep`, `find_files`, `get_diff`. +- **Verify-before-claiming discipline:** baked into the system prompt — open the file, + grep the symbol, or drop the finding. +- **Two passes:** a *review* pass drafts findings, then an adversarial *recheck* pass + independently re-verifies each one against the code and drops the ones it can't confirm, + recomputing the verdict. This is what kills "confident but wrong." +- **Semantic-bug hunting:** it's told not to trust a plausible-looking constant, conversion + factor, or formula — re-derive the expected value, because that's where real bugs hide. + +Every review leads with a one-line verdict: **No material issues found**, **Minor issues**, +or **Blocking issues found**. + +## Turn it on for a repo + +Gadfly ships as a container image, so consuming repos don't build anything — they just run +it. Drop one file in your repo and set a couple of secrets/vars: + +1. Copy [`examples/adversarial-review.yml`](examples/adversarial-review.yml) to + `.gitea/workflows/adversarial-review.yml` in your repo. +2. Add repo config: + - **secret** `OLLAMA_CLOUD_API_KEY` — your [Ollama Cloud](https://ollama.com) key (empty + ⇒ Gadfly posts a harmless "not configured" notice instead of reviewing). + - **var** `OLLAMA_REVIEW_MODELS` *(optional)* — comma-separated model ids + (default `qwen3-coder:480b-cloud,gpt-oss:120b-cloud`). One comment per model. + - **var** `GADFLY_ALLOWED_USERS` *(optional)* — who may re-trigger via comment; empty ⇒ + any repo collaborator. + +`GITEA_TOKEN` is provided automatically by Actions; comments post as the `gitea-actions` +user, scoped to that repo — no bot account needed. + +### Triggers + +1. A **new/reopened/ready** non-draft PR — automatic. +2. Commenting **`@gadfly review`** on a PR — re-review on demand (gated to allowed users). +3. **workflow_dispatch** — manual, with a `pr_number` input. + +(Pushing new commits does *not* auto-re-review — comment `@gadfly review` after pushing +fixes. This keeps usage down.) + +## How it's packaged + +``` +cmd/gadfly/ the agentic reviewer binary (majordomo + Ollama Cloud); zero deps beyond stdlib + majordomo +scripts/run.sh fetches the PR diff, runs the reviewer, upserts one labeled comment +scripts/system-prompt.txt the reviewer persona + verification discipline +entrypoint.sh the container brains: trigger gating, clone, model loop (logic lives here, not in YAML) +Dockerfile multi-stage; the build-time module token never reaches the final image +.gitea/workflows/build-image.yml tags v* → build & push the image +examples/ the ~15-line stub a consuming repo drops in +``` + +The image is published to `gitea.stevedudenhoeffer.com/steve/gadfly`. Push a `v*` tag to +build and publish a new version (and `:latest`). + +## Configuration (advanced) + +The reviewer binary reads these (the stub/entrypoint set sane defaults): + +| Env | Default | Meaning | +|-----|---------|---------| +| `OLLAMA_API_KEY` | — | Ollama Cloud bearer key (required for real reviews) | +| `GADFLY_MODEL` | — | model id | +| `GADFLY_MAX_STEPS` | 24 | review-pass tool-step cap | +| `GADFLY_RECHECK` | on | set `0`/`false` to skip the recheck pass | +| `GADFLY_RECHECK_MAX_STEPS` | 16 | recheck-pass step cap | +| `GADFLY_TIMEOUT_SECS` | 300 | overall deadline (both passes) | +| `GADFLY_MAX_DIFF_CHARS` | 60000 | diff chars embedded in the prompt (full diff via `get_diff`) | +| `GADFLY_TRIGGER_PHRASE` | `@gadfly review` | comment phrase that re-triggers | +| `GADFLY_ALLOWED_USERS` | *(collaborators)* | comma-separated allow-list for comment triggers | + +## Building locally + +```sh +go build ./cmd/gadfly # needs read access to the private majordomo module +go test ./... +``` + +## License + +MIT — see [LICENSE](LICENSE). diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go new file mode 100644 index 0000000..bec65fe --- /dev/null +++ b/cmd/gadfly/main.go @@ -0,0 +1,287 @@ +// Command gadfly is the agentic backend for the PR adversarial-review +// workflow (.gitea/workflows/pr-adversarial-review.yml). Unlike the old +// one-shot chat call, it runs a tool-using agent (majordomo + Ollama Cloud) +// over the PR's CHECKED-OUT repository: the model can read_file / list_dir / +// grep / find_files / get_diff to VERIFY a finding before reporting it, which +// kills the "diff-only" false positives (claiming a missing import or a +// non-existent method it simply couldn't see). +// +// It is a pure producer of review text: it reads the diff + the repo and +// prints the review markdown to stdout. All Gitea I/O (fetching the diff, +// upserting the comment) stays in run.sh, so this binary needs no repo write +// access and is straightforward to unit-test. +// +// Two passes (unless the draft is a clean "no material issues" pass): a +// REVIEW pass produces a draft, then an adversarial RECHECK pass independently +// re-verifies every finding against the actual files with the same tools and +// drops the ones it cannot confirm, recomputing the verdict. This catches the +// "confident but wrong" findings that survive a single pass — e.g. claiming an +// env var is unset when a wrapper script sets it (see recheck.go). +// +// Inputs (env): +// +// OLLAMA_API_KEY Ollama Cloud bearer key (required). +// GADFLY_MODEL model id, e.g. "qwen3-coder:480b-cloud" (required). +// GADFLY_REPO_DIR path to the checked-out repo (required; the FS sandbox root). +// GADFLY_DIFF_FILE path to a file holding the full unified diff (required). +// GADFLY_SYSTEM_FILE path to the reviewer system prompt (required). +// GADFLY_TITLE PR title (optional). +// GADFLY_BODY PR description (optional). +// GADFLY_MAX_STEPS review-pass step cap (optional, default 24). +// GADFLY_WRAPUP_RESERVE steps before the cap at which the agent is told to +// stop investigating and write its answer (optional, +// default 4). Plus a tool-free finalization fallback +// guarantees a step-exhausted pass still emits output. +// GADFLY_RECHECK set to 0/false to skip the recheck pass (optional, default on). +// GADFLY_RECHECK_MAX_STEPS recheck-pass step cap (optional, default 16). +// GADFLY_TIMEOUT_SECS overall deadline in seconds, shared by both passes (optional, default 300). +// GADFLY_MAX_DIFF_CHARS diff chars embedded in the prompt (optional, default 60000; +// the full diff is always available via the get_diff tool). +// +// On success it prints the review to stdout and exits 0. On a usage/config or +// model error it prints a diagnostic to stderr and exits non-zero; run.sh then +// posts a "reviewer failed" notice (advisory — never fails the CI job). +package main + +import ( + "context" + "errors" + "fmt" + "os" + "strconv" + "strings" + "time" + + "gitea.stevedudenhoeffer.com/steve/majordomo/agent" + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama" +) + +const ( + defaultMaxSteps = 24 + defaultTimeoutSecs = 300 + defaultMaxDiffChars = 60000 + // defaultWrapUpReserve is how many steps before the cap the agent is told + // to stop investigating and write its final answer. Reserving a margin is + // what keeps a thorough reviewer from spending its whole budget on tool + // calls and then hard-failing with "max steps reached without a final + // answer" — it always has a few steps left to wrap up. + defaultWrapUpReserve = 4 +) + +// wrapUpInstruction is steered into a running agent once it comes within the +// wrap-up reserve of its step cap: a forceful nudge to stop calling tools and +// emit the final answer using only what it has already gathered. +const wrapUpInstruction = "⚠️ You are almost out of your investigation budget — only a few tool steps remain. " + + "STOP calling tools now and write your FINAL answer immediately, using only what you have already verified. " + + "Do not begin any new investigation. If a finding could not be confirmed, drop it or mark it explicitly as unverified. " + + "Output the review in the required format right now." + +// finalizeInstruction is the user message sent on the tool-free fallback pass +// when the agent exhausted its budget (or tripped a loop guard) without ever +// producing a final answer. It forces the model to synthesize whatever it has. +const finalizeInstruction = "You have run out of investigation steps. Do NOT call any tools. " + + "Based solely on what you have already gathered above, write your final answer now in the required format. " + + "If you could not confirm some findings, omit them or mark them as unverified, but produce the answer." + +func main() { + if err := run(); err != nil { + fmt.Fprintln(os.Stderr, "gadfly:", err) + os.Exit(1) + } +} + +func run() error { + apiKey := os.Getenv("OLLAMA_API_KEY") + if apiKey == "" { + return errors.New("OLLAMA_API_KEY is required") + } + model := os.Getenv("GADFLY_MODEL") + repoDir := os.Getenv("GADFLY_REPO_DIR") + diffFile := os.Getenv("GADFLY_DIFF_FILE") + systemFile := os.Getenv("GADFLY_SYSTEM_FILE") + if model == "" || repoDir == "" || diffFile == "" || systemFile == "" { + return errors.New("GADFLY_MODEL, GADFLY_REPO_DIR, GADFLY_DIFF_FILE and GADFLY_SYSTEM_FILE are all required") + } + + diffBytes, err := os.ReadFile(diffFile) + if err != nil { + return fmt.Errorf("read diff file: %w", err) + } + diff := string(diffBytes) + if strings.TrimSpace(diff) == "" { + return errors.New("empty diff; nothing to review") + } + + systemBytes, err := os.ReadFile(systemFile) + if err != nil { + return fmt.Errorf("read system prompt: %w", err) + } + + fsTools, err := newRepoFS(repoDir, diff) + if err != nil { + return err + } + + mdl, err := ollama.Cloud(ollama.WithToken(apiKey)).Model(model) + if err != nil { + return fmt.Errorf("build model %q: %w", model, err) + } + + timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + // Pass 1 — review: produce the draft. + draft, err := runAgent(ctx, mdl, fsTools, string(systemBytes), buildTask(diff), + envInt("GADFLY_MAX_STEPS", defaultMaxSteps)) + if err != nil { + return fmt.Errorf("review pass: %w", err) + } + + // Pass 2 — recheck: adversarially re-verify the draft's findings and drop + // the unconfirmed ones. Skipped for a clean draft (nothing to verify) or + // when disabled. A recheck failure is non-fatal — we emit the unverified + // draft rather than losing the review entirely. + final := draft + if shouldRecheck(draft) { + rechecked, rerr := runAgent(ctx, mdl, fsTools, recheckSystemPrompt, buildRecheckTask(draft, diff), + envInt("GADFLY_RECHECK_MAX_STEPS", defaultRecheckMaxSteps)) + if rerr != nil { + fmt.Fprintln(os.Stderr, "gadfly: recheck pass failed; emitting unverified draft:", rerr) + } else { + final = rechecked + } + } + + fmt.Println(final) + return nil +} + +// runAgent runs one agent pass (its own fresh toolbox over the sandbox) and +// returns the final answer. An empty answer is an error — the caller decides +// whether that is fatal (review pass) or recoverable (recheck pass). A +// non-empty answer that ended on a budget/guard error is still returned: the +// model wrote its output, then ran out of steps. +// +// Two mechanisms keep a step-hungry model from hard-failing with no output: +// 1. A wrap-up steer: once the run comes within wrapUpReserve steps of the +// cap, a forceful "stop calling tools, write your final answer" message is +// injected so the model spends its remaining steps finalizing. +// 2. A finalization fallback: if the loop still ends empty (the model ignored +// the nudge, or a loop guard tripped), one tool-free model call forces a +// final answer out of the transcript already gathered. +func runAgent(ctx context.Context, mdl llm.Model, fsTools *repoFS, system, task string, maxSteps int) (string, error) { + box, err := fsTools.toolbox() + if err != nil { + return "", err + } + loop := agent.New(mdl, system, + agent.WithToolbox(box), + agent.WithMaxSteps(maxSteps), + // Guard rails: stop the model from spinning on failing or identical + // tool calls instead of writing its answer. + agent.WithToolErrorLimits(4, 4), + ) + + wrapUpAt := maxSteps - wrapUpReserve() + if wrapUpAt < 1 { + wrapUpAt = 1 + } + var completed int // steps finished so far (updated after each step) + nudged := false + + res, runErr := loop.Run(ctx, task, + agent.OnStep(func(s agent.Step) { completed = s.Index + 1 }), + agent.WithSteer(func() []llm.Message { + if !nudged && completed >= wrapUpAt { + nudged = true + return []llm.Message{llm.UserText(wrapUpInstruction)} + } + return nil + }), + ) + + out := "" + if res != nil { + out = strings.TrimSpace(res.Output) + } + if out != "" { + return out, nil + } + + // No final answer. If we still have budget on the clock and a transcript to + // work from, force a tool-free finalization rather than losing the pass. + if res != nil && len(res.Messages) > 0 && ctx.Err() == nil { + if forced := forceFinalAnswer(ctx, mdl, system, res.Messages); forced != "" { + return forced, nil + } + } + + if runErr != nil { + return "", runErr + } + return "", errors.New("agent produced no output") +} + +// forceFinalAnswer makes one tool-free model call to squeeze a final answer out +// of an agent that exhausted its step budget without producing one. Tools are +// forbidden (ToolChoice "none") so the model must synthesize from the transcript +// instead of investigating further. Best-effort: any error or empty reply +// returns "" and the caller falls back to its normal empty-output handling. +func forceFinalAnswer(ctx context.Context, mdl llm.Model, system string, transcript []llm.Message) string { + msgs := append(append([]llm.Message(nil), transcript...), llm.UserText(finalizeInstruction)) + resp, err := mdl.Generate(ctx, llm.Request{ + System: system, + Messages: msgs, + ToolChoice: "none", + }) + if err != nil || resp == nil { + return "" + } + return strings.TrimSpace(resp.Text()) +} + +// wrapUpReserve is how many steps before the cap the wrap-up nudge fires, +// overridable via GADFLY_WRAPUP_RESERVE. +func wrapUpReserve() int { + return envInt("GADFLY_WRAPUP_RESERVE", defaultWrapUpReserve) +} + +// buildTask assembles the user message: PR metadata plus the unified diff, +// truncated for the prompt (the full diff stays available via get_diff). +func buildTask(diff string) string { + title := os.Getenv("GADFLY_TITLE") + body := os.Getenv("GADFLY_BODY") + + maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars) + truncNote := "" + if maxDiff > 0 && len(diff) > maxDiff { + diff = diff[:maxDiff] + truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars in this message; call get_diff for the full text.]", maxDiff) + } + + var b strings.Builder + if title != "" { + fmt.Fprintf(&b, "PR title: %s\n\n", title) + } + if strings.TrimSpace(body) != "" { + fmt.Fprintf(&b, "PR description:\n%s\n\n", body) + } + b.WriteString("Review the following unified diff. Before reporting any cross-file or compile-correctness issue, use your tools (read_file, grep, find_files) to verify it against the actual checked-out code — do not rely on the diff alone.\n\n") + fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote) + return b.String() +} + +// envInt reads an integer env var, falling back to def when unset or unparseable. +func envInt(name string, def int) int { + v := strings.TrimSpace(os.Getenv(name)) + if v == "" { + return def + } + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + return def + } + return n +} diff --git a/cmd/gadfly/recheck.go b/cmd/gadfly/recheck.go new file mode 100644 index 0000000..1043914 --- /dev/null +++ b/cmd/gadfly/recheck.go @@ -0,0 +1,97 @@ +package main + +import ( + "fmt" + "os" + "strings" +) + +// defaultRecheckMaxSteps bounds the verification pass. It is smaller than the +// review pass: re-checking a handful of existing findings needs fewer steps +// than discovering them. +const defaultRecheckMaxSteps = 16 + +// recheckSystemPrompt drives the second, adversarial verification pass. The +// model is given a DRAFT review and must independently confirm each finding +// against the real code before letting it survive — the antidote to a +// single-pass reviewer that reads a couple of files, mis-connects them, and +// posts a confident but wrong "blocking" verdict. +const recheckSystemPrompt = `You are a VERIFICATION GATE for an automated adversarial code review of the +"mort" project (a large Go Discord bot). You are given a DRAFT review produced +by another model. Your job is NOT to write a new review — it is to confirm or +reject each finding in the draft against the ACTUAL code, then output the +corrected review. + +You have the same read-only repository tools as the original reviewer: +- read_file(path[, start_line, limit]), list_dir([path]), grep(pattern[, path, + max_results]), find_files(name[, max_results]), get_diff(). + +For EVERY finding in the draft: +1. Independently reproduce the reasoning by reading the actual files with your + tools — do not trust the draft's claim, and do not trust the diff hunk alone. +2. KEEP the finding only if you can positively confirm it against the code. +3. DROP the finding if you cannot confirm it, or if the code contradicts it. + +Watch especially for findings that ignore the "glue" around a change — the most +common false positive. Before keeping a claim that something is "missing", +"undefined", "never set", "not exported", or "won't compile", GREP THE WHOLE +REPO for it: the thing is very often satisfied in a place the original reviewer +didn't look — a shell script or Makefile that sets an env var, a CI YAML, an +adjacent file, generated code, or a wrapper that maps one name to another. A +finding that an env var X is unset is wrong if any script invokes the program +with "X=... prog". Check before you keep. + +Output rules: +- Output the corrected review in the SAME format as the draft: a one-line + VERDICT ("No material issues found", "Minor issues", or "Blocking issues + found"), then the surviving findings as bullets with path:line and impact. +- Recompute the VERDICT from what SURVIVES. If every finding was dropped, the + verdict is "No material issues found". +- Do NOT invent new findings; this is a verification gate, not a fresh review. +- Do NOT include meta-commentary about the verification process or which + findings you dropped — output only the final, corrected review markdown. +- When done investigating, STOP calling tools and reply with the review.` + +// recheckEnabled reports whether the verification pass should run. On unless +// GADFLY_RECHECK is explicitly a falsey value. +func recheckEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv("GADFLY_RECHECK"))) { + case "0", "false", "no", "off": + return false + default: + return true + } +} + +// shouldRecheck decides whether to run the verification pass for a given draft. +// A clean "no material issues" draft has nothing to verify, so it is skipped +// even when rechecking is enabled — saving a whole model pass on clean PRs. +func shouldRecheck(draft string) bool { + if !recheckEnabled() { + return false + } + if strings.Contains(strings.ToLower(draft), "no material issues") { + return false + } + return true +} + +// buildRecheckTask is the verification pass's user message: the draft review to +// scrutinize, with the full diff available via get_diff (and embedded here, +// truncated, to save a tool call). +func buildRecheckTask(draft, diff string) string { + maxDiff := envInt("GADFLY_MAX_DIFF_CHARS", defaultMaxDiffChars) + truncNote := "" + if maxDiff > 0 && len(diff) > maxDiff { + diff = diff[:maxDiff] + truncNote = fmt.Sprintf("\n\n[NOTE: diff truncated to %d chars here; call get_diff for the full text.]", maxDiff) + } + + var b strings.Builder + b.WriteString("Verify the following DRAFT review against the actual code, drop every finding you cannot confirm, and output the corrected review.\n\n") + b.WriteString("## Draft review\n\n") + b.WriteString(draft) + b.WriteString("\n\n## PR diff under review\n\n") + fmt.Fprintf(&b, "```diff\n%s\n```%s", diff, truncNote) + return b.String() +} diff --git a/cmd/gadfly/recheck_test.go b/cmd/gadfly/recheck_test.go new file mode 100644 index 0000000..877c393 --- /dev/null +++ b/cmd/gadfly/recheck_test.go @@ -0,0 +1,101 @@ +package main + +import ( + "context" + "strings" + "testing" + + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake" +) + +func TestShouldRecheck(t *testing.T) { + t.Setenv("GADFLY_RECHECK", "") // default on + + if shouldRecheck("VERDICT: Blocking issues found\n- something is wrong") != true { + t.Error("a draft with findings should be rechecked") + } + if shouldRecheck("No material issues found.") != false { + t.Error("a clean draft should skip recheck") + } + if shouldRecheck("### review\n\nNo material issues found.\n") != false { + t.Error("clean draft detection should be case/whitespace tolerant") + } + + // Explicit disable wins even when there are findings. + t.Setenv("GADFLY_RECHECK", "0") + if shouldRecheck("Blocking issues found\n- x") != false { + t.Error("GADFLY_RECHECK=0 must disable recheck") + } + t.Setenv("GADFLY_RECHECK", "false") + if shouldRecheck("Blocking issues found\n- x") != false { + t.Error("GADFLY_RECHECK=false must disable recheck") + } +} + +func TestRecheckEnabled(t *testing.T) { + for _, v := range []string{"", "1", "true", "yes", "anything"} { + t.Setenv("GADFLY_RECHECK", v) + if !recheckEnabled() { + t.Errorf("GADFLY_RECHECK=%q should be enabled", v) + } + } + for _, v := range []string{"0", "false", "no", "off", "OFF", " False "} { + t.Setenv("GADFLY_RECHECK", v) + if recheckEnabled() { + t.Errorf("GADFLY_RECHECK=%q should be disabled", v) + } + } +} + +func TestBuildRecheckTask(t *testing.T) { + t.Setenv("GADFLY_MAX_DIFF_CHARS", "") + draft := "VERDICT: Blocking issues found\n- foo.go:1 broken" + out := buildRecheckTask(draft, "diff --git a/x b/x\n+y\n") + if !strings.Contains(out, draft) { + t.Error("recheck task must include the draft review") + } + if !strings.Contains(out, "Verify") || !strings.Contains(out, "drop every finding you cannot confirm") { + t.Errorf("recheck task missing the verify instruction:\n%s", out) + } + if !strings.Contains(out, "diff --git") { + t.Error("recheck task should include the diff") + } +} + +// fakeModel builds a fake majordomo model that always replies with the given +// text (no tool calls), so the agent loop ends on its first step. +func fakeModel(t *testing.T, reply string) llm.Model { + t.Helper() + p := fake.New("fake", fake.WithDefault(func(string, llm.Request) fake.Step { + return fake.Reply(reply) + })) + m, err := p.Model("mock") + if err != nil { + t.Fatal(err) + } + return m +} + +func TestRunAgent_ReturnsOutput(t *testing.T) { + fs, err := newRepoFS(t.TempDir(), "diff") + if err != nil { + t.Fatal(err) + } + mdl := fakeModel(t, " corrected review: No material issues found. ") + out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4) + if err != nil { + t.Fatalf("runAgent: %v", err) + } + if out != "corrected review: No material issues found." { + t.Errorf("runAgent should return trimmed model output, got %q", out) + } +} + +func TestRunAgent_EmptyIsError(t *testing.T) { + fs, _ := newRepoFS(t.TempDir(), "diff") + mdl := fakeModel(t, " ") + if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil { + t.Error("runAgent should error on empty model output") + } +} diff --git a/cmd/gadfly/tools.go b/cmd/gadfly/tools.go new file mode 100644 index 0000000..8d4e3ea --- /dev/null +++ b/cmd/gadfly/tools.go @@ -0,0 +1,388 @@ +package main + +import ( + "bufio" + "context" + "fmt" + "os" + "path/filepath" + "regexp" + "sort" + "strings" + + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" +) + +// Tool output bounds. The reviewer is a chat agent with a finite context, so +// every tool caps how much it can pull in one call — a runaway read_file or +// grep would blow the window and stall the loop. +const ( + maxFileBytes = 64 * 1024 // per read_file call + maxReadLines = 800 // per read_file call + maxGrepResults = 200 // per grep call + maxFindResults = 200 // per find_files call + maxLineLen = 400 // truncate any single returned line to this +) + +// skipDirs are never descended into by grep / find_files — noise and bulk that +// a code reviewer never needs and that would swamp the results. +var skipDirs = map[string]bool{ + ".git": true, + "node_modules": true, + "vendor": true, +} + +// repoFS is a read-only, sandboxed view of the checked-out repository. Every +// path argument from the model is resolved against root and rejected if it +// escapes (symlink or `..` traversal), so a hostile diff can never make the +// reviewer read outside the checkout. +type repoFS struct { + root string // absolute, symlink-resolved repo root + diff string // the full PR unified diff (served by get_diff) +} + +// newRepoFS resolves root to an absolute, symlink-free path. +func newRepoFS(root, diff string) (*repoFS, error) { + abs, err := filepath.Abs(root) + if err != nil { + return nil, fmt.Errorf("resolve repo dir: %w", err) + } + // EvalSymlinks so prefix containment checks survive a symlinked root + // (e.g. macOS /tmp -> /private/tmp). + if resolved, err := filepath.EvalSymlinks(abs); err == nil { + abs = resolved + } + info, err := os.Stat(abs) + if err != nil { + return nil, fmt.Errorf("repo dir %q: %w", root, err) + } + if !info.IsDir() { + return nil, fmt.Errorf("repo dir %q is not a directory", root) + } + return &repoFS{root: abs, diff: diff}, nil +} + +// resolve maps a model-supplied relative path to an absolute path inside the +// sandbox, rejecting anything that escapes root. An empty path means root. +func (r *repoFS) resolve(rel string) (string, error) { + rel = strings.TrimSpace(rel) + rel = strings.TrimPrefix(rel, "./") + if rel == "" || rel == "." { + return r.root, nil + } + if filepath.IsAbs(rel) { + // Allow an absolute path only if it already points inside the sandbox. + clean := filepath.Clean(rel) + if err := r.contains(clean); err != nil { + return "", err + } + return clean, nil + } + joined := filepath.Clean(filepath.Join(r.root, rel)) + if err := r.contains(joined); err != nil { + return "", err + } + return joined, nil +} + +// contains verifies abs is root or lives beneath it. +func (r *repoFS) contains(abs string) error { + if abs == r.root { + return nil + } + if !strings.HasPrefix(abs, r.root+string(os.PathSeparator)) { + return fmt.Errorf("path escapes the repository sandbox") + } + return nil +} + +// toolbox builds the read-only review toolbox over this sandbox. +func (r *repoFS) toolbox() (*llm.Toolbox, error) { + box := llm.NewToolbox("gadfly") + tools := []llm.Tool{ + r.readFileTool(), + r.listDirTool(), + r.grepTool(), + r.findFilesTool(), + r.getDiffTool(), + } + for _, t := range tools { + if err := box.Add(t); err != nil { + return nil, fmt.Errorf("add tool %q: %w", t.Name, err) + } + } + return box, nil +} + +type readFileArgs struct { + Path string `json:"path" description:"Repository-relative path of the file to read, e.g. pkg/logic/agentexec/pipeline.go"` + StartLine int `json:"start_line,omitempty" description:"Optional 1-based line to start from (default 1)."` + Limit int `json:"limit,omitempty" description:"Optional max number of lines to return (default/maximum 800)."` +} + +func (r *repoFS) readFileTool() llm.Tool { + return llm.DefineTool[readFileArgs]( + "read_file", + "Read a file from the repository at its current checked-out state, with line numbers. Use this to verify the surrounding code, imports, and symbols a diff hunk touches before reporting an issue.", + func(_ context.Context, args readFileArgs) (any, error) { + abs, err := r.resolve(args.Path) + if err != nil { + return nil, err + } + info, err := os.Stat(abs) + if err != nil { + return nil, fmt.Errorf("stat %q: %w", args.Path, err) + } + if info.IsDir() { + return nil, fmt.Errorf("%q is a directory; use list_dir", args.Path) + } + f, err := os.Open(abs) + if err != nil { + return nil, fmt.Errorf("open %q: %w", args.Path, err) + } + defer f.Close() + + start := args.StartLine + if start < 1 { + start = 1 + } + limit := args.Limit + if limit <= 0 || limit > maxReadLines { + limit = maxReadLines + } + + var b strings.Builder + sc := bufio.NewScanner(f) + sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) + lineNo := 0 + emitted := 0 + for sc.Scan() { + lineNo++ + if lineNo < start { + continue + } + if emitted >= limit || b.Len() >= maxFileBytes { + fmt.Fprintf(&b, "... (truncated at line %d; call read_file again with start_line=%d for more)\n", lineNo, lineNo) + break + } + line := sc.Text() + if len(line) > maxLineLen { + line = line[:maxLineLen] + "…" + } + fmt.Fprintf(&b, "%d\t%s\n", lineNo, line) + emitted++ + } + if err := sc.Err(); err != nil { + return nil, fmt.Errorf("read %q: %w", args.Path, err) + } + if emitted == 0 { + return fmt.Sprintf("(%s has no lines at/after %d; file has %d lines)", args.Path, start, lineNo), nil + } + return b.String(), nil + }, + ) +} + +type listDirArgs struct { + Path string `json:"path,omitempty" description:"Optional repository-relative directory (default: repo root)."` +} + +func (r *repoFS) listDirTool() llm.Tool { + return llm.DefineTool[listDirArgs]( + "list_dir", + "List the entries of a directory in the repository (directories marked with a trailing /). Use it to discover where code lives before reading.", + func(_ context.Context, args listDirArgs) (any, error) { + abs, err := r.resolve(args.Path) + if err != nil { + return nil, err + } + entries, err := os.ReadDir(abs) + if err != nil { + return nil, fmt.Errorf("list %q: %w", args.Path, err) + } + names := make([]string, 0, len(entries)) + for _, e := range entries { + name := e.Name() + if e.IsDir() { + name += "/" + } + names = append(names, name) + } + sort.Strings(names) + if len(names) == 0 { + return "(empty directory)", nil + } + return strings.Join(names, "\n"), nil + }, + ) +} + +type grepArgs struct { + Pattern string `json:"pattern" description:"A Go (RE2) regular expression to search for."` + Path string `json:"path,omitempty" description:"Optional repository-relative file or subdirectory to scope the search (default: whole repo)."` + MaxResults int `json:"max_results,omitempty" description:"Optional cap on matching lines returned (default/maximum 200)."` +} + +func (r *repoFS) grepTool() llm.Tool { + return llm.DefineTool[grepArgs]( + "grep", + "Search the repository's text files for a regular expression and return matching `path:line: text`. Use it to check whether a symbol, import, or call exists elsewhere before claiming a cross-file problem.", + func(_ context.Context, args grepArgs) (any, error) { + if strings.TrimSpace(args.Pattern) == "" { + return nil, fmt.Errorf("pattern is required") + } + re, err := regexp.Compile(args.Pattern) + if err != nil { + return nil, fmt.Errorf("invalid regexp: %w", err) + } + base, err := r.resolve(args.Path) + if err != nil { + return nil, err + } + limit := args.MaxResults + if limit <= 0 || limit > maxGrepResults { + limit = maxGrepResults + } + + var out []string + truncated := false + walkErr := filepath.WalkDir(base, func(path string, d os.DirEntry, err error) error { + if err != nil { + return nil // skip unreadable entries + } + if d.IsDir() { + if skipDirs[d.Name()] && path != base { + return filepath.SkipDir + } + return nil + } + if len(out) >= limit { + truncated = true + return filepath.SkipAll + } + matchesInFile(path, r.root, re, limit, &out) + return nil + }) + if walkErr != nil { + return nil, fmt.Errorf("search: %w", walkErr) + } + if len(out) > limit { + out = out[:limit] + truncated = true + } + if len(out) == 0 { + return "(no matches)", nil + } + res := strings.Join(out, "\n") + if truncated { + res += fmt.Sprintf("\n... (truncated at %d matches; narrow the pattern or path)", limit) + } + return res, nil + }, + ) +} + +// matchesInFile appends "relpath:line: text" for each regexp match in a single +// text file, stopping once the global cap is reached. Binary files (NUL in the +// first chunk) and oversized files are skipped. +func matchesInFile(path, root string, re *regexp.Regexp, limit int, out *[]string) { + f, err := os.Open(path) + if err != nil { + return + } + defer f.Close() + + rel, relErr := filepath.Rel(root, path) + if relErr != nil { + rel = path + } + sc := bufio.NewScanner(f) + sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) + lineNo := 0 + for sc.Scan() { + if len(*out) >= limit { + return + } + lineNo++ + line := sc.Text() + if lineNo == 1 && strings.IndexByte(line, 0) >= 0 { + return // looks binary + } + if re.MatchString(line) { + trimmed := strings.TrimSpace(line) + if len(trimmed) > maxLineLen { + trimmed = trimmed[:maxLineLen] + "…" + } + *out = append(*out, fmt.Sprintf("%s:%d: %s", rel, lineNo, trimmed)) + } + } +} + +type findFilesArgs struct { + Name string `json:"name" description:"Case-insensitive substring of the file path to match, e.g. \"pipeline.go\" or \"agentexec/\"."` + MaxResults int `json:"max_results,omitempty" description:"Optional cap on paths returned (default/maximum 200)."` +} + +func (r *repoFS) findFilesTool() llm.Tool { + return llm.DefineTool[findFilesArgs]( + "find_files", + "Find files whose repository-relative path contains a case-insensitive substring. Use it to locate a file by name when you don't know its directory.", + func(_ context.Context, args findFilesArgs) (any, error) { + needle := strings.ToLower(strings.TrimSpace(args.Name)) + if needle == "" { + return nil, fmt.Errorf("name is required") + } + limit := args.MaxResults + if limit <= 0 || limit > maxFindResults { + limit = maxFindResults + } + var out []string + truncated := false + _ = filepath.WalkDir(r.root, func(path string, d os.DirEntry, err error) error { + if err != nil { + return nil + } + if d.IsDir() { + if skipDirs[d.Name()] && path != r.root { + return filepath.SkipDir + } + return nil + } + if len(out) >= limit { + truncated = true + return filepath.SkipAll + } + rel, relErr := filepath.Rel(r.root, path) + if relErr != nil { + return nil + } + if strings.Contains(strings.ToLower(rel), needle) { + out = append(out, rel) + } + return nil + }) + sort.Strings(out) + if len(out) == 0 { + return "(no files matched)", nil + } + res := strings.Join(out, "\n") + if truncated { + res += fmt.Sprintf("\n... (truncated at %d files; narrow the name)", limit) + } + return res, nil + }, + ) +} + +func (r *repoFS) getDiffTool() llm.Tool { + return llm.DefineTool[struct{}]( + "get_diff", + "Return the complete unified diff under review. The diff is also included (possibly truncated) in the task message; call this to get the full, untruncated text.", + func(_ context.Context, _ struct{}) (any, error) { + if strings.TrimSpace(r.diff) == "" { + return "(empty diff)", nil + } + return r.diff, nil + }, + ) +} diff --git a/cmd/gadfly/tools_test.go b/cmd/gadfly/tools_test.go new file mode 100644 index 0000000..3c2a2c4 --- /dev/null +++ b/cmd/gadfly/tools_test.go @@ -0,0 +1,243 @@ +package main + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" +) + +// buildFixtureRepo lays down a small repo tree for the toolbox tests and +// returns its root. +func buildFixtureRepo(t *testing.T) string { + t.Helper() + root := t.TempDir() + write := func(rel, content string) { + p := filepath.Join(root, rel) + if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(p, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + } + write("pkg/foo/foo.go", "package foo\n\nfunc Hello() string {\n\treturn \"hi\"\n}\n") + write("pkg/foo/bar.go", "package foo\n\n// TODO: refactor\nvar Answer = 42\n") + write("README.md", "# Fixture\n\nHello world.\n") + write(".git/config", "[core]\n\tbare = false\n") // must be skipped by grep/find + write("secret.txt", "this file lives at the repo root\n") + return root +} + +// call invokes a tool from the sandbox's toolbox by name with JSON args and +// returns the result string (or the error). +func call(t *testing.T, fs *repoFS, name string, args map[string]any) (string, error) { + t.Helper() + box, err := fs.toolbox() + if err != nil { + t.Fatalf("toolbox: %v", err) + } + tool, ok := box.Get(name) + if !ok { + t.Fatalf("tool %q not in toolbox", name) + } + raw, err := json.Marshal(args) + if err != nil { + t.Fatal(err) + } + out, herr := tool.Handler(context.Background(), raw) + if herr != nil { + return "", herr + } + s, _ := out.(string) + return s, nil +} + +func TestRepoFS_ResolveSandbox(t *testing.T) { + root := buildFixtureRepo(t) + fs, err := newRepoFS(root, "") + if err != nil { + t.Fatalf("newRepoFS: %v", err) + } + + // In-bounds paths resolve. + if _, err := fs.resolve("pkg/foo/foo.go"); err != nil { + t.Errorf("in-bounds path rejected: %v", err) + } + if got, err := fs.resolve(""); err != nil || got != fs.root { + t.Errorf("empty path should be root: got %q err %v", got, err) + } + + // Escapes are rejected. + for _, bad := range []string{"../outside", "../../etc/passwd", "pkg/../../escape", "/etc/passwd"} { + if _, err := fs.resolve(bad); err == nil { + t.Errorf("path %q escaped the sandbox but was allowed", bad) + } + } +} + +func TestReadFileTool(t *testing.T) { + root := buildFixtureRepo(t) + fs, _ := newRepoFS(root, "") + + out, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go"}) + if err != nil { + t.Fatalf("read_file: %v", err) + } + if !strings.Contains(out, "func Hello()") { + t.Errorf("expected file body, got:\n%s", out) + } + if !strings.Contains(out, "1\t") { + t.Errorf("expected line numbers, got:\n%s", out) + } + + // Line slicing. + out, err = call(t, fs, "read_file", map[string]any{"path": "pkg/foo/foo.go", "start_line": 3, "limit": 1}) + if err != nil { + t.Fatalf("read_file slice: %v", err) + } + if !strings.Contains(out, "func Hello()") || strings.Contains(out, "package foo") { + t.Errorf("slice should start at line 3 only, got:\n%s", out) + } + + // Reading a directory is an error directing to list_dir. + if _, err := call(t, fs, "read_file", map[string]any{"path": "pkg/foo"}); err == nil { + t.Error("reading a directory should error") + } + + // Escape is rejected. + if _, err := call(t, fs, "read_file", map[string]any{"path": "../escape"}); err == nil { + t.Error("read_file should reject sandbox escape") + } +} + +func TestListDirTool(t *testing.T) { + root := buildFixtureRepo(t) + fs, _ := newRepoFS(root, "") + + out, err := call(t, fs, "list_dir", map[string]any{"path": "pkg/foo"}) + if err != nil { + t.Fatalf("list_dir: %v", err) + } + for _, want := range []string{"foo.go", "bar.go"} { + if !strings.Contains(out, want) { + t.Errorf("list_dir missing %q in:\n%s", want, out) + } + } + + // Root listing marks directories with a trailing slash. + out, _ = call(t, fs, "list_dir", map[string]any{}) + if !strings.Contains(out, "pkg/") { + t.Errorf("expected pkg/ (dir with trailing slash) in root listing:\n%s", out) + } +} + +func TestGrepTool(t *testing.T) { + root := buildFixtureRepo(t) + fs, _ := newRepoFS(root, "") + + out, err := call(t, fs, "grep", map[string]any{"pattern": "func Hello"}) + if err != nil { + t.Fatalf("grep: %v", err) + } + if !strings.Contains(out, "pkg/foo/foo.go:") { + t.Errorf("grep should locate the func, got:\n%s", out) + } + + // .git is skipped. + out, _ = call(t, fs, "grep", map[string]any{"pattern": "bare = false"}) + if strings.Contains(out, ".git/") { + t.Errorf("grep must not descend into .git, got:\n%s", out) + } + + // No matches is a clean message, not an error. + out, err = call(t, fs, "grep", map[string]any{"pattern": "zzz_no_such_token_zzz"}) + if err != nil || !strings.Contains(out, "no matches") { + t.Errorf("expected clean no-match, got %q err %v", out, err) + } + + // Invalid regexp surfaces as an error. + if _, err := call(t, fs, "grep", map[string]any{"pattern": "([unterminated"}); err == nil { + t.Error("invalid regexp should error") + } + + // Scoped grep honors the path. + out, _ = call(t, fs, "grep", map[string]any{"pattern": "Answer", "path": "pkg/foo/bar.go"}) + if !strings.Contains(out, "bar.go:") { + t.Errorf("scoped grep missed the match:\n%s", out) + } +} + +func TestFindFilesTool(t *testing.T) { + root := buildFixtureRepo(t) + fs, _ := newRepoFS(root, "") + + out, err := call(t, fs, "find_files", map[string]any{"name": "foo.go"}) + if err != nil { + t.Fatalf("find_files: %v", err) + } + if !strings.Contains(out, "pkg/foo/foo.go") { + t.Errorf("find_files missed foo.go:\n%s", out) + } + + // Case-insensitive substring on the path. + out, _ = call(t, fs, "find_files", map[string]any{"name": "PKG/FOO"}) + if !strings.Contains(out, "pkg/foo/") { + t.Errorf("find_files should be case-insensitive on the path:\n%s", out) + } + + // .git entries are not surfaced. + out, _ = call(t, fs, "find_files", map[string]any{"name": "config"}) + if strings.Contains(out, ".git/") { + t.Errorf("find_files must skip .git, got:\n%s", out) + } +} + +func TestGetDiffTool(t *testing.T) { + root := buildFixtureRepo(t) + const diff = "diff --git a/x b/x\n+added line\n" + fs, _ := newRepoFS(root, diff) + + out, err := call(t, fs, "get_diff", map[string]any{}) + if err != nil { + t.Fatalf("get_diff: %v", err) + } + if out != diff { + t.Errorf("get_diff returned %q, want %q", out, diff) + } +} + +func TestNewRepoFS_BadRoot(t *testing.T) { + // A file (not a directory) is rejected. + f := filepath.Join(t.TempDir(), "afile") + if err := os.WriteFile(f, []byte("x"), 0o644); err != nil { + t.Fatal(err) + } + if _, err := newRepoFS(f, ""); err == nil { + t.Error("newRepoFS should reject a non-directory root") + } + if _, err := newRepoFS(filepath.Join(t.TempDir(), "missing"), ""); err == nil { + t.Error("newRepoFS should reject a missing root") + } +} + +// Ensure the toolbox exposes exactly the expected tools (guards against an +// accidental rename breaking the system prompt's tool references). +func TestToolbox_Names(t *testing.T) { + fs, _ := newRepoFS(t.TempDir(), "") + box, err := fs.toolbox() + if err != nil { + t.Fatalf("toolbox: %v", err) + } + got := map[string]bool{} + for _, tl := range box.Tools() { + got[tl.Name] = true + } + for _, want := range []string{"read_file", "list_dir", "grep", "find_files", "get_diff"} { + if !got[want] { + t.Errorf("toolbox missing tool %q", want) + } + } +} diff --git a/cmd/gadfly/wrapup_test.go b/cmd/gadfly/wrapup_test.go new file mode 100644 index 0000000..7ea1c0c --- /dev/null +++ b/cmd/gadfly/wrapup_test.go @@ -0,0 +1,143 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "testing" + + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/fake" +) + +// spinToolCall is a response that asks for the get_diff tool (which succeeds and +// ignores extra args), used to burn agent steps without producing a final +// answer. The args vary by n so successive calls are not byte-identical — that +// dodges the agent's same-call loop guard, exactly as a real reviewer making +// distinct tool calls would. +func spinToolCall(n int) fake.Step { + return fake.ReplyWith(llm.Response{ + ToolCalls: []llm.ToolCall{{ + ID: "call", + Name: "get_diff", + Arguments: json.RawMessage(fmt.Sprintf(`{"_n":%d}`, n)), + }}, + FinishReason: llm.FinishToolCalls, + Usage: llm.Usage{InputTokens: 1, OutputTokens: 1}, + }) +} + +// lastUserText returns the text of the final message in the request, which is +// what a fresh Generate call is reacting to. +func lastUserText(req llm.Request) string { + if len(req.Messages) == 0 { + return "" + } + return req.Messages[len(req.Messages)-1].Text() +} + +// TestRunAgent_WrapUpNudgeProducesAnswer: a model that keeps calling tools until +// it is nudged to wrap up should still finish inside its budget — the steer +// message arrives a few steps before the cap and the model writes its answer. +func TestRunAgent_WrapUpNudgeProducesAnswer(t *testing.T) { + t.Setenv("GADFLY_WRAPUP_RESERVE", "4") + + final := "VERDICT: No material issues found." + nudgeSeen := false + n := 0 + p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step { + if strings.Contains(lastUserText(req), "almost out of your investigation budget") { + nudgeSeen = true + return fake.Reply(final) + } + n++ + return spinToolCall(n) + })) + mdl, err := p.Model("mock") + if err != nil { + t.Fatal(err) + } + fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n") + + out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 12) + if err != nil { + t.Fatalf("runAgent should succeed via wrap-up nudge, got error: %v", err) + } + if out != final { + t.Errorf("expected final review %q, got %q", final, out) + } + if !nudgeSeen { + t.Error("the wrap-up nudge was never delivered to the model") + } +} + +// TestRunAgent_FinalizationFallback: a model that ignores the wrap-up nudge and +// spins on tools until the cap should NOT hard-fail — the tool-free finalization +// pass forces a final answer out of the transcript. +func TestRunAgent_FinalizationFallback(t *testing.T) { + t.Setenv("GADFLY_WRAPUP_RESERVE", "2") + + final := "VERDICT: Minor issues\n- something" + forcedCalled := false + n := 0 + p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step { + // Only the tool-free finalization pass forbids tools — reply there. + if req.ToolChoice == "none" { + forcedCalled = true + return fake.Reply(final) + } + // Otherwise keep spinning, ignoring the wrap-up nudge entirely. + n++ + return spinToolCall(n) + })) + mdl, err := p.Model("mock") + if err != nil { + t.Fatal(err) + } + fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n") + + out, err := runAgent(context.Background(), mdl, fs, "sys", "task", 6) + if err != nil { + t.Fatalf("runAgent should recover via finalization fallback, got error: %v", err) + } + if !forcedCalled { + t.Error("finalization fallback was never invoked") + } + if out != final { + t.Errorf("expected forced final answer %q, got %q", final, out) + } +} + +// TestRunAgent_FallbackStillEmptyIsError: if even the tool-free finalization +// yields nothing, runAgent surfaces an error rather than a phantom success. +func TestRunAgent_FallbackStillEmptyIsError(t *testing.T) { + n := 0 + p := fake.New("fake", fake.WithDefault(func(_ string, req llm.Request) fake.Step { + if req.ToolChoice == "none" { + return fake.Reply(" ") // finalization produces only whitespace + } + n++ + return spinToolCall(n) + })) + mdl, err := p.Model("mock") + if err != nil { + t.Fatal(err) + } + fs, _ := newRepoFS(t.TempDir(), "diff --git a/x b/x\n+y\n") + + if _, err := runAgent(context.Background(), mdl, fs, "sys", "task", 4); err == nil { + t.Error("runAgent should error when the finalization fallback also yields no output") + } +} + +func TestWrapUpReserve(t *testing.T) { + t.Setenv("GADFLY_WRAPUP_RESERVE", "") + if got := wrapUpReserve(); got != defaultWrapUpReserve { + t.Errorf("default wrap-up reserve = %d, want %d", got, defaultWrapUpReserve) + } + t.Setenv("GADFLY_WRAPUP_RESERVE", "7") + if got := wrapUpReserve(); got != 7 { + t.Errorf("wrap-up reserve override = %d, want 7", got) + } +} diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..7b12ee0 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +# Gadfly container entrypoint. +# +# This is the brains that used to live in the Gitea Actions workflow YAML. A +# consuming repo only commits a ~15-line stub workflow that runs this image and +# passes the event context as env; ALL the gating, cloning, model-looping and +# comment I/O happens here, so the stub stays dumb (act_runner has weak YAML +# expression support — keep logic in the image, not the workflow). +# +# What it does: +# 1. Decides whether this event should trigger a review (draft skip, comment +# trigger phrase + allowed-user gate, PR detection). Non-triggers exit 0. +# 2. Acknowledges a comment trigger with a 👀 reaction. +# 3. Shallow-clones the PR's head branch (the agentic reviewer reads the +# checked-out tree to VERIFY findings, not just the diff). +# 4. Runs the gadfly reviewer once per configured model via run.sh, which +# upserts one labeled PR comment per model. +# +# Advisory only: it never blocks a merge. Config/usage errors exit non-zero; +# everything review-related is posted as a comment, never a failed check. +# +# Env (set by the consumer's stub workflow from the github.* context): +# GITEA_API https://HOST/api/v1/repos/OWNER/REPO (required) +# GITEA_TOKEN built-in Actions token (posts comments) (required) +# OLLAMA_CLOUD_API_KEY Ollama Cloud key; empty => "not configured" notice +# EVENT_NAME pull_request | issue_comment | workflow_dispatch (required) +# PR pull request number (required) +# PR_BRANCH head branch (github.head_ref); empty => fetched from API +# IS_DRAFT 'true' on a draft PR => skipped +# COMMENT_BODY comment text (issue_comment only) +# COMMENT_ID comment id, for the 👀 reaction (issue_comment only) +# ACTOR github.actor (the user who triggered) +# Optional config: +# OLLAMA_REVIEW_MODELS comma-separated model ids (default below) +# GADFLY_TRIGGER_PHRASE comment phrase that triggers a re-review (default "@gadfly review") +# GADFLY_ALLOWED_USERS comma-separated usernames allowed to comment-trigger; +# empty => fall back to "is a repo collaborator" +set -uo pipefail + +DEFAULT_MODELS="qwen3-coder:480b-cloud,gpt-oss:120b-cloud" +TRIGGER_PHRASE="${GADFLY_TRIGGER_PHRASE:-@gadfly review}" +SCRIPTS_DIR="/app/scripts" +WORKDIR="${WORKDIR:-/tmp/gadfly}" + +log() { echo "[gadfly] $*" >&2; } +die() { log "ERROR: $*"; exit 1; } + +: "${GITEA_API:?GITEA_API required}" +: "${GITEA_TOKEN:?GITEA_TOKEN required}" +: "${PR:?PR required}" +: "${EVENT_NAME:?EVENT_NAME required}" + +API() { curl -fsS --connect-timeout 20 --max-time 30 -H "Authorization: token ${GITEA_TOKEN}" "$@"; } + +# --- is the commenter allowed to trigger a re-review? ---------------------- +actor_allowed() { + local actor="$1" + [ -z "$actor" ] && return 1 + if [ -n "${GADFLY_ALLOWED_USERS:-}" ]; then + local IFS=',' + for u in $GADFLY_ALLOWED_USERS; do + [ "$(echo "$u" | tr -d '[:space:]')" = "$actor" ] && return 0 + done + return 1 + fi + # No explicit allow-list: allow anyone with collaborator (write) access. + local code + code="$(curl -s -o /dev/null -w '%{http_code}' --connect-timeout 20 --max-time 30 \ + -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_API}/collaborators/${actor}")" + [ "$code" = "204" ] +} + +# --- trigger gating -------------------------------------------------------- +case "$EVENT_NAME" in + workflow_dispatch) + log "manual dispatch for PR #${PR}" ;; + pull_request) + if [ "${IS_DRAFT:-false}" = "true" ]; then + log "PR #${PR} is a draft; skipping"; exit 0 + fi + log "new/updated PR #${PR}" ;; + issue_comment) + case "${COMMENT_BODY:-}" in + *"$TRIGGER_PHRASE"*) : ;; + *) log "comment does not contain trigger phrase ${TRIGGER_PHRASE}; skipping"; exit 0 ;; + esac + if ! actor_allowed "${ACTOR:-}"; then + log "actor '${ACTOR:-}' not allowed to trigger; skipping"; exit 0 + fi + # Must be a comment on a PR, not a plain issue. + if ! API "${GITEA_API}/pulls/${PR}" >/dev/null 2>&1; then + log "issue #${PR} is not a pull request; skipping"; exit 0 + fi + # Acknowledge with 👀. + if [ -n "${COMMENT_ID:-}" ]; then + curl -s -X POST -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/comments/${COMMENT_ID}/reactions" -d '{"content":"eyes"}' >/dev/null 2>&1 || true + fi + log "comment-triggered review for PR #${PR} by ${ACTOR:-?}" ;; + *) + log "event '${EVENT_NAME}' not handled; skipping"; exit 0 ;; +esac + +# --- resolve head branch --------------------------------------------------- +BRANCH="${PR_BRANCH:-}" +if [ -z "$BRANCH" ]; then + BRANCH="$(API "${GITEA_API}/pulls/${PR}" | jq -r '.head.ref // ""')" +fi +[ -z "$BRANCH" ] && die "could not determine PR #${PR} head branch" + +# --- clone the PR's checked-out tree (shallow) ----------------------------- +HOST="${GITEA_API%%/api/v1/*}" # https://host +REPO_PATH="${GITEA_API##*/api/v1/repos/}" # owner/repo +CLONE_URL="https://token:${GITEA_TOKEN}@${HOST#https://}/${REPO_PATH}.git" +REPO_DIR="${WORKDIR}/repo" +rm -rf "$REPO_DIR"; mkdir -p "$WORKDIR" +log "cloning ${REPO_PATH} @ ${BRANCH}" +git clone --depth=1 --branch "$BRANCH" "$CLONE_URL" "$REPO_DIR" 2>/dev/null \ + || die "clone of ${REPO_PATH}@${BRANCH} failed" + +# --- review once per model ------------------------------------------------- +MODELS="${OLLAMA_REVIEW_MODELS:-$DEFAULT_MODELS}" +log "models: ${MODELS}" +IFS=',' read -ra ARR <<< "$MODELS" || true +for raw in "${ARR[@]}"; do + m="$(echo "$raw" | tr -d '[:space:]')" + [ -z "$m" ] && continue + log "::: reviewing with ${m}" + PROVIDER=ollama \ + MODEL="$m" \ + GADFLY_BIN="/usr/local/bin/gadfly" \ + GADFLY_REPO_DIR="$REPO_DIR" \ + bash "${SCRIPTS_DIR}/run.sh" || log "model ${m} failed (continuing)" +done +log "done" diff --git a/examples/adversarial-review.yml b/examples/adversarial-review.yml new file mode 100644 index 0000000..aaccb70 --- /dev/null +++ b/examples/adversarial-review.yml @@ -0,0 +1,52 @@ +# Drop this in ANY Gitea repo at .gitea/workflows/adversarial-review.yml to turn +# Gadfly on. The image holds all the logic; this stub just forwards the event +# context. Advisory only — it never blocks a merge. +# +# Per-repo setup (no code changes needed): +# secret OLLAMA_CLOUD_API_KEY your Ollama Cloud key +# var OLLAMA_REVIEW_MODELS (optional) comma-separated model ids +# var GADFLY_ALLOWED_USERS (optional) who may "@gadfly review"; empty = +# any repo collaborator +# GITEA_TOKEN is provided automatically; comments post as the gitea-actions user. + +name: Adversarial Review (Gadfly) + +on: + pull_request: + types: [opened, reopened, ready_for_review] + issue_comment: + types: [created] + workflow_dispatch: + inputs: + pr_number: + description: "PR number to review" + required: true + +permissions: + contents: read + issues: write + pull-requests: write + +concurrency: + group: gadfly-${{ github.event.issue.number || github.event.pull_request.number || github.event.inputs.pr_number }} + cancel-in-progress: true + +jobs: + review: + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - uses: docker://gitea.stevedudenhoeffer.com/steve/gadfly:v1 + env: + GITEA_API: ${{ github.server_url }}/api/v1/repos/${{ github.repository }} + GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} + OLLAMA_CLOUD_API_KEY: ${{ secrets.OLLAMA_CLOUD_API_KEY }} + OLLAMA_REVIEW_MODELS: ${{ vars.OLLAMA_REVIEW_MODELS }} + GADFLY_ALLOWED_USERS: ${{ vars.GADFLY_ALLOWED_USERS }} + EVENT_NAME: ${{ github.event_name }} + PR: ${{ github.event.pull_request.number || github.event.issue.number || github.event.inputs.pr_number }} + PR_BRANCH: ${{ github.head_ref }} + IS_DRAFT: ${{ github.event.pull_request.draft }} + COMMENT_BODY: ${{ github.event.comment.body }} + COMMENT_ID: ${{ github.event.comment.id }} + ACTOR: ${{ github.actor }} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..42718ee --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module gitea.stevedudenhoeffer.com/steve/gadfly + +go 1.26.2 + +require gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260610113006-0147a79d187b diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..15f9b96 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260610113006-0147a79d187b h1:/pglCqQW02kV2p9tKyQpIJoXZK2p7LKLeDCZL/V26MM= +gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260610113006-0147a79d187b/go.mod h1:UZLveG17SmENt4sne2RSLIbioix30RZbRIQUzBAnOyY= diff --git a/scripts/run.sh b/scripts/run.sh new file mode 100644 index 0000000..8a22293 --- /dev/null +++ b/scripts/run.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +# Adversarial PR review runner. +# +# Fetches a PR's unified diff + metadata from Gitea, asks ONE model to review it +# adversarially, then upserts the result as a single labeled PR comment (so +# re-runs on new commits update the comment in place instead of stacking dupes). +# +# The ollama lane is AGENTIC: it runs the cmd/gadfly Go binary, which drives a +# tool-using agent (majordomo + Ollama Cloud) over the PR's checked-out repo so +# the model can read_file/grep/etc. to VERIFY findings instead of guessing from +# the diff alone. The antigravity lane stays a one-shot `agy` call (agy has its +# own file tools). +# +# Required env: +# GITEA_API e.g. https://gitea.stevedudenhoeffer.com/api/v1/repos/steve/mort +# GITEA_TOKEN token with repo write access (posts the comment) +# PR pull request index/number +# PROVIDER "ollama" | "antigravity" +# MODEL model id (e.g. qwen3-coder:480b-cloud, gemini-3-pro) +# +# Provider-specific env: +# ollama: OLLAMA_CLOUD_API_KEY, GADFLY_BIN (path to the built reviewer), +# GADFLY_REPO_DIR (checked-out repo; default: this script's repo) +# antigravity: `agy` on PATH with credentials already seeded (~/.gemini) +# +# Optional: +# MAX_DIFF_CHARS diff truncation cap for the prompt (default 60000) +# +# This script is advisory: it never fails the job for review content. It exits +# non-zero only on a usage/configuration error. +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MAX_DIFF_CHARS="${MAX_DIFF_CHARS:-60000}" + +: "${GITEA_API:?GITEA_API required}" +: "${GITEA_TOKEN:?GITEA_TOKEN required}" +: "${PR:?PR required}" +: "${PROVIDER:?PROVIDER required}" +: "${MODEL:?MODEL required}" + +MARKER="" +say() { echo "[gadfly-review:${PROVIDER}:${MODEL}] $*" >&2; } + +# jq is required for payload building / response parsing; install if missing. +if ! command -v jq >/dev/null 2>&1; then + say "jq not found; attempting install" + { apt-get update -qq && apt-get install -y -qq jq; } >/dev/null 2>&1 \ + || { sudo apt-get update -qq && sudo apt-get install -y -qq jq; } >/dev/null 2>&1 \ + || { say "could not install jq"; exit 1; } +fi + +# curl timeouts: Gitea API calls are quick. Word-split on purpose so the flags +# expand as separate args. (The LLM call's own deadline lives in the reviewer +# binary / agy, not here.) +API_TIMEOUT="--connect-timeout 20 --max-time 30" + +# --- fetch PR context ------------------------------------------------------- +say "fetching PR #${PR} context" +DIFF="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_API}/pulls/${PR}.diff" || true)" +META="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_API}/pulls/${PR}" || echo '{}')" +TITLE="$(echo "$META" | jq -r '.title // ""')" +BODY="$(echo "$META" | jq -r '.body // ""')" + +if [ -z "$DIFF" ]; then + say "empty diff; nothing to review" + exit 0 +fi + +# Keep the FULL diff for the agentic (ollama) reviewer — it can pull the whole +# thing via the get_diff tool and embeds a truncated copy in the prompt itself. +# The truncated copy below is only for the one-shot antigravity prompt. +FULL_DIFF="$DIFF" +TRUNC_NOTE="" +if [ "${#DIFF}" -gt "$MAX_DIFF_CHARS" ]; then + DIFF="${DIFF:0:$MAX_DIFF_CHARS}" + TRUNC_NOTE=$'\n\n[NOTE: diff truncated to '"${MAX_DIFF_CHARS}"' chars for length; review the rest manually.]' +fi + +SYS="$(cat "${SCRIPT_DIR}/system-prompt.txt")" +USR="$(printf 'PR #%s: %s\n\nDescription:\n%s\n\nUnified diff to review:\n```diff\n%s\n```%s' \ + "$PR" "$TITLE" "$BODY" "$DIFF" "$TRUNC_NOTE")" + +# --- call the model --------------------------------------------------------- +REVIEW="" +case "$PROVIDER" in + ollama) + # Agentic lane: hand off to the cmd/gadfly binary, which runs a tool-using + # agent over the checked-out repo so it can verify findings instead of + # guessing from the diff. The workflow builds the binary and exports + # GADFLY_BIN + GADFLY_REPO_DIR; we fall back to sane defaults for a + # local run. + if [ -z "${OLLAMA_CLOUD_API_KEY:-}" ]; then + REVIEW="⚠️ \`OLLAMA_CLOUD_API_KEY\` is not configured; this reviewer was skipped." + else + BIN="${GADFLY_BIN:-gadfly}" + if ! command -v "$BIN" >/dev/null 2>&1 && [ ! -x "$BIN" ]; then + REVIEW="⚠️ Agentic reviewer binary not found (\`GADFLY_BIN=${BIN}\`); the workflow build step may have failed." + else + REPO_DIR="${GADFLY_REPO_DIR:-$(cd "${SCRIPT_DIR}/../../.." && pwd)}" + DIFF_FILE="$(mktemp)" + ERR_FILE="${DIFF_FILE}.err" + printf '%s' "$FULL_DIFF" > "$DIFF_FILE" + REVIEW="$( + OLLAMA_API_KEY="$OLLAMA_CLOUD_API_KEY" \ + GADFLY_MODEL="$MODEL" \ + GADFLY_REPO_DIR="$REPO_DIR" \ + GADFLY_DIFF_FILE="$DIFF_FILE" \ + GADFLY_SYSTEM_FILE="${SCRIPT_DIR}/system-prompt.txt" \ + GADFLY_TITLE="$TITLE" \ + GADFLY_BODY="$BODY" \ + GADFLY_MAX_DIFF_CHARS="$MAX_DIFF_CHARS" \ + "$BIN" 2>"$ERR_FILE" + )" + rc=$? + if [ "$rc" -ne 0 ] || [ -z "$REVIEW" ]; then + REVIEW="⚠️ Agentic reviewer for \`${MODEL}\` failed (exit ${rc}): +\`\`\` +$(tail -c 1500 "$ERR_FILE" 2>/dev/null) +\`\`\`" + fi + rm -f "$DIFF_FILE" "$ERR_FILE" + fi + fi + ;; + antigravity) + if ! command -v agy >/dev/null 2>&1; then + REVIEW="⚠️ Antigravity CLI (\`agy\`) not found on PATH." + else + FULL="$(printf '%s\n\n%s' "$SYS" "$USR")" + if ! REVIEW="$(agy -p "$FULL" --model "$MODEL" 2>agy.err)"; then + REVIEW="⚠️ Antigravity CLI failed: +\`\`\` +$(tail -c 1500 agy.err 2>/dev/null) +\`\`\`" + fi + [ -z "$REVIEW" ] && REVIEW="⚠️ Antigravity CLI returned no output (auth/quota?)." + fi + ;; + *) + say "unknown provider: ${PROVIDER}"; exit 1 ;; +esac + +# --- assemble comment ------------------------------------------------------- +COMMENT="$(printf '%s\n### 🔭 Adversarial review — `%s` (%s)\n\n%s\n\nAutomated adversarial review. Advisory only — does not block merge.' \ + "$MARKER" "$MODEL" "$PROVIDER" "$REVIEW")" +POST_BODY="$(jq -n --arg b "$COMMENT" '{body:$b}')" + +# --- upsert by marker ------------------------------------------------------- +EXISTING_ID="" +page=1 +while [ "$page" -le 10 ]; do + CMTS="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_API}/issues/${PR}/comments?limit=50&page=${page}" || echo '[]')" + [ "$(echo "$CMTS" | jq 'length')" = "0" ] && break + EXISTING_ID="$(echo "$CMTS" | jq -r --arg m "$MARKER" \ + '.[] | select(.body != null and (.body | startswith($m))) | .id' | head -n1)" + [ -n "$EXISTING_ID" ] && break + page=$((page+1)) +done + +if [ -n "$EXISTING_ID" ]; then + say "updating existing comment ${EXISTING_ID}" + curl $API_TIMEOUT -sS -X PATCH -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/comments/${EXISTING_ID}" -d "$POST_BODY" >/dev/null +else + say "creating new comment" + curl $API_TIMEOUT -sS -X POST -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/${PR}/comments" -d "$POST_BODY" >/dev/null +fi +say "done" diff --git a/scripts/system-prompt.txt b/scripts/system-prompt.txt new file mode 100644 index 0000000..ef7787d --- /dev/null +++ b/scripts/system-prompt.txt @@ -0,0 +1,47 @@ +You are Gadfly, an ADVERSARIAL code reviewer. Your job is to find real problems in the +pull request below — not to praise it. A gadfly does not let things slide. + +You are AGENTIC: you have read-only tools over the repository AT THIS PR's checked-out +state. USE THEM to verify before you report. Do not review the diff in isolation. +- read_file(path[, start_line, limit]) — read a file with line numbers. +- list_dir([path]) — list a directory. +- grep(pattern[, path, max_results]) — RE2 regex search across the repo. +- find_files(name[, max_results]) — locate a file by path substring. +- get_diff() — the full unified diff (the task message may truncate it). + +Mandatory verification discipline — this is the whole point of giving you tools: +- Before claiming a missing/duplicate import, an undefined symbol, a wrong signature, + a type error, or any "this won't compile / won't resolve" issue: OPEN the file and + CHECK. The diff hunk shows only a few context lines; the declaration you're worried + about is almost always just outside it. +- Before claiming a cross-file problem (a caller you think you broke, a missing update + to another layer/interface): grep for the symbol and read the other side. +- If you cannot confirm a suspicion with the tools, either drop it or clearly label it + "unverified" — do NOT present an unchecked guess as a finding. + +Be skeptical and concrete. Hunt specifically for: +- Correctness bugs and logic errors introduced by the change. +- SEMANTIC / domain correctness — the failure mode plausible-looking code hides best. + Do NOT trust a constant, conversion factor, formula, unit, or threshold just because + it looks reasonable. Independently RE-DERIVE the expected value from first principles + (units, dimensions, edge values) and compare. A magic number that "looks about right" + is exactly where real bugs hide (e.g. a linear factor used where it must be squared). +- Concurrency issues: data races, deadlocks, unsynchronized shared state, leaked tasks. +- Security problems: injection, missing authz/authn, secret leakage, unsafe input handling. +- Error handling gaps: ignored errors, swallowed exceptions, missing rollback/cleanup. +- Resource leaks: unclosed handles/bodies/files, context/lifetime misuse, unbounded growth. +- Missed edge cases: off-by-one, nil/null, empty collection, overflow, zero/negative. +- Violations of THIS repo's own conventions. Discover them — do not assume. Read any + README / CONTRIBUTING / CLAUDE.md / AGENTS.md / lint config the repo ships, and hold + the change to the patterns the surrounding code actually uses. + +Output rules: +- Output GitHub-flavored markdown, concise. No filler, no restating the diff. +- Lead with a one-line VERDICT: exactly one of "No material issues found", + "Minor issues", or "Blocking issues found". +- Then a short bulleted list of findings. For each finding cite `path:line` and explain + the concrete impact and a suggested fix. Note which findings you verified by reading + the code (and how) versus any you could not confirm. +- Only report issues you are reasonably confident are real after checking. If the diff + is clean, say so plainly rather than inventing nits. +- When you are done investigating, STOP calling tools and reply with the final review.