diff --git a/cmd/gadfly/consolidate.go b/cmd/gadfly/consolidate.go index d354335..7f8fb6c 100644 --- a/cmd/gadfly/consolidate.go +++ b/cmd/gadfly/consolidate.go @@ -49,6 +49,7 @@ type specialistResult struct { spec Specialist out string verdict verdict + errored bool // the review pass failed (timeout/model error) β€” not a clean result } // worstVerdict returns the most severe verdict across results. The optional @@ -70,17 +71,36 @@ func worstVerdict(results []specialistResult) verdict { // followed by one verbatim section per specialist. run.sh wraps this with the // "πŸͺ° Gadfly review β€” " header and the advisory footer. func renderConsolidated(results []specialistResult) string { + errored := 0 + for _, r := range results { + if r.errored { + errored++ + } + } + + headline := "Verdict: " + worstVerdict(results).label() + if len(results) > 0 && errored == len(results) { + // Every lens errored β€” do NOT report this as "clean". + headline = "Review incomplete β€” all lenses errored" + } else if errored > 0 { + headline += fmt.Sprintf(" Β· ⚠️ %d/%d lens(es) errored", errored, len(results)) + } + var b strings.Builder - fmt.Fprintf(&b, "**Verdict: %s** β€” %d reviewers: %s\n", - worstVerdict(results).label(), len(results), strings.Join(specialistNames(results), ", ")) + fmt.Fprintf(&b, "**%s** β€” %d reviewers: %s\n", + headline, len(results), strings.Join(specialistNames(results), ", ")) for _, r := range results { body := strings.TrimSpace(r.out) if body == "" { body = "_(no output)_" } + summary := r.verdict.label() + if r.errored { + summary = "⚠️ could not complete" + } fmt.Fprintf(&b, "\n
%s β€” %s\n\n%s\n\n
\n", - r.spec.Title, r.verdict.label(), body) + r.spec.Title, summary, body) } return strings.TrimRight(b.String(), "\n") } diff --git a/cmd/gadfly/main.go b/cmd/gadfly/main.go index e61fb34..5aae508 100644 --- a/cmd/gadfly/main.go +++ b/cmd/gadfly/main.go @@ -69,10 +69,14 @@ import ( const ( defaultMaxSteps = 24 - // defaultTimeoutSecs is the overall deadline shared by ALL specialist passes - // in one invocation; the default suite runs several lenses sequentially. - defaultTimeoutSecs = 600 + // defaultTimeoutSecs is the deadline for EACH specialist's passes (review + + // recheck). It is per-lens, not shared across the suite, so one slow lens + // (e.g. a big local model) can't starve the others. Slow local models may + // need this raised (and a higher job timeout to match the suite total). + defaultTimeoutSecs = 300 defaultMaxDiffChars = 60000 + // autoSelectTimeout bounds the dynamic specialist-selection call. + autoSelectTimeout = 120 * time.Second // defaultWrapUpReserve is how many steps before the cap the agent is told // to stop investigating and write its final answer. Reserving a margin is // what keeps a thorough reviewer from spending its whole budget on tool @@ -148,17 +152,15 @@ func run() error { fmt.Fprintln(os.Stderr, "gadfly:", e) } - timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - // Dynamic selection: a (cheap) model picks the lenses this diff needs. if auto { selector, serr := resolveSelectorModel(mdl) if serr != nil { return fmt.Errorf("resolve selector model: %w", serr) } - picked, aerr := autoSelectSpecialists(ctx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry) + selCtx, cancel := context.WithTimeout(context.Background(), autoSelectTimeout) + picked, aerr := autoSelectSpecialists(selCtx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry) + cancel() if aerr != nil { fmt.Fprintln(os.Stderr, "gadfly: auto-select failed; falling back to the default suite:", aerr) specialists = suiteFromRegistry(registry, defaultSuite) @@ -176,24 +178,29 @@ func run() error { task := buildTask(diff) results := make([]specialistResult, 0, len(specialists)) for _, sp := range specialists { - out := reviewWithSpecialist(ctx, mdl, fsTools, base, sp, task, diff) - results = append(results, specialistResult{spec: sp, out: out, verdict: parseVerdict(out)}) + out, errored := reviewWithSpecialist(mdl, fsTools, base, sp, task, diff) + results = append(results, specialistResult{spec: sp, out: out, verdict: parseVerdict(out), errored: errored}) } fmt.Println(renderConsolidated(results)) return nil } -// reviewWithSpecialist runs one lens end-to-end: a review pass under the -// specialist's composed prompt, then the shared adversarial recheck pass. A -// failed pass is rendered as an inline notice (advisory β€” one lens failing -// never sinks the others or the job). -func reviewWithSpecialist(ctx context.Context, mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) string { +// reviewWithSpecialist runs one lens end-to-end under its OWN timeout, so a slow +// model on one lens can't starve the others: a review pass under the +// specialist's composed prompt, then the shared adversarial recheck pass. The +// returned bool is true when the review pass failed (rendered as an inline +// notice β€” advisory; one lens failing never sinks the others or the job). +func reviewWithSpecialist(mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) (string, bool) { + timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + draft, err := runAgent(ctx, mdl, fsTools, composeSpecialistPrompt(base, sp), task, envInt("GADFLY_MAX_STEPS", defaultMaxSteps)) if err != nil { fmt.Fprintf(os.Stderr, "gadfly: specialist %q review pass failed: %v\n", sp.Name, err) - return fmt.Sprintf("⚠️ This reviewer failed to complete: %v", err) + return fmt.Sprintf("⚠️ This reviewer failed to complete: %v", err), true } final := draft @@ -206,7 +213,7 @@ func reviewWithSpecialist(ctx context.Context, mdl llm.Model, fsTools *repoFS, b final = rechecked } } - return final + return final, false } // runAgent runs one agent pass (its own fresh toolbox over the sandbox) and diff --git a/cmd/gadfly/specialists.go b/cmd/gadfly/specialists.go index f7b58e4..eafa255 100644 --- a/cmd/gadfly/specialists.go +++ b/cmd/gadfly/specialists.go @@ -192,8 +192,13 @@ func specialistNamesOf(specs []Specialist) []string { // composeSpecialistPrompt appends a specialist's lens to the base system prompt. func composeSpecialistPrompt(base string, sp Specialist) string { return strings.TrimRight(base, "\n") + - "\n\n## Your review lens: " + sp.Title + "\n" + - "Focus your review on this lens; other reviewers cover the rest. " + sp.Focus + "\n\n## Your assigned lens β€” " + sp.Title + "\n" + + "Review the change specifically and ONLY through this lens. Scrutinize it for:\n" + + sp.Focus + + "\n\nStay in this lane: other lenses (correctness, security, performance, etc.) are reviewed " + + "separately, so don't duplicate their findings here. If nothing in your lens is materially " + + "wrong, reply with the \"No material issues found\" verdict for this lens β€” do not reach for " + + "another lens's issue just to have something to say." } func loadFileConfig(repoDir string) (fileConfig, bool, error) { diff --git a/scripts/run.sh b/scripts/run.sh index 63b87a0..2cbb097 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -42,6 +42,14 @@ MAX_DIFF_CHARS="${MAX_DIFF_CHARS:-60000}" MARKER="" say() { echo "[gadfly-review:${PROVIDER}:${MODEL}] $*" >&2; } +# Display the model's ACTUAL backend: the provider segment of the spec +# ("m1pro/qwen3.6:35b-mlx" -> "m1pro"); a bare id uses GADFLY_PROVIDER (default +# ollama-cloud). This is what the comment header shows, not the run.sh lane. +case "$MODEL" in + */*) MODEL_PROVIDER="${MODEL%%/*}" ;; + *) MODEL_PROVIDER="${GADFLY_PROVIDER:-ollama-cloud}" ;; +esac + # jq is required for payload building / response parsing; install if missing. if ! command -v jq >/dev/null 2>&1; then say "jq not found; attempting install" @@ -55,6 +63,32 @@ fi # binary / agy, not here.) API_TIMEOUT="--connect-timeout 20 --max-time 30" +# upsert_comment BODY β€” create or update (by MARKER) this model's single comment. +upsert_comment() { + local body="$1" post_body existing_id page=1 cmts + post_body="$(jq -n --arg b "$body" '{body:$b}')" + existing_id="" + while [ "$page" -le 10 ]; do + cmts="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_API}/issues/${PR}/comments?limit=50&page=${page}" || echo '[]')" + [ "$(echo "$cmts" | jq 'length')" = "0" ] && break + existing_id="$(echo "$cmts" | jq -r --arg m "$MARKER" \ + '.[] | select(.body != null and (.body | startswith($m))) | .id' | head -n1)" + [ -n "$existing_id" ] && break + page=$((page+1)) + done + if [ -n "$existing_id" ]; then + curl $API_TIMEOUT -sS -X PATCH -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/comments/${existing_id}" -d "$post_body" >/dev/null + else + curl $API_TIMEOUT -sS -X POST -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ + "${GITEA_API}/issues/${PR}/comments" -d "$post_body" >/dev/null + fi +} + +# fmt_duration SECONDS -> "1m 23s" / "45s" +fmt_duration() { if [ "$1" -ge 60 ]; then echo "$(($1/60))m $(($1%60))s"; else echo "$1s"; fi; } + # --- fetch PR context ------------------------------------------------------- say "fetching PR #${PR} context" DIFF="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_API}/pulls/${PR}.diff" || true)" @@ -81,6 +115,12 @@ SYS="$(cat "${SCRIPT_DIR}/system-prompt.txt")" USR="$(printf 'PR #%s: %s\n\nDescription:\n%s\n\nUnified diff to review:\n```diff\n%s\n```%s' \ "$PR" "$TITLE" "$BODY" "$DIFF" "$TRUNC_NOTE")" +# --- announce start (placeholder comment) ----------------------------------- +START_TS="$(date +%s)" +say "starting review with ${MODEL}" +upsert_comment "$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n⏳ Reviewing… this comment will update with findings and run time.' \ + "$MARKER" "$MODEL" "$MODEL_PROVIDER")" + # --- call the model --------------------------------------------------------- REVIEW="" case "$PROVIDER" in @@ -96,7 +136,7 @@ case "$PROVIDER" in if [ -n "${OLLAMA_CLOUD_API_KEY:-}" ] && [ -z "${OLLAMA_API_KEY:-}" ]; then export OLLAMA_API_KEY="$OLLAMA_CLOUD_API_KEY" fi - GADFLY_PROVIDER_EFF="${GADFLY_PROVIDER:-ollama-cloud}" + GADFLY_PROVIDER_EFF="$MODEL_PROVIDER" # Only the default cloud provider strictly needs a key up front; local Ollama # and other providers either need none or read their own standard env var. @@ -152,31 +192,10 @@ $(tail -c 1500 agy.err 2>/dev/null) say "unknown provider: ${PROVIDER}"; exit 1 ;; esac -# --- assemble comment ------------------------------------------------------- -COMMENT="$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β€” does not block merge.' \ - "$MARKER" "$MODEL" "${GADFLY_PROVIDER_EFF:-$PROVIDER}" "$REVIEW")" -POST_BODY="$(jq -n --arg b "$COMMENT" '{body:$b}')" - -# --- upsert by marker ------------------------------------------------------- -EXISTING_ID="" -page=1 -while [ "$page" -le 10 ]; do - CMTS="$(curl $API_TIMEOUT -fsS -H "Authorization: token ${GITEA_TOKEN}" \ - "${GITEA_API}/issues/${PR}/comments?limit=50&page=${page}" || echo '[]')" - [ "$(echo "$CMTS" | jq 'length')" = "0" ] && break - EXISTING_ID="$(echo "$CMTS" | jq -r --arg m "$MARKER" \ - '.[] | select(.body != null and (.body | startswith($m))) | .id' | head -n1)" - [ -n "$EXISTING_ID" ] && break - page=$((page+1)) -done - -if [ -n "$EXISTING_ID" ]; then - say "updating existing comment ${EXISTING_ID}" - curl $API_TIMEOUT -sS -X PATCH -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ - "${GITEA_API}/issues/comments/${EXISTING_ID}" -d "$POST_BODY" >/dev/null -else - say "creating new comment" - curl $API_TIMEOUT -sS -X POST -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ - "${GITEA_API}/issues/${PR}/comments" -d "$POST_BODY" >/dev/null -fi -say "done" +# --- assemble + post final comment (with run time) -------------------------- +ELAPSED="$(( $(date +%s) - START_TS ))" +DUR="$(fmt_duration "$ELAPSED")" +COMMENT="$(printf '%s\n### πŸͺ° Gadfly review β€” `%s` (%s)\n\n%s\n\nAutomated adversarial review by Gadfly. Advisory only β€” does not block merge. Β· ⏱️ reviewed in %s' \ + "$MARKER" "$MODEL" "$MODEL_PROVIDER" "$REVIEW" "$DUR")" +upsert_comment "$COMMENT" +say "done in ${DUR}" diff --git a/scripts/system-prompt.txt b/scripts/system-prompt.txt index 169af2f..28960c8 100644 --- a/scripts/system-prompt.txt +++ b/scripts/system-prompt.txt @@ -1,9 +1,14 @@ You are Gadfly, an ADVERSARIAL code reviewer. Your job is to find real problems in the pull request below β€” not to praise it. A gadfly does not let things slide. -You review through ONE assigned lens (given at the end of this prompt). Stay in your lane β€” -other reviewers cover the other angles β€” but do report anything clearly serious you happen -to notice. +You review through ONE assigned lens (named at the end of this prompt). Evaluate the change +THROUGH THAT LENS β€” that is your job. A separate reviewer independently covers each other +angle, so problems outside your lens WILL be caught without you. Do not restate a finding that +plainly belongs to another lens just to have something to report β€” that only creates noise. If +your lens turns up nothing material, say so plainly; an honest "nothing in my area" beats +re-reporting the obvious bug every other lens already sees. Only exception: if you spot a +SEVERE issue clearly outside your lens, you may add ONE line prefixed "Outside my lens:" β€” but +your actual findings must stay within your lens. You are AGENTIC: you have read-only tools over the repository AT THIS PR's checked-out state. USE THEM to verify before you report. Do not review the diff in isolation. @@ -23,10 +28,10 @@ Mandatory verification discipline β€” this is the whole point of giving you tool - If you cannot confirm a suspicion with the tools, either drop it or clearly label it "unverified" β€” do NOT present an unchecked guess as a finding. -Be skeptical and concrete, and apply your assigned lens rigorously. A recurring, high-value -discipline regardless of lens: do NOT trust a constant, conversion factor, formula, unit, or -threshold just because it looks reasonable β€” RE-DERIVE the expected value from first principles -and compare. Plausible-looking magic numbers are where real bugs hide. +Be skeptical and concrete, and apply your assigned lens rigorously. (If your lens leads you to +a constant, conversion factor, formula, unit, or threshold, don't trust it because it looks +reasonable β€” re-derive the expected value from first principles and compare; plausible-looking +magic numbers hide real bugs. Pursue this when it's in your lane, not as a reason to leave it.) Output rules: - Output GitHub-flavored markdown, concise. No filler, no restating the diff.