fix: per-lens timeout, errored-verdict honesty, accurate provider label, tighter lens focus, run timing
Build & push image / build-and-push (push) Successful in 8s
Build & push image / build-and-push (push) Successful in 8s
Five fixes, several surfaced by the live bake-off:

- PER-LENS TIMEOUT (critical): GADFLY_TIMEOUT_SECS now applies to EACH specialist (own context), not shared across the suite. A slow model (e.g. a 35B local MLX) was exhausting the whole 600s budget on lens 1, leaving the rest "step 0: context deadline exceeded". Default lowered to 300s (per-lens). cmd/gadfly/main.go.
- ERRORED VERDICT: a lens whose review pass failed no longer counts as "clean". Header shows "· ⚠️ N/M lens(es) errored" (or "Review incomplete — all lenses errored"); the section reads "⚠️ could not complete". consolidate.go.
- PROVIDER LABEL: the comment header now shows the model's ACTUAL backend from the spec ("m1pro/qwen3.6:35b-mlx" -> m1pro), not the global GADFLY_PROVIDER default (was wrongly "ollama-cloud" for local models). scripts/run.sh.
- LENS FOCUS: base prompt no longer licenses "report anything serious"; each lens stays in its lane, says "nothing in my area" rather than re-reporting another lens's bug, with a one-line "Outside my lens:" escape hatch. The re-derive-constants discipline is now lane-scoped, not "every lens". system-prompt.txt + specialists.go.
- RUN TIMING: run.sh posts a "⏳ Reviewing…" placeholder at model start and updates it with "⏱️ reviewed in 1m 23s" on finish, for per-model comparison.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -49,6 +49,7 @@ type specialistResult struct {
|
||||
spec Specialist
|
||||
out string
|
||||
verdict verdict
|
||||
errored bool // the review pass failed (timeout/model error) — not a clean result
|
||||
}
|
||||
|
||||
// worstVerdict returns the most severe verdict across results. The optional
|
||||
@@ -70,17 +71,36 @@ func worstVerdict(results []specialistResult) verdict {
|
||||
// followed by one verbatim section per specialist. run.sh wraps this with the
|
||||
// "🪰 Gadfly review — <model>" header and the advisory footer.
|
||||
func renderConsolidated(results []specialistResult) string {
|
||||
errored := 0
|
||||
for _, r := range results {
|
||||
if r.errored {
|
||||
errored++
|
||||
}
|
||||
}
|
||||
|
||||
headline := "Verdict: " + worstVerdict(results).label()
|
||||
if len(results) > 0 && errored == len(results) {
|
||||
// Every lens errored — do NOT report this as "clean".
|
||||
headline = "Review incomplete — all lenses errored"
|
||||
} else if errored > 0 {
|
||||
headline += fmt.Sprintf(" · ⚠️ %d/%d lens(es) errored", errored, len(results))
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "**Verdict: %s** — %d reviewers: %s\n",
|
||||
worstVerdict(results).label(), len(results), strings.Join(specialistNames(results), ", "))
|
||||
fmt.Fprintf(&b, "**%s** — %d reviewers: %s\n",
|
||||
headline, len(results), strings.Join(specialistNames(results), ", "))
|
||||
|
||||
for _, r := range results {
|
||||
body := strings.TrimSpace(r.out)
|
||||
if body == "" {
|
||||
body = "_(no output)_"
|
||||
}
|
||||
summary := r.verdict.label()
|
||||
if r.errored {
|
||||
summary = "⚠️ could not complete"
|
||||
}
|
||||
fmt.Fprintf(&b, "\n<details><summary><b>%s</b> — %s</summary>\n\n%s\n\n</details>\n",
|
||||
r.spec.Title, r.verdict.label(), body)
|
||||
r.spec.Title, summary, body)
|
||||
}
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
+24
-17
@@ -69,10 +69,14 @@ import (
|
||||
|
||||
const (
|
||||
defaultMaxSteps = 24
|
||||
// defaultTimeoutSecs is the overall deadline shared by ALL specialist passes
|
||||
// in one invocation; the default suite runs several lenses sequentially.
|
||||
defaultTimeoutSecs = 600
|
||||
// defaultTimeoutSecs is the deadline for EACH specialist's passes (review +
|
||||
// recheck). It is per-lens, not shared across the suite, so one slow lens
|
||||
// (e.g. a big local model) can't starve the others. Slow local models may
|
||||
// need this raised (and a higher job timeout to match the suite total).
|
||||
defaultTimeoutSecs = 300
|
||||
defaultMaxDiffChars = 60000
|
||||
// autoSelectTimeout bounds the dynamic specialist-selection call.
|
||||
autoSelectTimeout = 120 * time.Second
|
||||
// defaultWrapUpReserve is how many steps before the cap the agent is told
|
||||
// to stop investigating and write its final answer. Reserving a margin is
|
||||
// what keeps a thorough reviewer from spending its whole budget on tool
|
||||
@@ -148,17 +152,15 @@ func run() error {
|
||||
fmt.Fprintln(os.Stderr, "gadfly:", e)
|
||||
}
|
||||
|
||||
timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
// Dynamic selection: a (cheap) model picks the lenses this diff needs.
|
||||
if auto {
|
||||
selector, serr := resolveSelectorModel(mdl)
|
||||
if serr != nil {
|
||||
return fmt.Errorf("resolve selector model: %w", serr)
|
||||
}
|
||||
picked, aerr := autoSelectSpecialists(ctx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry)
|
||||
selCtx, cancel := context.WithTimeout(context.Background(), autoSelectTimeout)
|
||||
picked, aerr := autoSelectSpecialists(selCtx, selector, os.Getenv("GADFLY_TITLE"), os.Getenv("GADFLY_BODY"), diff, registry)
|
||||
cancel()
|
||||
if aerr != nil {
|
||||
fmt.Fprintln(os.Stderr, "gadfly: auto-select failed; falling back to the default suite:", aerr)
|
||||
specialists = suiteFromRegistry(registry, defaultSuite)
|
||||
@@ -176,24 +178,29 @@ func run() error {
|
||||
task := buildTask(diff)
|
||||
results := make([]specialistResult, 0, len(specialists))
|
||||
for _, sp := range specialists {
|
||||
out := reviewWithSpecialist(ctx, mdl, fsTools, base, sp, task, diff)
|
||||
results = append(results, specialistResult{spec: sp, out: out, verdict: parseVerdict(out)})
|
||||
out, errored := reviewWithSpecialist(mdl, fsTools, base, sp, task, diff)
|
||||
results = append(results, specialistResult{spec: sp, out: out, verdict: parseVerdict(out), errored: errored})
|
||||
}
|
||||
|
||||
fmt.Println(renderConsolidated(results))
|
||||
return nil
|
||||
}
|
||||
|
||||
// reviewWithSpecialist runs one lens end-to-end: a review pass under the
|
||||
// specialist's composed prompt, then the shared adversarial recheck pass. A
|
||||
// failed pass is rendered as an inline notice (advisory — one lens failing
|
||||
// never sinks the others or the job).
|
||||
func reviewWithSpecialist(ctx context.Context, mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) string {
|
||||
// reviewWithSpecialist runs one lens end-to-end under its OWN timeout, so a slow
|
||||
// model on one lens can't starve the others: a review pass under the
|
||||
// specialist's composed prompt, then the shared adversarial recheck pass. The
|
||||
// returned bool is true when the review pass failed (rendered as an inline
|
||||
// notice — advisory; one lens failing never sinks the others or the job).
|
||||
func reviewWithSpecialist(mdl llm.Model, fsTools *repoFS, base string, sp Specialist, task, diff string) (string, bool) {
|
||||
timeout := time.Duration(envInt("GADFLY_TIMEOUT_SECS", defaultTimeoutSecs)) * time.Second
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
draft, err := runAgent(ctx, mdl, fsTools, composeSpecialistPrompt(base, sp), task,
|
||||
envInt("GADFLY_MAX_STEPS", defaultMaxSteps))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "gadfly: specialist %q review pass failed: %v\n", sp.Name, err)
|
||||
return fmt.Sprintf("⚠️ This reviewer failed to complete: %v", err)
|
||||
return fmt.Sprintf("⚠️ This reviewer failed to complete: %v", err), true
|
||||
}
|
||||
|
||||
final := draft
|
||||
@@ -206,7 +213,7 @@ func reviewWithSpecialist(ctx context.Context, mdl llm.Model, fsTools *repoFS, b
|
||||
final = rechecked
|
||||
}
|
||||
}
|
||||
return final
|
||||
return final, false
|
||||
}
|
||||
|
||||
// runAgent runs one agent pass (its own fresh toolbox over the sandbox) and
|
||||
|
||||
@@ -192,8 +192,13 @@ func specialistNamesOf(specs []Specialist) []string {
|
||||
// composeSpecialistPrompt appends a specialist's lens to the base system prompt.
|
||||
func composeSpecialistPrompt(base string, sp Specialist) string {
|
||||
return strings.TrimRight(base, "\n") +
|
||||
"\n\n## Your review lens: " + sp.Title + "\n" +
|
||||
"Focus your review on this lens; other reviewers cover the rest. " + sp.Focus
|
||||
"\n\n## Your assigned lens — " + sp.Title + "\n" +
|
||||
"Review the change specifically and ONLY through this lens. Scrutinize it for:\n" +
|
||||
sp.Focus +
|
||||
"\n\nStay in this lane: other lenses (correctness, security, performance, etc.) are reviewed " +
|
||||
"separately, so don't duplicate their findings here. If nothing in your lens is materially " +
|
||||
"wrong, reply with the \"No material issues found\" verdict for this lens — do not reach for " +
|
||||
"another lens's issue just to have something to say."
|
||||
}
|
||||
|
||||
func loadFileConfig(repoDir string) (fileConfig, bool, error) {
|
||||
|
||||
Reference in New Issue
Block a user