From 66682aa1d5ccc3e6d8537899896f7e9c6027cd59 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sun, 28 Jun 2026 18:22:43 -0400 Subject: [PATCH] fix: address swarm review of the structured-findings contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From PR #16's own adversarial review (graded in gadfly-reports): - Empty/empty-result fallback: emit now scrapes the prose when the structured block is absent, unterminated, malformed, OR parses to zero usable findings (e.g. an empty [] emitted alongside real prose) — the safety net is no longer defeated by a contradictory empty block. (extractStructuredFindingsOrScrape) - Unterminated fence no longer swallows the comment: extract + strip share findingsSpan, which only recognizes a TERMINATED block; an unterminated fence is left in place. - confidence is now consumed: normalized (normalizeConfidence) and emitted to the telemetry store (reportPayload.confidence), not write-only. - Canonical raw_severity everywhere: heuristic findings derive a canonical word from the verdict (verdict.severity) instead of the full verdict phrase. - normalizeSeverity table made consistent ("minor"/"low" -> small). - Exact info-string match (isFindingsOpen) instead of substring; shared fenceInfo predicate removes the duplicated open/close fence logic. - Dedupe structured findings by file:line; never emit an empty title (fall back to detail, then the location); validate the parsed line number. - Skip the prose scan when findings already carry detail; refresh stale emit.go doc comments. Tests added for the empty-[] fallback, unterminated-fence strip, confidence normalization, and verdict.severity. All green, gofmt clean, vet quiet. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/gadfly/consolidate.go | 16 +++ cmd/gadfly/emit.go | 31 +++--- cmd/gadfly/emit_test.go | 9 +- cmd/gadfly/findings.go | 187 +++++++++++++++++++++++------------- cmd/gadfly/findings_test.go | 61 +++++++++--- 5 files changed, 210 insertions(+), 94 deletions(-) diff --git a/cmd/gadfly/consolidate.go b/cmd/gadfly/consolidate.go index cf254c8..d51d308 100644 --- a/cmd/gadfly/consolidate.go +++ b/cmd/gadfly/consolidate.go @@ -28,6 +28,22 @@ func (v verdict) label() string { } } +// severity maps a verdict to a canonical severity word (the same vocabulary as +// the structured findings: critical/high/medium/small/trivial). Used as the +// raw_severity for heuristic-scraped findings, which carry no per-finding +// severity of their own — so the telemetry store sees one consistent vocabulary +// instead of a mix of canonical words and full verdict phrases. +func (v verdict) severity() string { + switch v { + case verdictBlocking: + return "high" + case verdictMinor: + return "small" + default: + return "trivial" + } +} + // parseVerdict extracts a specialist's self-reported verdict from its output. // The base prompt tells each lens to lead with one of the three phrases. func parseVerdict(out string) verdict { diff --git a/cmd/gadfly/emit.go b/cmd/gadfly/emit.go index 5a43ca4..fbac59d 100644 --- a/cmd/gadfly/emit.go +++ b/cmd/gadfly/emit.go @@ -10,11 +10,14 @@ package main // the review markdown consumed by run.sh) and never changes the exit code. // - It depends only on the Go stdlib (net/http). // -// Findings are extracted heuristically from each lens's markdown: a "path:line" -// reference (e.g. run/executor.go:166) anchors a finding, whose title is the -// nearest preceding markdown heading / numbered item / bold lead-in (else the -// first sentence of the finding's paragraph). This is best-effort signal for the -// store to aggregate — not a structured contract the reviewer guarantees. +// Findings come PRIMARILY from each lens's machine-readable ```gadfly-findings +// block (findings.go: exact file/line + per-finding severity/confidence). When a +// model emits no parseable block, emit falls back to a heuristic prose scrape: a +// "path:line" reference (e.g. run/executor.go:166) anchors a finding, whose title +// is the nearest preceding markdown heading / numbered item / bold lead-in (else +// the first sentence of the finding's paragraph). The scrape is best-effort +// signal for the store to aggregate, not a structured contract the reviewer +// guarantees. import ( "bytes" @@ -91,6 +94,7 @@ type reportPayload struct { Provider string `json:"provider"` RunID string `json:"run_id"` RawSeverity string `json:"raw_severity"` + Confidence string `json:"confidence"` // per-finding high/medium/low ("" if heuristic) Detail string `json:"detail"` } @@ -135,17 +139,17 @@ func emit(results []specialistResult, elapsed time.Duration) { continue } // Prefer the model's machine-readable ```gadfly-findings block (exact - // location + per-finding severity); fall back to the prose scrape when a - // model didn't emit a parseable one. - fs, structured := extractStructuredFindings(r.out) - if !structured { - fs = parseFindings(r.spec, r.out) - } - lensSev := r.verdict.label() + // location + per-finding severity). Fall back to the prose scrape when it + // yields NOTHING — no block, an unterminated/malformed one, or a block that + // parsed to zero usable findings on a non-clean lens (e.g. an empty [] + // alongside prose findings). Falling back on empty, not just on absent, + // keeps the safety net from being defeated by a contradictory empty block. + fs := extractStructuredFindingsOrScrape(r) + lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity) for _, f := range fs { sev := f.severity if sev == "" { - sev = lensSev // heuristic finding: best signal is the lens verdict + sev = lensSev } reports = append(reports, reportPayload{ Repo: repo, @@ -158,6 +162,7 @@ func emit(results []specialistResult, elapsed time.Duration) { Provider: provider, RunID: runID, RawSeverity: sev, + Confidence: f.confidence, Detail: f.detail, }) } diff --git a/cmd/gadfly/emit_test.go b/cmd/gadfly/emit_test.go index fe1b29b..aab5b4b 100644 --- a/cmd/gadfly/emit_test.go +++ b/cmd/gadfly/emit_test.go @@ -157,7 +157,7 @@ func TestEmit_PostsRunsAndReports(t *testing.T) { if len(reportBody) != 2 { t.Fatalf("/reports array length = %d, want 2", len(reportBody)) } - for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "detail"} { + for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "confidence", "detail"} { if _, ok := reportBody[0][k]; !ok { t.Errorf("/reports[0] missing field %q (got keys %v)", k, keysOf(reportBody[0])) } @@ -171,8 +171,11 @@ func TestEmit_PostsRunsAndReports(t *testing.T) { if reportBody[0]["line"] != float64(166) { t.Errorf("reports[0].line = %v, want 166", reportBody[0]["line"]) } - if reportBody[0]["raw_severity"] != "Blocking issues found" { - t.Errorf("reports[0].raw_severity = %v, want 'Blocking issues found'", reportBody[0]["raw_severity"]) + // No structured block in sampleLensMarkdown, so this is a heuristic-scraped + // finding: raw_severity is the canonical word derived from the lens verdict + // (Blocking -> "high"), not the full verdict phrase. + if reportBody[0]["raw_severity"] != "high" { + t.Errorf("reports[0].raw_severity = %v, want 'high'", reportBody[0]["raw_severity"]) } if reportBody[0]["run_id"] != "owner/repo#7:ollama-cloud/qwen3" { t.Errorf("reports[0].run_id = %v, want owner/repo#7:ollama-cloud/qwen3", reportBody[0]["run_id"]) diff --git a/cmd/gadfly/findings.go b/cmd/gadfly/findings.go index 4e3ba9f..d0a3be7 100644 --- a/cmd/gadfly/findings.go +++ b/cmd/gadfly/findings.go @@ -7,10 +7,10 @@ package main // reliable than scraping prose with a path:line regex (emit.go's heuristic), // and it carries PER-FINDING severity + confidence the prose verdict can't. // -// Everything here degrades gracefully: a missing or malformed block makes -// extractStructuredFindings return ok=false, and the caller falls back to the -// heuristic scrape — so a weak model that ignores the contract still contributes -// findings, exactly as before. +// Everything here degrades gracefully: a missing, unterminated, or malformed +// block makes extractStructuredFindings return ok=false (and yield no findings), +// so the caller falls back to the heuristic scrape — a weak model that ignores +// the contract still contributes findings, exactly as before. import ( "encoding/json" @@ -26,49 +26,68 @@ type structuredFinding struct { Severity string `json:"severity"` Confidence string `json:"confidence"` Title string `json:"title"` - Detail string `json:"detail"` // optional; prose paragraph is used when absent + Detail string `json:"detail"` // optional; the prose paragraph is used when absent } // findingsFence is the info-string that tags the machine-readable block. const findingsFence = "gadfly-findings" // extractStructuredFindings parses the ```gadfly-findings JSON block out of a -// lens's markdown. It returns the findings and ok=true when a block is present -// AND parses as a JSON array (an empty array is a valid "nothing found" — ok is -// still true so the caller does NOT fall back to the regex scrape). A missing or -// unparseable block returns ok=false, signalling the caller to use the heuristic. +// lens's markdown. It returns the findings and ok=true when a TERMINATED block is +// present AND parses as a JSON array (an empty array is valid "nothing found" — +// ok is still true). A missing, unterminated, or unparseable block returns +// ok=false so the caller falls back to the heuristic scrape. // -// detail for each finding is filled from the JSON when present, else from the -// prose paragraph that references the same file:line (best of both: exact -// location + rich human context). Findings with no usable file are dropped. +// Findings are deduped by file:line (keeping the first, matching parseFindings), +// findings with no usable file are dropped, and each title/detail is backfilled +// from the prose when the JSON omits it (best of both: exact location + the human +// context the model already wrote). func extractStructuredFindings(out string) ([]finding, bool) { - body, ok := findingsBlock(out) + lines := strings.Split(out, "\n") + start, end, ok := findingsSpan(lines) if !ok { return nil, false } var raw []structuredFinding - if err := json.Unmarshal([]byte(body), &raw); err != nil { + if err := json.Unmarshal([]byte(strings.Join(lines[start+1:end], "\n")), &raw); err != nil { return nil, false } - prose := proseParagraphs(out) findings := make([]finding, 0, len(raw)) + seen := map[string]bool{} + var prose map[string]string // built lazily — only if a finding lacks its own detail for _, sf := range raw { file := strings.TrimSpace(sf.File) if file == "" { continue } ln := 0 - if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil && n > 0 { ln = n } + key := file + ":" + strconv.Itoa(ln) + if ln > 0 { // only dedupe concrete locations; unknown-line findings are kept + if seen[key] { + continue + } + seen[key] = true + } detail := strings.TrimSpace(sf.Detail) - if detail == "" { - detail = prose[file+":"+strconv.Itoa(ln)] + if detail == "" && ln > 0 { + if prose == nil { + prose = proseParagraphs(out) + } + detail = prose[key] } title := strings.TrimSpace(sf.Title) - if title == "" { - title = truncate(detail, 120) + if title == "" { // never empty: fall back to the prose detail, then the location + if detail != "" { + title = truncate(detail, 120) + } else if ln > 0 { + title = key + } else { + title = file + } } findings = append(findings, finding{ file: file, @@ -76,7 +95,7 @@ func extractStructuredFindings(out string) ([]finding, bool) { title: title, detail: truncate(detail, 500), severity: normalizeSeverity(sf.Severity), - confidence: strings.ToLower(strings.TrimSpace(sf.Confidence)), + confidence: normalizeConfidence(sf.Confidence), }) if len(findings) >= maxFindingsPerLens { break @@ -85,58 +104,81 @@ func extractStructuredFindings(out string) ([]finding, bool) { return findings, true } -// findingsBlock returns the inner JSON text of the first ```gadfly-findings fenced -// block in out (ok=false if there is none). It scans line-by-line so it tolerates -// 3+ backticks and trailing whitespace on the fence lines. -func findingsBlock(out string) (string, bool) { +// extractStructuredFindingsOrScrape returns a lens's findings, preferring the +// structured ```gadfly-findings block and falling back to the heuristic prose +// scrape when the block is absent, unterminated/malformed, OR parsed to zero +// usable findings (e.g. an empty [] emitted alongside real prose findings). +// Factored out of emit() so the fallback rule is unit-testable. +func extractStructuredFindingsOrScrape(r specialistResult) []finding { + if fs, _ := extractStructuredFindings(r.out); len(fs) > 0 { + return fs + } + return parseFindings(r.spec, r.out) +} + +// stripFindingsBlock removes every TERMINATED ```gadfly-findings block from out so +// the machine-readable JSON never shows in the rendered comment. An UNTERMINATED +// fence is left in place — treating it as a block would swallow the rest of the +// comment (e.g. when a model's output was truncated mid-block). Trailing +// whitespace is trimmed. +func stripFindingsBlock(out string) string { lines := strings.Split(out, "\n") - start := -1 + for { + start, end, ok := findingsSpan(lines) + if !ok { + break + } + lines = append(lines[:start:start], lines[end+1:]...) + } + return strings.TrimRight(strings.Join(lines, "\n"), "\n") +} + +// findingsSpan returns the [start,end] inclusive line indices of the first +// TERMINATED ```gadfly-findings block in lines (start = the opening fence, end = +// the closing fence), or ok=false when there is none or it is unterminated. +// extract and strip share it so they always agree on what is (and isn't) a block. +func findingsSpan(lines []string) (start, end int, ok bool) { + start = -1 for i, ln := range lines { - t := strings.TrimSpace(ln) - if strings.HasPrefix(t, "```") && strings.Contains(t, findingsFence) { + if isFindingsOpen(ln) { start = i break } } if start < 0 { - return "", false + return 0, 0, false } for j := start + 1; j < len(lines); j++ { if isFenceClose(lines[j]) { - return strings.Join(lines[start+1:j], "\n"), true + return start, j, true } } - return "", false // unterminated block + return 0, 0, false // unterminated } -// stripFindingsBlock removes every ```gadfly-findings fenced block from out so the -// machine-readable JSON never shows in the rendered comment. Other content is -// preserved verbatim. -func stripFindingsBlock(out string) string { - lines := strings.Split(out, "\n") - var kept []string - skipping := false - for _, ln := range lines { - t := strings.TrimSpace(ln) - switch { - case skipping: - if isFenceClose(ln) { - skipping = false - } - case strings.HasPrefix(t, "```") && strings.Contains(t, findingsFence): - skipping = true - default: - kept = append(kept, ln) - } - } - return strings.TrimRight(strings.Join(kept, "\n"), "\n") -} - -// isFenceClose reports whether a line is a bare code-fence close (``` plus only -// whitespace), i.e. not an opening fence with an info string. -func isFenceClose(line string) bool { +// fenceInfo returns the info-string (text after the backticks) of a code-fence +// line and whether the line opens/closes a fence at all. A bare ``` yields ("", +// true); ```gadfly-findings yields ("gadfly-findings", true). +func fenceInfo(line string) (string, bool) { t := strings.TrimSpace(line) - return strings.HasPrefix(t, "```") && strings.TrimLeft(t, "`") == "" + if !strings.HasPrefix(t, "```") { + return "", false + } + return strings.TrimSpace(strings.TrimLeft(t, "`")), true +} + +// isFindingsOpen reports whether line opens a ```gadfly-findings block, matching +// the info-string EXACTLY (not as a substring) so a fence like ```not-findings +// can't masquerade as ours. +func isFindingsOpen(line string) bool { + info, ok := fenceInfo(line) + return ok && info == findingsFence +} + +// isFenceClose reports whether line is a bare closing fence (``` with no info). +func isFenceClose(line string) bool { + info, ok := fenceInfo(line) + return ok && info == "" } // proseParagraphs maps "file:line" -> the prose paragraph that first references @@ -149,7 +191,7 @@ func proseParagraphs(out string) map[string]string { m := map[string]string{} for _, loc := range pathLineRe.FindAllStringSubmatchIndex(prose, -1) { key := prose[loc[2]:loc[3]] + ":" + prose[loc[4]:loc[5]] - if _, seen := m[key]; seen { + if _, dup := m[key]; dup { continue } li := strings.Count(prose[:loc[0]], "\n") @@ -162,18 +204,33 @@ func proseParagraphs(out string) map[string]string { // (critical/high/medium/small/trivial), accepting common synonyms. An // unrecognized value is returned lowercased so the store still sees the raw word. func normalizeSeverity(s string) string { - switch strings.ToLower(strings.TrimSpace(s)) { + switch t := strings.ToLower(strings.TrimSpace(s)); t { case "critical", "crit", "blocker", "blocking": return "critical" - case "high", "major": + case "high", "major", "severe": return "high" - case "medium", "moderate", "minor": + case "medium", "moderate": return "medium" - case "small", "low", "minor-issue": + case "small", "low", "minor": return "small" - case "trivial", "nit", "nitpick", "info", "informational", "style": + case "trivial", "nit", "nitpick", "info", "informational", "style", "cosmetic": return "trivial" default: - return strings.ToLower(strings.TrimSpace(s)) + return t + } +} + +// normalizeConfidence maps a model's confidence word onto high/medium/low, +// accepting common synonyms; an unrecognized value is returned lowercased. +func normalizeConfidence(s string) string { + switch t := strings.ToLower(strings.TrimSpace(s)); t { + case "high", "certain", "confirmed", "verified": + return "high" + case "medium", "med", "moderate": + return "medium" + case "low", "unsure", "tentative", "unverified", "speculative": + return "low" + default: + return t } } diff --git a/cmd/gadfly/findings_test.go b/cmd/gadfly/findings_test.go index ac03068..c4b31f6 100644 --- a/cmd/gadfly/findings_test.go +++ b/cmd/gadfly/findings_test.go @@ -1,6 +1,9 @@ package main -import "testing" +import ( + "strings" + "testing" +) const sampleReview = "**Blocking issues found**\n\n" + "- **Unauthenticated endpoint** — `model.go:184` leaks PR content to a third party.\n" + @@ -67,15 +70,24 @@ func TestExtractStructuredFindingsStringLine(t *testing.T) { func TestStripFindingsBlock(t *testing.T) { stripped := stripFindingsBlock(sampleReview) - if got := stripped; containsFence(got) { - t.Errorf("block not stripped: %q", got) + if strings.Contains(stripped, findingsFence) { + t.Errorf("block not stripped: %q", stripped) } // The prose findings must survive. - if !contains(stripped, "Unauthenticated endpoint") || !contains(stripped, "util.go:12") { + if !strings.Contains(stripped, "Unauthenticated endpoint") || !strings.Contains(stripped, "util.go:12") { t.Errorf("prose lost during strip: %q", stripped) } } +func TestStripFindingsBlockUnterminated(t *testing.T) { + // A truncated, unterminated block must NOT swallow the prose before it. + out := "Minor issues\n\n- real finding at `x.go:1`\n\n```gadfly-findings\n[{\"file\":\"x.go\"" + got := stripFindingsBlock(out) + if !strings.Contains(got, "real finding at `x.go:1`") { + t.Errorf("unterminated block swallowed the prose: %q", got) + } +} + func TestStripFindingsBlockNoBlock(t *testing.T) { in := "Minor issues\n\n- finding at `x.go:9`" if out := stripFindingsBlock(in); out != in { @@ -87,8 +99,8 @@ func TestNormalizeSeverity(t *testing.T) { cases := map[string]string{ "Critical": "critical", "blocker": "critical", "major": "high", "HIGH": "high", - "moderate": "medium", "minor": "medium", - "low": "small", + "moderate": "medium", + "minor": "small", "low": "small", // "minor" and "low" both map to small (consistently) "nit": "trivial", "Style": "trivial", "weird": "weird", // unknown passes through, lowercased } @@ -99,13 +111,36 @@ func TestNormalizeSeverity(t *testing.T) { } } -func containsFence(s string) bool { return contains(s, findingsFence) } - -func contains(s, sub string) bool { - for i := 0; i+len(sub) <= len(s); i++ { - if s[i:i+len(sub)] == sub { - return true +func TestNormalizeConfidence(t *testing.T) { + cases := map[string]string{ + "High": "high", "confirmed": "high", + "MEDIUM": "medium", "moderate": "medium", + "low": "low", "unverified": "low", + "hunch": "hunch", // unknown passes through, lowercased + } + for in, want := range cases { + if got := normalizeConfidence(in); got != want { + t.Errorf("normalizeConfidence(%q) = %q, want %q", in, got, want) } } - return false +} + +func TestExtractStructuredFindingsFallbackOnEmpty(t *testing.T) { + // A non-clean lens that emitted an empty [] but listed prose findings must + // fall through to the heuristic scrape, not silently drop everything. + out := "Minor issues\n\n- bug at `pkg/a.go:7`\n\n```gadfly-findings\n[]\n```\n" + r := specialistResult{spec: Specialist{Name: "correctness"}, out: out, verdict: verdictMinor} + fs := extractStructuredFindingsOrScrape(r) + if len(fs) == 0 { + t.Fatal("empty [] must fall back to the heuristic scrape, got no findings") + } + if fs[0].file != "pkg/a.go" || fs[0].line != 7 { + t.Errorf("heuristic fallback wrong: %+v", fs[0]) + } +} + +func TestVerdictSeverity(t *testing.T) { + if verdictBlocking.severity() != "high" || verdictMinor.severity() != "small" || verdictUnknown.severity() != "trivial" { + t.Error("verdict.severity mapping changed unexpectedly") + } }