fix: address swarm review of the structured-findings contract
Build & push image / build-and-push (pull_request) Successful in 5s

From PR #16's own adversarial review (graded in gadfly-reports):

- Empty/empty-result fallback: emit now scrapes the prose when the structured
  block is absent, unterminated, malformed, OR parses to zero usable findings
  (e.g. an empty [] emitted alongside real prose) — the safety net is no longer
  defeated by a contradictory empty block. (extractStructuredFindingsOrScrape)
- Unterminated fence no longer swallows the comment: extract + strip share
  findingsSpan, which only recognizes a TERMINATED block; an unterminated fence
  is left in place.
- confidence is now consumed: normalized (normalizeConfidence) and emitted to
  the telemetry store (reportPayload.confidence), not write-only.
- Canonical raw_severity everywhere: heuristic findings derive a canonical word
  from the verdict (verdict.severity) instead of the full verdict phrase.
- normalizeSeverity table made consistent ("minor"/"low" -> small).
- Exact info-string match (isFindingsOpen) instead of substring; shared
  fenceInfo predicate removes the duplicated open/close fence logic.
- Dedupe structured findings by file:line; never emit an empty title (fall back
  to detail, then the location); validate the parsed line number.
- Skip the prose scan when findings already carry detail; refresh stale
  emit.go doc comments.

Tests added for the empty-[] fallback, unterminated-fence strip, confidence
normalization, and verdict.severity. All green, gofmt clean, vet quiet.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-28 18:22:43 -04:00
parent 9bcd7b9696
commit 66682aa1d5
5 changed files with 210 additions and 94 deletions
+16
View File
@@ -28,6 +28,22 @@ func (v verdict) label() string {
}
}
// severity maps a verdict to a canonical severity word (the same vocabulary as
// the structured findings: critical/high/medium/small/trivial). Used as the
// raw_severity for heuristic-scraped findings, which carry no per-finding
// severity of their own — so the telemetry store sees one consistent vocabulary
// instead of a mix of canonical words and full verdict phrases.
func (v verdict) severity() string {
switch v {
case verdictBlocking:
return "high"
case verdictMinor:
return "small"
default:
return "trivial"
}
}
// parseVerdict extracts a specialist's self-reported verdict from its output.
// The base prompt tells each lens to lead with one of the three phrases.
func parseVerdict(out string) verdict {
+18 -13
View File
@@ -10,11 +10,14 @@ package main
// the review markdown consumed by run.sh) and never changes the exit code.
// - It depends only on the Go stdlib (net/http).
//
// Findings are extracted heuristically from each lens's markdown: a "path:line"
// reference (e.g. run/executor.go:166) anchors a finding, whose title is the
// nearest preceding markdown heading / numbered item / bold lead-in (else the
// first sentence of the finding's paragraph). This is best-effort signal for the
// store to aggregate — not a structured contract the reviewer guarantees.
// Findings come PRIMARILY from each lens's machine-readable ```gadfly-findings
// block (findings.go: exact file/line + per-finding severity/confidence). When a
// model emits no parseable block, emit falls back to a heuristic prose scrape: a
// "path:line" reference (e.g. run/executor.go:166) anchors a finding, whose title
// is the nearest preceding markdown heading / numbered item / bold lead-in (else
// the first sentence of the finding's paragraph). The scrape is best-effort
// signal for the store to aggregate, not a structured contract the reviewer
// guarantees.
import (
"bytes"
@@ -91,6 +94,7 @@ type reportPayload struct {
Provider string `json:"provider"`
RunID string `json:"run_id"`
RawSeverity string `json:"raw_severity"`
Confidence string `json:"confidence"` // per-finding high/medium/low ("" if heuristic)
Detail string `json:"detail"`
}
@@ -135,17 +139,17 @@ func emit(results []specialistResult, elapsed time.Duration) {
continue
}
// Prefer the model's machine-readable ```gadfly-findings block (exact
// location + per-finding severity); fall back to the prose scrape when a
// model didn't emit a parseable one.
fs, structured := extractStructuredFindings(r.out)
if !structured {
fs = parseFindings(r.spec, r.out)
}
lensSev := r.verdict.label()
// location + per-finding severity). Fall back to the prose scrape when it
// yields NOTHING — no block, an unterminated/malformed one, or a block that
// parsed to zero usable findings on a non-clean lens (e.g. an empty []
// alongside prose findings). Falling back on empty, not just on absent,
// keeps the safety net from being defeated by a contradictory empty block.
fs := extractStructuredFindingsOrScrape(r)
lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity)
for _, f := range fs {
sev := f.severity
if sev == "" {
sev = lensSev // heuristic finding: best signal is the lens verdict
sev = lensSev
}
reports = append(reports, reportPayload{
Repo: repo,
@@ -158,6 +162,7 @@ func emit(results []specialistResult, elapsed time.Duration) {
Provider: provider,
RunID: runID,
RawSeverity: sev,
Confidence: f.confidence,
Detail: f.detail,
})
}
+6 -3
View File
@@ -157,7 +157,7 @@ func TestEmit_PostsRunsAndReports(t *testing.T) {
if len(reportBody) != 2 {
t.Fatalf("/reports array length = %d, want 2", len(reportBody))
}
for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "detail"} {
for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "confidence", "detail"} {
if _, ok := reportBody[0][k]; !ok {
t.Errorf("/reports[0] missing field %q (got keys %v)", k, keysOf(reportBody[0]))
}
@@ -171,8 +171,11 @@ func TestEmit_PostsRunsAndReports(t *testing.T) {
if reportBody[0]["line"] != float64(166) {
t.Errorf("reports[0].line = %v, want 166", reportBody[0]["line"])
}
if reportBody[0]["raw_severity"] != "Blocking issues found" {
t.Errorf("reports[0].raw_severity = %v, want 'Blocking issues found'", reportBody[0]["raw_severity"])
// No structured block in sampleLensMarkdown, so this is a heuristic-scraped
// finding: raw_severity is the canonical word derived from the lens verdict
// (Blocking -> "high"), not the full verdict phrase.
if reportBody[0]["raw_severity"] != "high" {
t.Errorf("reports[0].raw_severity = %v, want 'high'", reportBody[0]["raw_severity"])
}
if reportBody[0]["run_id"] != "owner/repo#7:ollama-cloud/qwen3" {
t.Errorf("reports[0].run_id = %v, want owner/repo#7:ollama-cloud/qwen3", reportBody[0]["run_id"])
+122 -65
View File
@@ -7,10 +7,10 @@ package main
// reliable than scraping prose with a path:line regex (emit.go's heuristic),
// and it carries PER-FINDING severity + confidence the prose verdict can't.
//
// Everything here degrades gracefully: a missing or malformed block makes
// extractStructuredFindings return ok=false, and the caller falls back to the
// heuristic scrape — so a weak model that ignores the contract still contributes
// findings, exactly as before.
// Everything here degrades gracefully: a missing, unterminated, or malformed
// block makes extractStructuredFindings return ok=false (and yield no findings),
// so the caller falls back to the heuristic scrape — a weak model that ignores
// the contract still contributes findings, exactly as before.
import (
"encoding/json"
@@ -26,49 +26,68 @@ type structuredFinding struct {
Severity string `json:"severity"`
Confidence string `json:"confidence"`
Title string `json:"title"`
Detail string `json:"detail"` // optional; prose paragraph is used when absent
Detail string `json:"detail"` // optional; the prose paragraph is used when absent
}
// findingsFence is the info-string that tags the machine-readable block.
const findingsFence = "gadfly-findings"
// extractStructuredFindings parses the ```gadfly-findings JSON block out of a
// lens's markdown. It returns the findings and ok=true when a block is present
// AND parses as a JSON array (an empty array is a valid "nothing found" — ok is
// still true so the caller does NOT fall back to the regex scrape). A missing or
// unparseable block returns ok=false, signalling the caller to use the heuristic.
// lens's markdown. It returns the findings and ok=true when a TERMINATED block is
// present AND parses as a JSON array (an empty array is valid "nothing found" —
// ok is still true). A missing, unterminated, or unparseable block returns
// ok=false so the caller falls back to the heuristic scrape.
//
// detail for each finding is filled from the JSON when present, else from the
// prose paragraph that references the same file:line (best of both: exact
// location + rich human context). Findings with no usable file are dropped.
// Findings are deduped by file:line (keeping the first, matching parseFindings),
// findings with no usable file are dropped, and each title/detail is backfilled
// from the prose when the JSON omits it (best of both: exact location + the human
// context the model already wrote).
func extractStructuredFindings(out string) ([]finding, bool) {
body, ok := findingsBlock(out)
lines := strings.Split(out, "\n")
start, end, ok := findingsSpan(lines)
if !ok {
return nil, false
}
var raw []structuredFinding
if err := json.Unmarshal([]byte(body), &raw); err != nil {
if err := json.Unmarshal([]byte(strings.Join(lines[start+1:end], "\n")), &raw); err != nil {
return nil, false
}
prose := proseParagraphs(out)
findings := make([]finding, 0, len(raw))
seen := map[string]bool{}
var prose map[string]string // built lazily — only if a finding lacks its own detail
for _, sf := range raw {
file := strings.TrimSpace(sf.File)
if file == "" {
continue
}
ln := 0
if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil {
if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil && n > 0 {
ln = n
}
key := file + ":" + strconv.Itoa(ln)
if ln > 0 { // only dedupe concrete locations; unknown-line findings are kept
if seen[key] {
continue
}
seen[key] = true
}
detail := strings.TrimSpace(sf.Detail)
if detail == "" {
detail = prose[file+":"+strconv.Itoa(ln)]
if detail == "" && ln > 0 {
if prose == nil {
prose = proseParagraphs(out)
}
detail = prose[key]
}
title := strings.TrimSpace(sf.Title)
if title == "" {
title = truncate(detail, 120)
if title == "" { // never empty: fall back to the prose detail, then the location
if detail != "" {
title = truncate(detail, 120)
} else if ln > 0 {
title = key
} else {
title = file
}
}
findings = append(findings, finding{
file: file,
@@ -76,7 +95,7 @@ func extractStructuredFindings(out string) ([]finding, bool) {
title: title,
detail: truncate(detail, 500),
severity: normalizeSeverity(sf.Severity),
confidence: strings.ToLower(strings.TrimSpace(sf.Confidence)),
confidence: normalizeConfidence(sf.Confidence),
})
if len(findings) >= maxFindingsPerLens {
break
@@ -85,58 +104,81 @@ func extractStructuredFindings(out string) ([]finding, bool) {
return findings, true
}
// findingsBlock returns the inner JSON text of the first ```gadfly-findings fenced
// block in out (ok=false if there is none). It scans line-by-line so it tolerates
// 3+ backticks and trailing whitespace on the fence lines.
func findingsBlock(out string) (string, bool) {
// extractStructuredFindingsOrScrape returns a lens's findings, preferring the
// structured ```gadfly-findings block and falling back to the heuristic prose
// scrape when the block is absent, unterminated/malformed, OR parsed to zero
// usable findings (e.g. an empty [] emitted alongside real prose findings).
// Factored out of emit() so the fallback rule is unit-testable.
func extractStructuredFindingsOrScrape(r specialistResult) []finding {
if fs, _ := extractStructuredFindings(r.out); len(fs) > 0 {
return fs
}
return parseFindings(r.spec, r.out)
}
// stripFindingsBlock removes every TERMINATED ```gadfly-findings block from out so
// the machine-readable JSON never shows in the rendered comment. An UNTERMINATED
// fence is left in place — treating it as a block would swallow the rest of the
// comment (e.g. when a model's output was truncated mid-block). Trailing
// whitespace is trimmed.
func stripFindingsBlock(out string) string {
lines := strings.Split(out, "\n")
start := -1
for {
start, end, ok := findingsSpan(lines)
if !ok {
break
}
lines = append(lines[:start:start], lines[end+1:]...)
}
return strings.TrimRight(strings.Join(lines, "\n"), "\n")
}
// findingsSpan returns the [start,end] inclusive line indices of the first
// TERMINATED ```gadfly-findings block in lines (start = the opening fence, end =
// the closing fence), or ok=false when there is none or it is unterminated.
// extract and strip share it so they always agree on what is (and isn't) a block.
func findingsSpan(lines []string) (start, end int, ok bool) {
start = -1
for i, ln := range lines {
t := strings.TrimSpace(ln)
if strings.HasPrefix(t, "```") && strings.Contains(t, findingsFence) {
if isFindingsOpen(ln) {
start = i
break
}
}
if start < 0 {
return "", false
return 0, 0, false
}
for j := start + 1; j < len(lines); j++ {
if isFenceClose(lines[j]) {
return strings.Join(lines[start+1:j], "\n"), true
return start, j, true
}
}
return "", false // unterminated block
return 0, 0, false // unterminated
}
// stripFindingsBlock removes every ```gadfly-findings fenced block from out so the
// machine-readable JSON never shows in the rendered comment. Other content is
// preserved verbatim.
func stripFindingsBlock(out string) string {
lines := strings.Split(out, "\n")
var kept []string
skipping := false
for _, ln := range lines {
t := strings.TrimSpace(ln)
switch {
case skipping:
if isFenceClose(ln) {
skipping = false
}
case strings.HasPrefix(t, "```") && strings.Contains(t, findingsFence):
skipping = true
default:
kept = append(kept, ln)
}
}
return strings.TrimRight(strings.Join(kept, "\n"), "\n")
}
// isFenceClose reports whether a line is a bare code-fence close (``` plus only
// whitespace), i.e. not an opening fence with an info string.
func isFenceClose(line string) bool {
// fenceInfo returns the info-string (text after the backticks) of a code-fence
// line and whether the line opens/closes a fence at all. A bare ``` yields ("",
// true); ```gadfly-findings yields ("gadfly-findings", true).
func fenceInfo(line string) (string, bool) {
t := strings.TrimSpace(line)
return strings.HasPrefix(t, "```") && strings.TrimLeft(t, "`") == ""
if !strings.HasPrefix(t, "```") {
return "", false
}
return strings.TrimSpace(strings.TrimLeft(t, "`")), true
}
// isFindingsOpen reports whether line opens a ```gadfly-findings block, matching
// the info-string EXACTLY (not as a substring) so a fence like ```not-findings
// can't masquerade as ours.
func isFindingsOpen(line string) bool {
info, ok := fenceInfo(line)
return ok && info == findingsFence
}
// isFenceClose reports whether line is a bare closing fence (``` with no info).
func isFenceClose(line string) bool {
info, ok := fenceInfo(line)
return ok && info == ""
}
// proseParagraphs maps "file:line" -> the prose paragraph that first references
@@ -149,7 +191,7 @@ func proseParagraphs(out string) map[string]string {
m := map[string]string{}
for _, loc := range pathLineRe.FindAllStringSubmatchIndex(prose, -1) {
key := prose[loc[2]:loc[3]] + ":" + prose[loc[4]:loc[5]]
if _, seen := m[key]; seen {
if _, dup := m[key]; dup {
continue
}
li := strings.Count(prose[:loc[0]], "\n")
@@ -162,18 +204,33 @@ func proseParagraphs(out string) map[string]string {
// (critical/high/medium/small/trivial), accepting common synonyms. An
// unrecognized value is returned lowercased so the store still sees the raw word.
func normalizeSeverity(s string) string {
switch strings.ToLower(strings.TrimSpace(s)) {
switch t := strings.ToLower(strings.TrimSpace(s)); t {
case "critical", "crit", "blocker", "blocking":
return "critical"
case "high", "major":
case "high", "major", "severe":
return "high"
case "medium", "moderate", "minor":
case "medium", "moderate":
return "medium"
case "small", "low", "minor-issue":
case "small", "low", "minor":
return "small"
case "trivial", "nit", "nitpick", "info", "informational", "style":
case "trivial", "nit", "nitpick", "info", "informational", "style", "cosmetic":
return "trivial"
default:
return strings.ToLower(strings.TrimSpace(s))
return t
}
}
// normalizeConfidence maps a model's confidence word onto high/medium/low,
// accepting common synonyms; an unrecognized value is returned lowercased.
func normalizeConfidence(s string) string {
switch t := strings.ToLower(strings.TrimSpace(s)); t {
case "high", "certain", "confirmed", "verified":
return "high"
case "medium", "med", "moderate":
return "medium"
case "low", "unsure", "tentative", "unverified", "speculative":
return "low"
default:
return t
}
}
+48 -13
View File
@@ -1,6 +1,9 @@
package main
import "testing"
import (
"strings"
"testing"
)
const sampleReview = "**Blocking issues found**\n\n" +
"- **Unauthenticated endpoint** — `model.go:184` leaks PR content to a third party.\n" +
@@ -67,15 +70,24 @@ func TestExtractStructuredFindingsStringLine(t *testing.T) {
func TestStripFindingsBlock(t *testing.T) {
stripped := stripFindingsBlock(sampleReview)
if got := stripped; containsFence(got) {
t.Errorf("block not stripped: %q", got)
if strings.Contains(stripped, findingsFence) {
t.Errorf("block not stripped: %q", stripped)
}
// The prose findings must survive.
if !contains(stripped, "Unauthenticated endpoint") || !contains(stripped, "util.go:12") {
if !strings.Contains(stripped, "Unauthenticated endpoint") || !strings.Contains(stripped, "util.go:12") {
t.Errorf("prose lost during strip: %q", stripped)
}
}
func TestStripFindingsBlockUnterminated(t *testing.T) {
// A truncated, unterminated block must NOT swallow the prose before it.
out := "Minor issues\n\n- real finding at `x.go:1`\n\n```gadfly-findings\n[{\"file\":\"x.go\""
got := stripFindingsBlock(out)
if !strings.Contains(got, "real finding at `x.go:1`") {
t.Errorf("unterminated block swallowed the prose: %q", got)
}
}
func TestStripFindingsBlockNoBlock(t *testing.T) {
in := "Minor issues\n\n- finding at `x.go:9`"
if out := stripFindingsBlock(in); out != in {
@@ -87,8 +99,8 @@ func TestNormalizeSeverity(t *testing.T) {
cases := map[string]string{
"Critical": "critical", "blocker": "critical",
"major": "high", "HIGH": "high",
"moderate": "medium", "minor": "medium",
"low": "small",
"moderate": "medium",
"minor": "small", "low": "small", // "minor" and "low" both map to small (consistently)
"nit": "trivial", "Style": "trivial",
"weird": "weird", // unknown passes through, lowercased
}
@@ -99,13 +111,36 @@ func TestNormalizeSeverity(t *testing.T) {
}
}
func containsFence(s string) bool { return contains(s, findingsFence) }
func contains(s, sub string) bool {
for i := 0; i+len(sub) <= len(s); i++ {
if s[i:i+len(sub)] == sub {
return true
func TestNormalizeConfidence(t *testing.T) {
cases := map[string]string{
"High": "high", "confirmed": "high",
"MEDIUM": "medium", "moderate": "medium",
"low": "low", "unverified": "low",
"hunch": "hunch", // unknown passes through, lowercased
}
for in, want := range cases {
if got := normalizeConfidence(in); got != want {
t.Errorf("normalizeConfidence(%q) = %q, want %q", in, got, want)
}
}
return false
}
func TestExtractStructuredFindingsFallbackOnEmpty(t *testing.T) {
// A non-clean lens that emitted an empty [] but listed prose findings must
// fall through to the heuristic scrape, not silently drop everything.
out := "Minor issues\n\n- bug at `pkg/a.go:7`\n\n```gadfly-findings\n[]\n```\n"
r := specialistResult{spec: Specialist{Name: "correctness"}, out: out, verdict: verdictMinor}
fs := extractStructuredFindingsOrScrape(r)
if len(fs) == 0 {
t.Fatal("empty [] must fall back to the heuristic scrape, got no findings")
}
if fs[0].file != "pkg/a.go" || fs[0].line != 7 {
t.Errorf("heuristic fallback wrong: %+v", fs[0])
}
}
func TestVerdictSeverity(t *testing.T) {
if verdictBlocking.severity() != "high" || verdictMinor.severity() != "small" || verdictUnknown.severity() != "trivial" {
t.Error("verdict.severity mapping changed unexpectedly")
}
}