feat: structured findings contract (machine-readable gadfly-findings block) (#16)
Build & push image / build-and-push (push) Successful in 5s

Co-authored-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
Co-committed-by: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
This commit was merged in pull request #16.
This commit is contained in:
2026-06-28 22:23:02 +00:00
committed by steve
parent f49699fc12
commit 53971603d3
7 changed files with 462 additions and 15 deletions
+19 -1
View File
@@ -28,6 +28,22 @@ func (v verdict) label() string {
} }
} }
// severity maps a verdict to a canonical severity word (the same vocabulary as
// the structured findings: critical/high/medium/small/trivial). Used as the
// raw_severity for heuristic-scraped findings, which carry no per-finding
// severity of their own — so the telemetry store sees one consistent vocabulary
// instead of a mix of canonical words and full verdict phrases.
func (v verdict) severity() string {
switch v {
case verdictBlocking:
return "high"
case verdictMinor:
return "small"
default:
return "trivial"
}
}
// parseVerdict extracts a specialist's self-reported verdict from its output. // parseVerdict extracts a specialist's self-reported verdict from its output.
// The base prompt tells each lens to lead with one of the three phrases. // The base prompt tells each lens to lead with one of the three phrases.
func parseVerdict(out string) verdict { func parseVerdict(out string) verdict {
@@ -100,7 +116,9 @@ func renderConsolidated(results []specialistResult) string {
headline, len(results), strings.Join(specialistNames(results), ", ")) headline, len(results), strings.Join(specialistNames(results), ", "))
for _, r := range results { for _, r := range results {
body := strings.TrimSpace(r.out) // Strip the machine-readable ```gadfly-findings block — it's for tooling
// (telemetry/consolidation), not for human readers of the comment.
body := strings.TrimSpace(stripFindingsBlock(r.out))
if body == "" { if body == "" {
body = "_(no output)_" body = "_(no output)_"
} }
+32 -11
View File
@@ -10,11 +10,14 @@ package main
// the review markdown consumed by run.sh) and never changes the exit code. // the review markdown consumed by run.sh) and never changes the exit code.
// - It depends only on the Go stdlib (net/http). // - It depends only on the Go stdlib (net/http).
// //
// Findings are extracted heuristically from each lens's markdown: a "path:line" // Findings come PRIMARILY from each lens's machine-readable ```gadfly-findings
// reference (e.g. run/executor.go:166) anchors a finding, whose title is the // block (findings.go: exact file/line + per-finding severity/confidence). When a
// nearest preceding markdown heading / numbered item / bold lead-in (else the // model emits no parseable block, emit falls back to a heuristic prose scrape: a
// first sentence of the finding's paragraph). This is best-effort signal for the // "path:line" reference (e.g. run/executor.go:166) anchors a finding, whose title
// store to aggregate — not a structured contract the reviewer guarantees. // is the nearest preceding markdown heading / numbered item / bold lead-in (else
// the first sentence of the finding's paragraph). The scrape is best-effort
// signal for the store to aggregate, not a structured contract the reviewer
// guarantees.
import ( import (
"bytes" "bytes"
@@ -50,11 +53,16 @@ var pathLineRe = regexp.MustCompile(`([A-Za-z0-9_./-]+\.[A-Za-z0-9]+):(\d+)`)
var numberedRe = regexp.MustCompile(`^\d+[.)]\s+(.+)$`) var numberedRe = regexp.MustCompile(`^\d+[.)]\s+(.+)$`)
// finding is one extracted issue: where it points and a derived human title. // finding is one extracted issue: where it points and a derived human title.
// severity/confidence are populated from the structured ```gadfly-findings block
// (extractStructuredFindings); they are empty for findings recovered by the
// heuristic prose scrape (parseFindings), which has no per-finding signal.
type finding struct { type finding struct {
file string file string
line int line int
title string title string
detail string detail string
severity string // per-finding: critical/high/medium/small/trivial ("" if heuristic)
confidence string // per-finding: high/medium/low ("" if heuristic)
} }
// runPayload is the POST /runs body. Field names match the gadfly-reports API EXACTLY. // runPayload is the POST /runs body. Field names match the gadfly-reports API EXACTLY.
@@ -86,6 +94,7 @@ type reportPayload struct {
Provider string `json:"provider"` Provider string `json:"provider"`
RunID string `json:"run_id"` RunID string `json:"run_id"`
RawSeverity string `json:"raw_severity"` RawSeverity string `json:"raw_severity"`
Confidence string `json:"confidence"` // per-finding high/medium/low ("" if heuristic)
Detail string `json:"detail"` Detail string `json:"detail"`
} }
@@ -129,8 +138,19 @@ func emit(results []specialistResult, elapsed time.Duration) {
if r.verdict == verdictClean { if r.verdict == verdictClean {
continue continue
} }
sev := r.verdict.label() // Prefer the model's machine-readable ```gadfly-findings block (exact
for _, f := range parseFindings(r.spec, r.out) { // location + per-finding severity). Fall back to the prose scrape when it
// yields NOTHING — no block, an unterminated/malformed one, or a block that
// parsed to zero usable findings on a non-clean lens (e.g. an empty []
// alongside prose findings). Falling back on empty, not just on absent,
// keeps the safety net from being defeated by a contradictory empty block.
fs := extractStructuredFindingsOrScrape(r)
lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity)
for _, f := range fs {
sev := f.severity
if sev == "" {
sev = lensSev
}
reports = append(reports, reportPayload{ reports = append(reports, reportPayload{
Repo: repo, Repo: repo,
PR: pr, PR: pr,
@@ -142,6 +162,7 @@ func emit(results []specialistResult, elapsed time.Duration) {
Provider: provider, Provider: provider,
RunID: runID, RunID: runID,
RawSeverity: sev, RawSeverity: sev,
Confidence: f.confidence,
Detail: f.detail, Detail: f.detail,
}) })
} }
+6 -3
View File
@@ -157,7 +157,7 @@ func TestEmit_PostsRunsAndReports(t *testing.T) {
if len(reportBody) != 2 { if len(reportBody) != 2 {
t.Fatalf("/reports array length = %d, want 2", len(reportBody)) t.Fatalf("/reports array length = %d, want 2", len(reportBody))
} }
for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "detail"} { for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "confidence", "detail"} {
if _, ok := reportBody[0][k]; !ok { if _, ok := reportBody[0][k]; !ok {
t.Errorf("/reports[0] missing field %q (got keys %v)", k, keysOf(reportBody[0])) t.Errorf("/reports[0] missing field %q (got keys %v)", k, keysOf(reportBody[0]))
} }
@@ -171,8 +171,11 @@ func TestEmit_PostsRunsAndReports(t *testing.T) {
if reportBody[0]["line"] != float64(166) { if reportBody[0]["line"] != float64(166) {
t.Errorf("reports[0].line = %v, want 166", reportBody[0]["line"]) t.Errorf("reports[0].line = %v, want 166", reportBody[0]["line"])
} }
if reportBody[0]["raw_severity"] != "Blocking issues found" { // No structured block in sampleLensMarkdown, so this is a heuristic-scraped
t.Errorf("reports[0].raw_severity = %v, want 'Blocking issues found'", reportBody[0]["raw_severity"]) // finding: raw_severity is the canonical word derived from the lens verdict
// (Blocking -> "high"), not the full verdict phrase.
if reportBody[0]["raw_severity"] != "high" {
t.Errorf("reports[0].raw_severity = %v, want 'high'", reportBody[0]["raw_severity"])
} }
if reportBody[0]["run_id"] != "owner/repo#7:ollama-cloud/qwen3" { if reportBody[0]["run_id"] != "owner/repo#7:ollama-cloud/qwen3" {
t.Errorf("reports[0].run_id = %v, want owner/repo#7:ollama-cloud/qwen3", reportBody[0]["run_id"]) t.Errorf("reports[0].run_id = %v, want owner/repo#7:ollama-cloud/qwen3", reportBody[0]["run_id"])
+236
View File
@@ -0,0 +1,236 @@
package main
// Structured findings: the machine-readable contract between a lens's output and
// the telemetry/consolidation pipeline. Each lens is asked (see
// scripts/system-prompt.txt) to append a fenced ```gadfly-findings code block
// holding a JSON array of its findings. Parsing that exact block is far more
// reliable than scraping prose with a path:line regex (emit.go's heuristic),
// and it carries PER-FINDING severity + confidence the prose verdict can't.
//
// Everything here degrades gracefully: a missing, unterminated, or malformed
// block makes extractStructuredFindings return ok=false (and yield no findings),
// so the caller falls back to the heuristic scrape — a weak model that ignores
// the contract still contributes findings, exactly as before.
import (
"encoding/json"
"strconv"
"strings"
)
// structuredFinding mirrors one element of the ```gadfly-findings JSON array.
// Line is json.Number so we tolerate both 123 and "123" from less-precise models.
type structuredFinding struct {
File string `json:"file"`
Line json.Number `json:"line"`
Severity string `json:"severity"`
Confidence string `json:"confidence"`
Title string `json:"title"`
Detail string `json:"detail"` // optional; the prose paragraph is used when absent
}
// findingsFence is the info-string that tags the machine-readable block.
const findingsFence = "gadfly-findings"
// extractStructuredFindings parses the ```gadfly-findings JSON block out of a
// lens's markdown. It returns the findings and ok=true when a TERMINATED block is
// present AND parses as a JSON array (an empty array is valid "nothing found" —
// ok is still true). A missing, unterminated, or unparseable block returns
// ok=false so the caller falls back to the heuristic scrape.
//
// Findings are deduped by file:line (keeping the first, matching parseFindings),
// findings with no usable file are dropped, and each title/detail is backfilled
// from the prose when the JSON omits it (best of both: exact location + the human
// context the model already wrote).
func extractStructuredFindings(out string) ([]finding, bool) {
lines := strings.Split(out, "\n")
start, end, ok := findingsSpan(lines)
if !ok {
return nil, false
}
var raw []structuredFinding
if err := json.Unmarshal([]byte(strings.Join(lines[start+1:end], "\n")), &raw); err != nil {
return nil, false
}
findings := make([]finding, 0, len(raw))
seen := map[string]bool{}
var prose map[string]string // built lazily — only if a finding lacks its own detail
for _, sf := range raw {
file := strings.TrimSpace(sf.File)
if file == "" {
continue
}
ln := 0
if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil && n > 0 {
ln = n
}
key := file + ":" + strconv.Itoa(ln)
if ln > 0 { // only dedupe concrete locations; unknown-line findings are kept
if seen[key] {
continue
}
seen[key] = true
}
detail := strings.TrimSpace(sf.Detail)
if detail == "" && ln > 0 {
if prose == nil {
prose = proseParagraphs(out)
}
detail = prose[key]
}
title := strings.TrimSpace(sf.Title)
if title == "" { // never empty: fall back to the prose detail, then the location
if detail != "" {
title = truncate(detail, 120)
} else if ln > 0 {
title = key
} else {
title = file
}
}
findings = append(findings, finding{
file: file,
line: ln,
title: title,
detail: truncate(detail, 500),
severity: normalizeSeverity(sf.Severity),
confidence: normalizeConfidence(sf.Confidence),
})
if len(findings) >= maxFindingsPerLens {
break
}
}
return findings, true
}
// extractStructuredFindingsOrScrape returns a lens's findings, preferring the
// structured ```gadfly-findings block and falling back to the heuristic prose
// scrape when the block is absent, unterminated/malformed, OR parsed to zero
// usable findings (e.g. an empty [] emitted alongside real prose findings).
// Factored out of emit() so the fallback rule is unit-testable.
func extractStructuredFindingsOrScrape(r specialistResult) []finding {
if fs, _ := extractStructuredFindings(r.out); len(fs) > 0 {
return fs
}
return parseFindings(r.spec, r.out)
}
// stripFindingsBlock removes every TERMINATED ```gadfly-findings block from out so
// the machine-readable JSON never shows in the rendered comment. An UNTERMINATED
// fence is left in place — treating it as a block would swallow the rest of the
// comment (e.g. when a model's output was truncated mid-block). Trailing
// whitespace is trimmed.
func stripFindingsBlock(out string) string {
lines := strings.Split(out, "\n")
for {
start, end, ok := findingsSpan(lines)
if !ok {
break
}
lines = append(lines[:start:start], lines[end+1:]...)
}
return strings.TrimRight(strings.Join(lines, "\n"), "\n")
}
// findingsSpan returns the [start,end] inclusive line indices of the first
// TERMINATED ```gadfly-findings block in lines (start = the opening fence, end =
// the closing fence), or ok=false when there is none or it is unterminated.
// extract and strip share it so they always agree on what is (and isn't) a block.
func findingsSpan(lines []string) (start, end int, ok bool) {
start = -1
for i, ln := range lines {
if isFindingsOpen(ln) {
start = i
break
}
}
if start < 0 {
return 0, 0, false
}
for j := start + 1; j < len(lines); j++ {
if isFenceClose(lines[j]) {
return start, j, true
}
}
return 0, 0, false // unterminated
}
// fenceInfo returns the info-string (text after the backticks) of a code-fence
// line and whether the line opens/closes a fence at all. A bare ``` yields ("",
// true); ```gadfly-findings yields ("gadfly-findings", true).
func fenceInfo(line string) (string, bool) {
t := strings.TrimSpace(line)
if !strings.HasPrefix(t, "```") {
return "", false
}
return strings.TrimSpace(strings.TrimLeft(t, "`")), true
}
// isFindingsOpen reports whether line opens a ```gadfly-findings block, matching
// the info-string EXACTLY (not as a substring) so a fence like ```not-findings
// can't masquerade as ours.
func isFindingsOpen(line string) bool {
info, ok := fenceInfo(line)
return ok && info == findingsFence
}
// isFenceClose reports whether line is a bare closing fence (``` with no info).
func isFenceClose(line string) bool {
info, ok := fenceInfo(line)
return ok && info == ""
}
// proseParagraphs maps "file:line" -> the prose paragraph that first references
// it, so a structured finding without its own detail can borrow the human
// context the model already wrote. Built from the markdown OUTSIDE the findings
// block (the block is JSON, not prose).
func proseParagraphs(out string) map[string]string {
prose := stripFindingsBlock(out)
lines := strings.Split(prose, "\n")
m := map[string]string{}
for _, loc := range pathLineRe.FindAllStringSubmatchIndex(prose, -1) {
key := prose[loc[2]:loc[3]] + ":" + prose[loc[4]:loc[5]]
if _, dup := m[key]; dup {
continue
}
li := strings.Count(prose[:loc[0]], "\n")
m[key] = paragraphAt(lines, li)
}
return m
}
// normalizeSeverity maps a model's severity word onto the canonical set
// (critical/high/medium/small/trivial), accepting common synonyms. An
// unrecognized value is returned lowercased so the store still sees the raw word.
func normalizeSeverity(s string) string {
switch t := strings.ToLower(strings.TrimSpace(s)); t {
case "critical", "crit", "blocker", "blocking":
return "critical"
case "high", "major", "severe":
return "high"
case "medium", "moderate":
return "medium"
case "small", "low", "minor":
return "small"
case "trivial", "nit", "nitpick", "info", "informational", "style", "cosmetic":
return "trivial"
default:
return t
}
}
// normalizeConfidence maps a model's confidence word onto high/medium/low,
// accepting common synonyms; an unrecognized value is returned lowercased.
func normalizeConfidence(s string) string {
switch t := strings.ToLower(strings.TrimSpace(s)); t {
case "high", "certain", "confirmed", "verified":
return "high"
case "medium", "med", "moderate":
return "medium"
case "low", "unsure", "tentative", "unverified", "speculative":
return "low"
default:
return t
}
}
+146
View File
@@ -0,0 +1,146 @@
package main
import (
"strings"
"testing"
)
const sampleReview = "**Blocking issues found**\n\n" +
"- **Unauthenticated endpoint** — `model.go:184` leaks PR content to a third party.\n" +
"- A nit about naming in `util.go:12`.\n\n" +
"```gadfly-findings\n" +
"[\n" +
" {\"file\": \"model.go\", \"line\": 184, \"severity\": \"high\", \"confidence\": \"high\", \"title\": \"Unauthenticated endpoint\"},\n" +
" {\"file\": \"util.go\", \"line\": 12, \"severity\": \"nit\", \"confidence\": \"medium\", \"title\": \"naming\"}\n" +
"]\n" +
"```\n"
func TestExtractStructuredFindings(t *testing.T) {
fs, ok := extractStructuredFindings(sampleReview)
if !ok {
t.Fatal("expected ok=true for a well-formed block")
}
if len(fs) != 2 {
t.Fatalf("want 2 findings, got %d", len(fs))
}
if fs[0].file != "model.go" || fs[0].line != 184 || fs[0].severity != "high" || fs[0].confidence != "high" {
t.Errorf("finding[0] mismatch: %+v", fs[0])
}
// "nit" must normalize to the canonical "trivial".
if fs[1].severity != "trivial" {
t.Errorf("want severity normalized to trivial, got %q", fs[1].severity)
}
// Detail is borrowed from the prose paragraph referencing the same file:line.
if fs[0].detail == "" {
t.Error("expected detail borrowed from prose, got empty")
}
}
func TestExtractStructuredFindingsEmptyArray(t *testing.T) {
out := "No material issues found\n\n```gadfly-findings\n[]\n```\n"
fs, ok := extractStructuredFindings(out)
if !ok {
t.Fatal("an empty array is a valid block; want ok=true")
}
if len(fs) != 0 {
t.Fatalf("want 0 findings, got %d", len(fs))
}
}
func TestExtractStructuredFindingsFallback(t *testing.T) {
// No block at all -> ok=false so the caller uses the heuristic scrape.
if _, ok := extractStructuredFindings("Minor issues\n\n- something at `x.go:1`\n"); ok {
t.Error("want ok=false when there is no block")
}
// Malformed JSON -> ok=false (graceful fallback).
bad := "Minor issues\n\n```gadfly-findings\n{not json}\n```\n"
if _, ok := extractStructuredFindings(bad); ok {
t.Error("want ok=false for malformed JSON")
}
}
func TestExtractStructuredFindingsStringLine(t *testing.T) {
// Tolerate a quoted line number from a less-precise model.
out := "Minor issues\n\n```gadfly-findings\n[{\"file\":\"a.go\",\"line\":\"42\",\"severity\":\"medium\",\"title\":\"x\"}]\n```\n"
fs, ok := extractStructuredFindings(out)
if !ok || len(fs) != 1 || fs[0].line != 42 {
t.Fatalf("want one finding at line 42, got ok=%v %+v", ok, fs)
}
}
func TestStripFindingsBlock(t *testing.T) {
stripped := stripFindingsBlock(sampleReview)
if strings.Contains(stripped, findingsFence) {
t.Errorf("block not stripped: %q", stripped)
}
// The prose findings must survive.
if !strings.Contains(stripped, "Unauthenticated endpoint") || !strings.Contains(stripped, "util.go:12") {
t.Errorf("prose lost during strip: %q", stripped)
}
}
func TestStripFindingsBlockUnterminated(t *testing.T) {
// A truncated, unterminated block must NOT swallow the prose before it.
out := "Minor issues\n\n- real finding at `x.go:1`\n\n```gadfly-findings\n[{\"file\":\"x.go\""
got := stripFindingsBlock(out)
if !strings.Contains(got, "real finding at `x.go:1`") {
t.Errorf("unterminated block swallowed the prose: %q", got)
}
}
func TestStripFindingsBlockNoBlock(t *testing.T) {
in := "Minor issues\n\n- finding at `x.go:9`"
if out := stripFindingsBlock(in); out != in {
t.Errorf("strip changed block-free text: %q != %q", out, in)
}
}
func TestNormalizeSeverity(t *testing.T) {
cases := map[string]string{
"Critical": "critical", "blocker": "critical",
"major": "high", "HIGH": "high",
"moderate": "medium",
"minor": "small", "low": "small", // "minor" and "low" both map to small (consistently)
"nit": "trivial", "Style": "trivial",
"weird": "weird", // unknown passes through, lowercased
}
for in, want := range cases {
if got := normalizeSeverity(in); got != want {
t.Errorf("normalizeSeverity(%q) = %q, want %q", in, got, want)
}
}
}
func TestNormalizeConfidence(t *testing.T) {
cases := map[string]string{
"High": "high", "confirmed": "high",
"MEDIUM": "medium", "moderate": "medium",
"low": "low", "unverified": "low",
"hunch": "hunch", // unknown passes through, lowercased
}
for in, want := range cases {
if got := normalizeConfidence(in); got != want {
t.Errorf("normalizeConfidence(%q) = %q, want %q", in, got, want)
}
}
}
func TestExtractStructuredFindingsFallbackOnEmpty(t *testing.T) {
// A non-clean lens that emitted an empty [] but listed prose findings must
// fall through to the heuristic scrape, not silently drop everything.
out := "Minor issues\n\n- bug at `pkg/a.go:7`\n\n```gadfly-findings\n[]\n```\n"
r := specialistResult{spec: Specialist{Name: "correctness"}, out: out, verdict: verdictMinor}
fs := extractStructuredFindingsOrScrape(r)
if len(fs) == 0 {
t.Fatal("empty [] must fall back to the heuristic scrape, got no findings")
}
if fs[0].file != "pkg/a.go" || fs[0].line != 7 {
t.Errorf("heuristic fallback wrong: %+v", fs[0])
}
}
func TestVerdictSeverity(t *testing.T) {
if verdictBlocking.severity() != "high" || verdictMinor.severity() != "small" || verdictUnknown.severity() != "trivial" {
t.Error("verdict.severity mapping changed unexpectedly")
}
}
+5
View File
@@ -49,6 +49,11 @@ Output rules:
- Do NOT invent new findings; this is a verification gate, not a fresh review. - Do NOT invent new findings; this is a verification gate, not a fresh review.
- Do NOT include meta-commentary about the verification process or which - Do NOT include meta-commentary about the verification process or which
findings you dropped — output only the final, corrected review markdown. findings you dropped — output only the final, corrected review markdown.
- The draft ends with a fenced ` + "`gadfly-findings`" + ` JSON block. Regenerate it
so it lists ONLY the findings that SURVIVED your verification, in the same schema
({"file","line","severity","confidence","title"}; severity one of
critical/high/medium/small/trivial, confidence one of high/medium/low). If every
finding was dropped, emit an empty array ` + "`[]`" + `. Keep the block last.
- When done investigating, STOP calling tools and reply with the review.` - When done investigating, STOP calling tools and reply with the review.`
// recheckEnabled reports whether the verification pass should run. On unless // recheckEnabled reports whether the verification pass should run. On unless
+18
View File
@@ -43,3 +43,21 @@ Output rules:
- Only report issues you are reasonably confident are real after checking. If the diff - Only report issues you are reasonably confident are real after checking. If the diff
is clean, say so plainly rather than inventing nits. is clean, say so plainly rather than inventing nits.
- When you are done investigating, STOP calling tools and reply with the final review. - When you are done investigating, STOP calling tools and reply with the final review.
Machine-readable findings — AFTER the prose review, append ONE fenced code block,
tagged `gadfly-findings`, holding a JSON array of the SAME findings you described above
(this block is consumed by tooling and hidden from the rendered comment):
```gadfly-findings
[
{"file": "path/to/file.go", "line": 123, "severity": "high", "confidence": "high", "title": "one-line summary of the issue"}
]
```
- One object per real finding, in the same order as your prose. `file`/`line` must be a
concrete location you verified (the line the issue is at). `severity` is one of
`critical`, `high`, `medium`, `small`, `trivial`. `confidence` is your post-verification
confidence the issue is real: one of `high`, `medium`, `low`.
- Include ONLY genuine problems — never verification notes ("confirmed X is safe at f:line"),
and never an "Outside my lens:" aside. If your lens is clean, emit an empty array `[]`.
- This block is in ADDITION to the prose; do not drop the human-readable findings.