feat: structured findings contract (machine-readable gadfly-findings block) #16

Merged
steve merged 2 commits from feat/structured-findings into main 2026-06-28 22:23:02 +00:00
7 changed files with 462 additions and 15 deletions
+19 -1
View File
@@ -28,6 +28,22 @@ func (v verdict) label() string {
}
}
// severity maps a verdict to a canonical severity word (the same vocabulary as
// the structured findings: critical/high/medium/small/trivial). Used as the
// raw_severity for heuristic-scraped findings, which carry no per-finding
// severity of their own — so the telemetry store sees one consistent vocabulary
// instead of a mix of canonical words and full verdict phrases.
func (v verdict) severity() string {
switch v {
case verdictBlocking:
return "high"
case verdictMinor:
return "small"
default:
return "trivial"
}
}
// parseVerdict extracts a specialist's self-reported verdict from its output.
// The base prompt tells each lens to lead with one of the three phrases.
func parseVerdict(out string) verdict {
@@ -100,7 +116,9 @@ func renderConsolidated(results []specialistResult) string {
headline, len(results), strings.Join(specialistNames(results), ", "))
for _, r := range results {
body := strings.TrimSpace(r.out)
// Strip the machine-readable ```gadfly-findings block — it's for tooling
// (telemetry/consolidation), not for human readers of the comment.
body := strings.TrimSpace(stripFindingsBlock(r.out))
if body == "" {
body = "_(no output)_"
}
+28 -7
View File
@@ -10,11 +10,14 @@ package main
// the review markdown consumed by run.sh) and never changes the exit code.
// - It depends only on the Go stdlib (net/http).
//
// Findings are extracted heuristically from each lens's markdown: a "path:line"
// reference (e.g. run/executor.go:166) anchors a finding, whose title is the
// nearest preceding markdown heading / numbered item / bold lead-in (else the
// first sentence of the finding's paragraph). This is best-effort signal for the
// store to aggregate — not a structured contract the reviewer guarantees.
// Findings come PRIMARILY from each lens's machine-readable ```gadfly-findings
// block (findings.go: exact file/line + per-finding severity/confidence). When a
// model emits no parseable block, emit falls back to a heuristic prose scrape: a
// "path:line" reference (e.g. run/executor.go:166) anchors a finding, whose title
// is the nearest preceding markdown heading / numbered item / bold lead-in (else
// the first sentence of the finding's paragraph). The scrape is best-effort
// signal for the store to aggregate, not a structured contract the reviewer
// guarantees.
import (
"bytes"
@@ -50,11 +53,16 @@ var pathLineRe = regexp.MustCompile(`([A-Za-z0-9_./-]+\.[A-Za-z0-9]+):(\d+)`)
var numberedRe = regexp.MustCompile(`^\d+[.)]\s+(.+)$`)
// finding is one extracted issue: where it points and a derived human title.
// severity/confidence are populated from the structured ```gadfly-findings block
// (extractStructuredFindings); they are empty for findings recovered by the
// heuristic prose scrape (parseFindings), which has no per-finding signal.
type finding struct {
file string
line int
title string
detail string
severity string // per-finding: critical/high/medium/small/trivial ("" if heuristic)
confidence string // per-finding: high/medium/low ("" if heuristic)
}
// runPayload is the POST /runs body. Field names match the gadfly-reports API EXACTLY.
@@ -86,6 +94,7 @@ type reportPayload struct {
Provider string `json:"provider"`
RunID string `json:"run_id"`
RawSeverity string `json:"raw_severity"`
Confidence string `json:"confidence"` // per-finding high/medium/low ("" if heuristic)
Detail string `json:"detail"`
}
@@ -129,8 +138,19 @@ func emit(results []specialistResult, elapsed time.Duration) {
if r.verdict == verdictClean {
continue
}
sev := r.verdict.label()
for _, f := range parseFindings(r.spec, r.out) {
// Prefer the model's machine-readable ```gadfly-findings block (exact
// location + per-finding severity). Fall back to the prose scrape when it
// yields NOTHING — no block, an unterminated/malformed one, or a block that
// parsed to zero usable findings on a non-clean lens (e.g. an empty []
// alongside prose findings). Falling back on empty, not just on absent,
// keeps the safety net from being defeated by a contradictory empty block.
fs := extractStructuredFindingsOrScrape(r)
lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity)
for _, f := range fs {
sev := f.severity
if sev == "" {
sev = lensSev
}
reports = append(reports, reportPayload{
Repo: repo,
PR: pr,
@@ -142,6 +162,7 @@ func emit(results []specialistResult, elapsed time.Duration) {
Provider: provider,
RunID: runID,
RawSeverity: sev,
Confidence: f.confidence,
Detail: f.detail,
})
}
+6 -3
View File
@@ -157,7 +157,7 @@ func TestEmit_PostsRunsAndReports(t *testing.T) {
if len(reportBody) != 2 {
t.Fatalf("/reports array length = %d, want 2", len(reportBody))
}
for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "detail"} {
for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "confidence", "detail"} {
if _, ok := reportBody[0][k]; !ok {
t.Errorf("/reports[0] missing field %q (got keys %v)", k, keysOf(reportBody[0]))
}
@@ -171,8 +171,11 @@ func TestEmit_PostsRunsAndReports(t *testing.T) {
if reportBody[0]["line"] != float64(166) {
t.Errorf("reports[0].line = %v, want 166", reportBody[0]["line"])
}
if reportBody[0]["raw_severity"] != "Blocking issues found" {
t.Errorf("reports[0].raw_severity = %v, want 'Blocking issues found'", reportBody[0]["raw_severity"])
// No structured block in sampleLensMarkdown, so this is a heuristic-scraped
// finding: raw_severity is the canonical word derived from the lens verdict
// (Blocking -> "high"), not the full verdict phrase.
if reportBody[0]["raw_severity"] != "high" {
t.Errorf("reports[0].raw_severity = %v, want 'high'", reportBody[0]["raw_severity"])
}
if reportBody[0]["run_id"] != "owner/repo#7:ollama-cloud/qwen3" {
t.Errorf("reports[0].run_id = %v, want owner/repo#7:ollama-cloud/qwen3", reportBody[0]["run_id"])
+236
View File
@@ -0,0 +1,236 @@
package main
// Structured findings: the machine-readable contract between a lens's output and
// the telemetry/consolidation pipeline. Each lens is asked (see
// scripts/system-prompt.txt) to append a fenced ```gadfly-findings code block
// holding a JSON array of its findings. Parsing that exact block is far more
// reliable than scraping prose with a path:line regex (emit.go's heuristic),
// and it carries PER-FINDING severity + confidence the prose verdict can't.
//
// Everything here degrades gracefully: a missing, unterminated, or malformed
// block makes extractStructuredFindings return ok=false (and yield no findings),
// so the caller falls back to the heuristic scrape — a weak model that ignores
// the contract still contributes findings, exactly as before.
import (
"encoding/json"
"strconv"
"strings"
)
// structuredFinding mirrors one element of the ```gadfly-findings JSON array.
// Line is json.Number so we tolerate both 123 and "123" from less-precise models.
type structuredFinding struct {
File string `json:"file"`
Line json.Number `json:"line"`
Severity string `json:"severity"`
Confidence string `json:"confidence"`
Title string `json:"title"`
Detail string `json:"detail"` // optional; the prose paragraph is used when absent
}
// findingsFence is the info-string that tags the machine-readable block.
const findingsFence = "gadfly-findings"
// extractStructuredFindings parses the ```gadfly-findings JSON block out of a
// lens's markdown. It returns the findings and ok=true when a TERMINATED block is
// present AND parses as a JSON array (an empty array is valid "nothing found" —
// ok is still true). A missing, unterminated, or unparseable block returns
// ok=false so the caller falls back to the heuristic scrape.
//
// Findings are deduped by file:line (keeping the first, matching parseFindings),
// findings with no usable file are dropped, and each title/detail is backfilled
// from the prose when the JSON omits it (best of both: exact location + the human
// context the model already wrote).
func extractStructuredFindings(out string) ([]finding, bool) {
lines := strings.Split(out, "\n")
start, end, ok := findingsSpan(lines)
if !ok {
return nil, false
}
var raw []structuredFinding
if err := json.Unmarshal([]byte(strings.Join(lines[start+1:end], "\n")), &raw); err != nil {
return nil, false
}
findings := make([]finding, 0, len(raw))
seen := map[string]bool{}
var prose map[string]string // built lazily — only if a finding lacks its own detail
for _, sf := range raw {
file := strings.TrimSpace(sf.File)
if file == "" {
continue
}
ln := 0
if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil && n > 0 {
ln = n
}
key := file + ":" + strconv.Itoa(ln)
if ln > 0 { // only dedupe concrete locations; unknown-line findings are kept
if seen[key] {
continue
}
seen[key] = true
}
detail := strings.TrimSpace(sf.Detail)
if detail == "" && ln > 0 {
if prose == nil {
prose = proseParagraphs(out)
}
detail = prose[key]
}
title := strings.TrimSpace(sf.Title)
if title == "" { // never empty: fall back to the prose detail, then the location
if detail != "" {
title = truncate(detail, 120)
} else if ln > 0 {
title = key
} else {
title = file
}
}
findings = append(findings, finding{
file: file,
line: ln,
title: title,
detail: truncate(detail, 500),
severity: normalizeSeverity(sf.Severity),
confidence: normalizeConfidence(sf.Confidence),
})
if len(findings) >= maxFindingsPerLens {
break
}
}
return findings, true
}
// extractStructuredFindingsOrScrape returns a lens's findings, preferring the
// structured ```gadfly-findings block and falling back to the heuristic prose
// scrape when the block is absent, unterminated/malformed, OR parsed to zero
// usable findings (e.g. an empty [] emitted alongside real prose findings).
// Factored out of emit() so the fallback rule is unit-testable.
func extractStructuredFindingsOrScrape(r specialistResult) []finding {
if fs, _ := extractStructuredFindings(r.out); len(fs) > 0 {
return fs
}
return parseFindings(r.spec, r.out)
}
// stripFindingsBlock removes every TERMINATED ```gadfly-findings block from out so
// the machine-readable JSON never shows in the rendered comment. An UNTERMINATED
// fence is left in place — treating it as a block would swallow the rest of the
// comment (e.g. when a model's output was truncated mid-block). Trailing
// whitespace is trimmed.
func stripFindingsBlock(out string) string {
lines := strings.Split(out, "\n")
for {
start, end, ok := findingsSpan(lines)
if !ok {
break
}
lines = append(lines[:start:start], lines[end+1:]...)
}
return strings.TrimRight(strings.Join(lines, "\n"), "\n")
}
// findingsSpan returns the [start,end] inclusive line indices of the first
// TERMINATED ```gadfly-findings block in lines (start = the opening fence, end =
// the closing fence), or ok=false when there is none or it is unterminated.
// extract and strip share it so they always agree on what is (and isn't) a block.
func findingsSpan(lines []string) (start, end int, ok bool) {
start = -1
for i, ln := range lines {
if isFindingsOpen(ln) {
start = i
break
}
}
if start < 0 {
return 0, 0, false
}
for j := start + 1; j < len(lines); j++ {
if isFenceClose(lines[j]) {
return start, j, true
}
}
return 0, 0, false // unterminated
}
// fenceInfo returns the info-string (text after the backticks) of a code-fence
// line and whether the line opens/closes a fence at all. A bare ``` yields ("",
// true); ```gadfly-findings yields ("gadfly-findings", true).
func fenceInfo(line string) (string, bool) {
t := strings.TrimSpace(line)
if !strings.HasPrefix(t, "```") {
return "", false
}
return strings.TrimSpace(strings.TrimLeft(t, "`")), true
}
// isFindingsOpen reports whether line opens a ```gadfly-findings block, matching
// the info-string EXACTLY (not as a substring) so a fence like ```not-findings
// can't masquerade as ours.
func isFindingsOpen(line string) bool {
info, ok := fenceInfo(line)
return ok && info == findingsFence
}
// isFenceClose reports whether line is a bare closing fence (``` with no info).
func isFenceClose(line string) bool {
info, ok := fenceInfo(line)
return ok && info == ""
}
// proseParagraphs maps "file:line" -> the prose paragraph that first references
// it, so a structured finding without its own detail can borrow the human
// context the model already wrote. Built from the markdown OUTSIDE the findings
// block (the block is JSON, not prose).
func proseParagraphs(out string) map[string]string {
prose := stripFindingsBlock(out)
lines := strings.Split(prose, "\n")
m := map[string]string{}
for _, loc := range pathLineRe.FindAllStringSubmatchIndex(prose, -1) {
key := prose[loc[2]:loc[3]] + ":" + prose[loc[4]:loc[5]]
if _, dup := m[key]; dup {
continue
}
li := strings.Count(prose[:loc[0]], "\n")
m[key] = paragraphAt(lines, li)
}
return m
}
// normalizeSeverity maps a model's severity word onto the canonical set
// (critical/high/medium/small/trivial), accepting common synonyms. An
// unrecognized value is returned lowercased so the store still sees the raw word.
func normalizeSeverity(s string) string {
switch t := strings.ToLower(strings.TrimSpace(s)); t {
case "critical", "crit", "blocker", "blocking":
return "critical"
case "high", "major", "severe":
return "high"
case "medium", "moderate":
return "medium"
case "small", "low", "minor":
return "small"
case "trivial", "nit", "nitpick", "info", "informational", "style", "cosmetic":
return "trivial"
default:
return t
}
}
// normalizeConfidence maps a model's confidence word onto high/medium/low,
// accepting common synonyms; an unrecognized value is returned lowercased.
func normalizeConfidence(s string) string {
switch t := strings.ToLower(strings.TrimSpace(s)); t {
case "high", "certain", "confirmed", "verified":
return "high"
case "medium", "med", "moderate":
return "medium"
case "low", "unsure", "tentative", "unverified", "speculative":
return "low"
default:
return t
}
}
+146
View File
@@ -0,0 +1,146 @@
package main
import (
"strings"
"testing"
)
const sampleReview = "**Blocking issues found**\n\n" +
"- **Unauthenticated endpoint** — `model.go:184` leaks PR content to a third party.\n" +
"- A nit about naming in `util.go:12`.\n\n" +
"```gadfly-findings\n" +
"[\n" +
" {\"file\": \"model.go\", \"line\": 184, \"severity\": \"high\", \"confidence\": \"high\", \"title\": \"Unauthenticated endpoint\"},\n" +
" {\"file\": \"util.go\", \"line\": 12, \"severity\": \"nit\", \"confidence\": \"medium\", \"title\": \"naming\"}\n" +
"]\n" +
"```\n"
func TestExtractStructuredFindings(t *testing.T) {
fs, ok := extractStructuredFindings(sampleReview)
if !ok {
t.Fatal("expected ok=true for a well-formed block")
}
if len(fs) != 2 {
t.Fatalf("want 2 findings, got %d", len(fs))
}
if fs[0].file != "model.go" || fs[0].line != 184 || fs[0].severity != "high" || fs[0].confidence != "high" {
t.Errorf("finding[0] mismatch: %+v", fs[0])
}
// "nit" must normalize to the canonical "trivial".
if fs[1].severity != "trivial" {
t.Errorf("want severity normalized to trivial, got %q", fs[1].severity)
}
// Detail is borrowed from the prose paragraph referencing the same file:line.
if fs[0].detail == "" {
t.Error("expected detail borrowed from prose, got empty")
}
}
func TestExtractStructuredFindingsEmptyArray(t *testing.T) {
out := "No material issues found\n\n```gadfly-findings\n[]\n```\n"
fs, ok := extractStructuredFindings(out)
if !ok {
t.Fatal("an empty array is a valid block; want ok=true")
}
if len(fs) != 0 {
t.Fatalf("want 0 findings, got %d", len(fs))
}
}
func TestExtractStructuredFindingsFallback(t *testing.T) {
// No block at all -> ok=false so the caller uses the heuristic scrape.
if _, ok := extractStructuredFindings("Minor issues\n\n- something at `x.go:1`\n"); ok {
t.Error("want ok=false when there is no block")
}
// Malformed JSON -> ok=false (graceful fallback).
bad := "Minor issues\n\n```gadfly-findings\n{not json}\n```\n"
if _, ok := extractStructuredFindings(bad); ok {
t.Error("want ok=false for malformed JSON")
}
}
func TestExtractStructuredFindingsStringLine(t *testing.T) {
// Tolerate a quoted line number from a less-precise model.
out := "Minor issues\n\n```gadfly-findings\n[{\"file\":\"a.go\",\"line\":\"42\",\"severity\":\"medium\",\"title\":\"x\"}]\n```\n"
fs, ok := extractStructuredFindings(out)
if !ok || len(fs) != 1 || fs[0].line != 42 {
t.Fatalf("want one finding at line 42, got ok=%v %+v", ok, fs)
}
}
func TestStripFindingsBlock(t *testing.T) {
stripped := stripFindingsBlock(sampleReview)
if strings.Contains(stripped, findingsFence) {
t.Errorf("block not stripped: %q", stripped)
}
// The prose findings must survive.
if !strings.Contains(stripped, "Unauthenticated endpoint") || !strings.Contains(stripped, "util.go:12") {
t.Errorf("prose lost during strip: %q", stripped)
}
}
func TestStripFindingsBlockUnterminated(t *testing.T) {
// A truncated, unterminated block must NOT swallow the prose before it.
out := "Minor issues\n\n- real finding at `x.go:1`\n\n```gadfly-findings\n[{\"file\":\"x.go\""
got := stripFindingsBlock(out)
if !strings.Contains(got, "real finding at `x.go:1`") {
t.Errorf("unterminated block swallowed the prose: %q", got)
}
}
func TestStripFindingsBlockNoBlock(t *testing.T) {
in := "Minor issues\n\n- finding at `x.go:9`"
if out := stripFindingsBlock(in); out != in {
t.Errorf("strip changed block-free text: %q != %q", out, in)
}
}
func TestNormalizeSeverity(t *testing.T) {
cases := map[string]string{
"Critical": "critical", "blocker": "critical",
"major": "high", "HIGH": "high",
"moderate": "medium",
"minor": "small", "low": "small", // "minor" and "low" both map to small (consistently)
"nit": "trivial", "Style": "trivial",
"weird": "weird", // unknown passes through, lowercased
}
for in, want := range cases {
if got := normalizeSeverity(in); got != want {
t.Errorf("normalizeSeverity(%q) = %q, want %q", in, got, want)
}
}
}
func TestNormalizeConfidence(t *testing.T) {
cases := map[string]string{
"High": "high", "confirmed": "high",
"MEDIUM": "medium", "moderate": "medium",
"low": "low", "unverified": "low",
"hunch": "hunch", // unknown passes through, lowercased
}
for in, want := range cases {
if got := normalizeConfidence(in); got != want {
t.Errorf("normalizeConfidence(%q) = %q, want %q", in, got, want)
}
}
}
func TestExtractStructuredFindingsFallbackOnEmpty(t *testing.T) {
// A non-clean lens that emitted an empty [] but listed prose findings must
// fall through to the heuristic scrape, not silently drop everything.
out := "Minor issues\n\n- bug at `pkg/a.go:7`\n\n```gadfly-findings\n[]\n```\n"
r := specialistResult{spec: Specialist{Name: "correctness"}, out: out, verdict: verdictMinor}
fs := extractStructuredFindingsOrScrape(r)
if len(fs) == 0 {
t.Fatal("empty [] must fall back to the heuristic scrape, got no findings")
}
if fs[0].file != "pkg/a.go" || fs[0].line != 7 {
t.Errorf("heuristic fallback wrong: %+v", fs[0])
}
}
func TestVerdictSeverity(t *testing.T) {
if verdictBlocking.severity() != "high" || verdictMinor.severity() != "small" || verdictUnknown.severity() != "trivial" {
t.Error("verdict.severity mapping changed unexpectedly")
}
}
+5
View File
@@ -49,6 +49,11 @@ Output rules:
- Do NOT invent new findings; this is a verification gate, not a fresh review.
- Do NOT include meta-commentary about the verification process or which
findings you dropped — output only the final, corrected review markdown.
- The draft ends with a fenced ` + "`gadfly-findings`" + ` JSON block. Regenerate it
so it lists ONLY the findings that SURVIVED your verification, in the same schema
({"file","line","severity","confidence","title"}; severity one of
critical/high/medium/small/trivial, confidence one of high/medium/low). If every
finding was dropped, emit an empty array ` + "`[]`" + `. Keep the block last.
- When done investigating, STOP calling tools and reply with the review.`
// recheckEnabled reports whether the verification pass should run. On unless
+18
View File
@@ -43,3 +43,21 @@ Output rules:
- Only report issues you are reasonably confident are real after checking. If the diff
is clean, say so plainly rather than inventing nits.
- When you are done investigating, STOP calling tools and reply with the final review.
Machine-readable findings — AFTER the prose review, append ONE fenced code block,
tagged `gadfly-findings`, holding a JSON array of the SAME findings you described above
(this block is consumed by tooling and hidden from the rendered comment):
```gadfly-findings
[
{"file": "path/to/file.go", "line": 123, "severity": "high", "confidence": "high", "title": "one-line summary of the issue"}
]
```
- One object per real finding, in the same order as your prose. `file`/`line` must be a
concrete location you verified (the line the issue is at). `severity` is one of
`critical`, `high`, `medium`, `small`, `trivial`. `confidence` is your post-verification
confidence the issue is real: one of `high`, `medium`, `low`.
- Include ONLY genuine problems — never verification notes ("confirmed X is safe at f:line"),
and never an "Outside my lens:" aside. If your lens is clean, emit an empty array `[]`.
- This block is in ADDITION to the prose; do not drop the human-readable findings.