From 53971603d3cf83629c506506e4b4d7042e1297fc Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sun, 28 Jun 2026 22:23:02 +0000 Subject: [PATCH] feat: structured findings contract (machine-readable gadfly-findings block) (#16) Co-authored-by: Steve Dudenhoeffer Co-committed-by: Steve Dudenhoeffer --- cmd/gadfly/consolidate.go | 20 ++- cmd/gadfly/emit.go | 43 +++++-- cmd/gadfly/emit_test.go | 9 +- cmd/gadfly/findings.go | 236 ++++++++++++++++++++++++++++++++++++ cmd/gadfly/findings_test.go | 146 ++++++++++++++++++++++ cmd/gadfly/recheck.go | 5 + scripts/system-prompt.txt | 18 +++ 7 files changed, 462 insertions(+), 15 deletions(-) create mode 100644 cmd/gadfly/findings.go create mode 100644 cmd/gadfly/findings_test.go diff --git a/cmd/gadfly/consolidate.go b/cmd/gadfly/consolidate.go index 47f2e26..d51d308 100644 --- a/cmd/gadfly/consolidate.go +++ b/cmd/gadfly/consolidate.go @@ -28,6 +28,22 @@ func (v verdict) label() string { } } +// severity maps a verdict to a canonical severity word (the same vocabulary as +// the structured findings: critical/high/medium/small/trivial). Used as the +// raw_severity for heuristic-scraped findings, which carry no per-finding +// severity of their own — so the telemetry store sees one consistent vocabulary +// instead of a mix of canonical words and full verdict phrases. +func (v verdict) severity() string { + switch v { + case verdictBlocking: + return "high" + case verdictMinor: + return "small" + default: + return "trivial" + } +} + // parseVerdict extracts a specialist's self-reported verdict from its output. // The base prompt tells each lens to lead with one of the three phrases. func parseVerdict(out string) verdict { @@ -100,7 +116,9 @@ func renderConsolidated(results []specialistResult) string { headline, len(results), strings.Join(specialistNames(results), ", ")) for _, r := range results { - body := strings.TrimSpace(r.out) + // Strip the machine-readable ```gadfly-findings block — it's for tooling + // (telemetry/consolidation), not for human readers of the comment. + body := strings.TrimSpace(stripFindingsBlock(r.out)) if body == "" { body = "_(no output)_" } diff --git a/cmd/gadfly/emit.go b/cmd/gadfly/emit.go index 2cfc9c0..fbac59d 100644 --- a/cmd/gadfly/emit.go +++ b/cmd/gadfly/emit.go @@ -10,11 +10,14 @@ package main // the review markdown consumed by run.sh) and never changes the exit code. // - It depends only on the Go stdlib (net/http). // -// Findings are extracted heuristically from each lens's markdown: a "path:line" -// reference (e.g. run/executor.go:166) anchors a finding, whose title is the -// nearest preceding markdown heading / numbered item / bold lead-in (else the -// first sentence of the finding's paragraph). This is best-effort signal for the -// store to aggregate — not a structured contract the reviewer guarantees. +// Findings come PRIMARILY from each lens's machine-readable ```gadfly-findings +// block (findings.go: exact file/line + per-finding severity/confidence). When a +// model emits no parseable block, emit falls back to a heuristic prose scrape: a +// "path:line" reference (e.g. run/executor.go:166) anchors a finding, whose title +// is the nearest preceding markdown heading / numbered item / bold lead-in (else +// the first sentence of the finding's paragraph). The scrape is best-effort +// signal for the store to aggregate, not a structured contract the reviewer +// guarantees. import ( "bytes" @@ -50,11 +53,16 @@ var pathLineRe = regexp.MustCompile(`([A-Za-z0-9_./-]+\.[A-Za-z0-9]+):(\d+)`) var numberedRe = regexp.MustCompile(`^\d+[.)]\s+(.+)$`) // finding is one extracted issue: where it points and a derived human title. +// severity/confidence are populated from the structured ```gadfly-findings block +// (extractStructuredFindings); they are empty for findings recovered by the +// heuristic prose scrape (parseFindings), which has no per-finding signal. type finding struct { - file string - line int - title string - detail string + file string + line int + title string + detail string + severity string // per-finding: critical/high/medium/small/trivial ("" if heuristic) + confidence string // per-finding: high/medium/low ("" if heuristic) } // runPayload is the POST /runs body. Field names match the gadfly-reports API EXACTLY. @@ -86,6 +94,7 @@ type reportPayload struct { Provider string `json:"provider"` RunID string `json:"run_id"` RawSeverity string `json:"raw_severity"` + Confidence string `json:"confidence"` // per-finding high/medium/low ("" if heuristic) Detail string `json:"detail"` } @@ -129,8 +138,19 @@ func emit(results []specialistResult, elapsed time.Duration) { if r.verdict == verdictClean { continue } - sev := r.verdict.label() - for _, f := range parseFindings(r.spec, r.out) { + // Prefer the model's machine-readable ```gadfly-findings block (exact + // location + per-finding severity). Fall back to the prose scrape when it + // yields NOTHING — no block, an unterminated/malformed one, or a block that + // parsed to zero usable findings on a non-clean lens (e.g. an empty [] + // alongside prose findings). Falling back on empty, not just on absent, + // keeps the safety net from being defeated by a contradictory empty block. + fs := extractStructuredFindingsOrScrape(r) + lensSev := r.verdict.severity() // canonical word for heuristic findings (no per-finding severity) + for _, f := range fs { + sev := f.severity + if sev == "" { + sev = lensSev + } reports = append(reports, reportPayload{ Repo: repo, PR: pr, @@ -142,6 +162,7 @@ func emit(results []specialistResult, elapsed time.Duration) { Provider: provider, RunID: runID, RawSeverity: sev, + Confidence: f.confidence, Detail: f.detail, }) } diff --git a/cmd/gadfly/emit_test.go b/cmd/gadfly/emit_test.go index fe1b29b..aab5b4b 100644 --- a/cmd/gadfly/emit_test.go +++ b/cmd/gadfly/emit_test.go @@ -157,7 +157,7 @@ func TestEmit_PostsRunsAndReports(t *testing.T) { if len(reportBody) != 2 { t.Fatalf("/reports array length = %d, want 2", len(reportBody)) } - for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "detail"} { + for _, k := range []string{"repo", "pr", "lens", "file", "line", "title", "model", "provider", "run_id", "raw_severity", "confidence", "detail"} { if _, ok := reportBody[0][k]; !ok { t.Errorf("/reports[0] missing field %q (got keys %v)", k, keysOf(reportBody[0])) } @@ -171,8 +171,11 @@ func TestEmit_PostsRunsAndReports(t *testing.T) { if reportBody[0]["line"] != float64(166) { t.Errorf("reports[0].line = %v, want 166", reportBody[0]["line"]) } - if reportBody[0]["raw_severity"] != "Blocking issues found" { - t.Errorf("reports[0].raw_severity = %v, want 'Blocking issues found'", reportBody[0]["raw_severity"]) + // No structured block in sampleLensMarkdown, so this is a heuristic-scraped + // finding: raw_severity is the canonical word derived from the lens verdict + // (Blocking -> "high"), not the full verdict phrase. + if reportBody[0]["raw_severity"] != "high" { + t.Errorf("reports[0].raw_severity = %v, want 'high'", reportBody[0]["raw_severity"]) } if reportBody[0]["run_id"] != "owner/repo#7:ollama-cloud/qwen3" { t.Errorf("reports[0].run_id = %v, want owner/repo#7:ollama-cloud/qwen3", reportBody[0]["run_id"]) diff --git a/cmd/gadfly/findings.go b/cmd/gadfly/findings.go new file mode 100644 index 0000000..d0a3be7 --- /dev/null +++ b/cmd/gadfly/findings.go @@ -0,0 +1,236 @@ +package main + +// Structured findings: the machine-readable contract between a lens's output and +// the telemetry/consolidation pipeline. Each lens is asked (see +// scripts/system-prompt.txt) to append a fenced ```gadfly-findings code block +// holding a JSON array of its findings. Parsing that exact block is far more +// reliable than scraping prose with a path:line regex (emit.go's heuristic), +// and it carries PER-FINDING severity + confidence the prose verdict can't. +// +// Everything here degrades gracefully: a missing, unterminated, or malformed +// block makes extractStructuredFindings return ok=false (and yield no findings), +// so the caller falls back to the heuristic scrape — a weak model that ignores +// the contract still contributes findings, exactly as before. + +import ( + "encoding/json" + "strconv" + "strings" +) + +// structuredFinding mirrors one element of the ```gadfly-findings JSON array. +// Line is json.Number so we tolerate both 123 and "123" from less-precise models. +type structuredFinding struct { + File string `json:"file"` + Line json.Number `json:"line"` + Severity string `json:"severity"` + Confidence string `json:"confidence"` + Title string `json:"title"` + Detail string `json:"detail"` // optional; the prose paragraph is used when absent +} + +// findingsFence is the info-string that tags the machine-readable block. +const findingsFence = "gadfly-findings" + +// extractStructuredFindings parses the ```gadfly-findings JSON block out of a +// lens's markdown. It returns the findings and ok=true when a TERMINATED block is +// present AND parses as a JSON array (an empty array is valid "nothing found" — +// ok is still true). A missing, unterminated, or unparseable block returns +// ok=false so the caller falls back to the heuristic scrape. +// +// Findings are deduped by file:line (keeping the first, matching parseFindings), +// findings with no usable file are dropped, and each title/detail is backfilled +// from the prose when the JSON omits it (best of both: exact location + the human +// context the model already wrote). +func extractStructuredFindings(out string) ([]finding, bool) { + lines := strings.Split(out, "\n") + start, end, ok := findingsSpan(lines) + if !ok { + return nil, false + } + var raw []structuredFinding + if err := json.Unmarshal([]byte(strings.Join(lines[start+1:end], "\n")), &raw); err != nil { + return nil, false + } + + findings := make([]finding, 0, len(raw)) + seen := map[string]bool{} + var prose map[string]string // built lazily — only if a finding lacks its own detail + for _, sf := range raw { + file := strings.TrimSpace(sf.File) + if file == "" { + continue + } + ln := 0 + if n, err := strconv.Atoi(strings.TrimSpace(sf.Line.String())); err == nil && n > 0 { + ln = n + } + key := file + ":" + strconv.Itoa(ln) + if ln > 0 { // only dedupe concrete locations; unknown-line findings are kept + if seen[key] { + continue + } + seen[key] = true + } + detail := strings.TrimSpace(sf.Detail) + if detail == "" && ln > 0 { + if prose == nil { + prose = proseParagraphs(out) + } + detail = prose[key] + } + title := strings.TrimSpace(sf.Title) + if title == "" { // never empty: fall back to the prose detail, then the location + if detail != "" { + title = truncate(detail, 120) + } else if ln > 0 { + title = key + } else { + title = file + } + } + findings = append(findings, finding{ + file: file, + line: ln, + title: title, + detail: truncate(detail, 500), + severity: normalizeSeverity(sf.Severity), + confidence: normalizeConfidence(sf.Confidence), + }) + if len(findings) >= maxFindingsPerLens { + break + } + } + return findings, true +} + +// extractStructuredFindingsOrScrape returns a lens's findings, preferring the +// structured ```gadfly-findings block and falling back to the heuristic prose +// scrape when the block is absent, unterminated/malformed, OR parsed to zero +// usable findings (e.g. an empty [] emitted alongside real prose findings). +// Factored out of emit() so the fallback rule is unit-testable. +func extractStructuredFindingsOrScrape(r specialistResult) []finding { + if fs, _ := extractStructuredFindings(r.out); len(fs) > 0 { + return fs + } + return parseFindings(r.spec, r.out) +} + +// stripFindingsBlock removes every TERMINATED ```gadfly-findings block from out so +// the machine-readable JSON never shows in the rendered comment. An UNTERMINATED +// fence is left in place — treating it as a block would swallow the rest of the +// comment (e.g. when a model's output was truncated mid-block). Trailing +// whitespace is trimmed. +func stripFindingsBlock(out string) string { + lines := strings.Split(out, "\n") + for { + start, end, ok := findingsSpan(lines) + if !ok { + break + } + lines = append(lines[:start:start], lines[end+1:]...) + } + return strings.TrimRight(strings.Join(lines, "\n"), "\n") +} + +// findingsSpan returns the [start,end] inclusive line indices of the first +// TERMINATED ```gadfly-findings block in lines (start = the opening fence, end = +// the closing fence), or ok=false when there is none or it is unterminated. +// extract and strip share it so they always agree on what is (and isn't) a block. +func findingsSpan(lines []string) (start, end int, ok bool) { + start = -1 + for i, ln := range lines { + if isFindingsOpen(ln) { + start = i + break + } + } + if start < 0 { + return 0, 0, false + } + for j := start + 1; j < len(lines); j++ { + if isFenceClose(lines[j]) { + return start, j, true + } + } + return 0, 0, false // unterminated +} + +// fenceInfo returns the info-string (text after the backticks) of a code-fence +// line and whether the line opens/closes a fence at all. A bare ``` yields ("", +// true); ```gadfly-findings yields ("gadfly-findings", true). +func fenceInfo(line string) (string, bool) { + t := strings.TrimSpace(line) + if !strings.HasPrefix(t, "```") { + return "", false + } + return strings.TrimSpace(strings.TrimLeft(t, "`")), true +} + +// isFindingsOpen reports whether line opens a ```gadfly-findings block, matching +// the info-string EXACTLY (not as a substring) so a fence like ```not-findings +// can't masquerade as ours. +func isFindingsOpen(line string) bool { + info, ok := fenceInfo(line) + return ok && info == findingsFence +} + +// isFenceClose reports whether line is a bare closing fence (``` with no info). +func isFenceClose(line string) bool { + info, ok := fenceInfo(line) + return ok && info == "" +} + +// proseParagraphs maps "file:line" -> the prose paragraph that first references +// it, so a structured finding without its own detail can borrow the human +// context the model already wrote. Built from the markdown OUTSIDE the findings +// block (the block is JSON, not prose). +func proseParagraphs(out string) map[string]string { + prose := stripFindingsBlock(out) + lines := strings.Split(prose, "\n") + m := map[string]string{} + for _, loc := range pathLineRe.FindAllStringSubmatchIndex(prose, -1) { + key := prose[loc[2]:loc[3]] + ":" + prose[loc[4]:loc[5]] + if _, dup := m[key]; dup { + continue + } + li := strings.Count(prose[:loc[0]], "\n") + m[key] = paragraphAt(lines, li) + } + return m +} + +// normalizeSeverity maps a model's severity word onto the canonical set +// (critical/high/medium/small/trivial), accepting common synonyms. An +// unrecognized value is returned lowercased so the store still sees the raw word. +func normalizeSeverity(s string) string { + switch t := strings.ToLower(strings.TrimSpace(s)); t { + case "critical", "crit", "blocker", "blocking": + return "critical" + case "high", "major", "severe": + return "high" + case "medium", "moderate": + return "medium" + case "small", "low", "minor": + return "small" + case "trivial", "nit", "nitpick", "info", "informational", "style", "cosmetic": + return "trivial" + default: + return t + } +} + +// normalizeConfidence maps a model's confidence word onto high/medium/low, +// accepting common synonyms; an unrecognized value is returned lowercased. +func normalizeConfidence(s string) string { + switch t := strings.ToLower(strings.TrimSpace(s)); t { + case "high", "certain", "confirmed", "verified": + return "high" + case "medium", "med", "moderate": + return "medium" + case "low", "unsure", "tentative", "unverified", "speculative": + return "low" + default: + return t + } +} diff --git a/cmd/gadfly/findings_test.go b/cmd/gadfly/findings_test.go new file mode 100644 index 0000000..c4b31f6 --- /dev/null +++ b/cmd/gadfly/findings_test.go @@ -0,0 +1,146 @@ +package main + +import ( + "strings" + "testing" +) + +const sampleReview = "**Blocking issues found**\n\n" + + "- **Unauthenticated endpoint** — `model.go:184` leaks PR content to a third party.\n" + + "- A nit about naming in `util.go:12`.\n\n" + + "```gadfly-findings\n" + + "[\n" + + " {\"file\": \"model.go\", \"line\": 184, \"severity\": \"high\", \"confidence\": \"high\", \"title\": \"Unauthenticated endpoint\"},\n" + + " {\"file\": \"util.go\", \"line\": 12, \"severity\": \"nit\", \"confidence\": \"medium\", \"title\": \"naming\"}\n" + + "]\n" + + "```\n" + +func TestExtractStructuredFindings(t *testing.T) { + fs, ok := extractStructuredFindings(sampleReview) + if !ok { + t.Fatal("expected ok=true for a well-formed block") + } + if len(fs) != 2 { + t.Fatalf("want 2 findings, got %d", len(fs)) + } + if fs[0].file != "model.go" || fs[0].line != 184 || fs[0].severity != "high" || fs[0].confidence != "high" { + t.Errorf("finding[0] mismatch: %+v", fs[0]) + } + // "nit" must normalize to the canonical "trivial". + if fs[1].severity != "trivial" { + t.Errorf("want severity normalized to trivial, got %q", fs[1].severity) + } + // Detail is borrowed from the prose paragraph referencing the same file:line. + if fs[0].detail == "" { + t.Error("expected detail borrowed from prose, got empty") + } +} + +func TestExtractStructuredFindingsEmptyArray(t *testing.T) { + out := "No material issues found\n\n```gadfly-findings\n[]\n```\n" + fs, ok := extractStructuredFindings(out) + if !ok { + t.Fatal("an empty array is a valid block; want ok=true") + } + if len(fs) != 0 { + t.Fatalf("want 0 findings, got %d", len(fs)) + } +} + +func TestExtractStructuredFindingsFallback(t *testing.T) { + // No block at all -> ok=false so the caller uses the heuristic scrape. + if _, ok := extractStructuredFindings("Minor issues\n\n- something at `x.go:1`\n"); ok { + t.Error("want ok=false when there is no block") + } + // Malformed JSON -> ok=false (graceful fallback). + bad := "Minor issues\n\n```gadfly-findings\n{not json}\n```\n" + if _, ok := extractStructuredFindings(bad); ok { + t.Error("want ok=false for malformed JSON") + } +} + +func TestExtractStructuredFindingsStringLine(t *testing.T) { + // Tolerate a quoted line number from a less-precise model. + out := "Minor issues\n\n```gadfly-findings\n[{\"file\":\"a.go\",\"line\":\"42\",\"severity\":\"medium\",\"title\":\"x\"}]\n```\n" + fs, ok := extractStructuredFindings(out) + if !ok || len(fs) != 1 || fs[0].line != 42 { + t.Fatalf("want one finding at line 42, got ok=%v %+v", ok, fs) + } +} + +func TestStripFindingsBlock(t *testing.T) { + stripped := stripFindingsBlock(sampleReview) + if strings.Contains(stripped, findingsFence) { + t.Errorf("block not stripped: %q", stripped) + } + // The prose findings must survive. + if !strings.Contains(stripped, "Unauthenticated endpoint") || !strings.Contains(stripped, "util.go:12") { + t.Errorf("prose lost during strip: %q", stripped) + } +} + +func TestStripFindingsBlockUnterminated(t *testing.T) { + // A truncated, unterminated block must NOT swallow the prose before it. + out := "Minor issues\n\n- real finding at `x.go:1`\n\n```gadfly-findings\n[{\"file\":\"x.go\"" + got := stripFindingsBlock(out) + if !strings.Contains(got, "real finding at `x.go:1`") { + t.Errorf("unterminated block swallowed the prose: %q", got) + } +} + +func TestStripFindingsBlockNoBlock(t *testing.T) { + in := "Minor issues\n\n- finding at `x.go:9`" + if out := stripFindingsBlock(in); out != in { + t.Errorf("strip changed block-free text: %q != %q", out, in) + } +} + +func TestNormalizeSeverity(t *testing.T) { + cases := map[string]string{ + "Critical": "critical", "blocker": "critical", + "major": "high", "HIGH": "high", + "moderate": "medium", + "minor": "small", "low": "small", // "minor" and "low" both map to small (consistently) + "nit": "trivial", "Style": "trivial", + "weird": "weird", // unknown passes through, lowercased + } + for in, want := range cases { + if got := normalizeSeverity(in); got != want { + t.Errorf("normalizeSeverity(%q) = %q, want %q", in, got, want) + } + } +} + +func TestNormalizeConfidence(t *testing.T) { + cases := map[string]string{ + "High": "high", "confirmed": "high", + "MEDIUM": "medium", "moderate": "medium", + "low": "low", "unverified": "low", + "hunch": "hunch", // unknown passes through, lowercased + } + for in, want := range cases { + if got := normalizeConfidence(in); got != want { + t.Errorf("normalizeConfidence(%q) = %q, want %q", in, got, want) + } + } +} + +func TestExtractStructuredFindingsFallbackOnEmpty(t *testing.T) { + // A non-clean lens that emitted an empty [] but listed prose findings must + // fall through to the heuristic scrape, not silently drop everything. + out := "Minor issues\n\n- bug at `pkg/a.go:7`\n\n```gadfly-findings\n[]\n```\n" + r := specialistResult{spec: Specialist{Name: "correctness"}, out: out, verdict: verdictMinor} + fs := extractStructuredFindingsOrScrape(r) + if len(fs) == 0 { + t.Fatal("empty [] must fall back to the heuristic scrape, got no findings") + } + if fs[0].file != "pkg/a.go" || fs[0].line != 7 { + t.Errorf("heuristic fallback wrong: %+v", fs[0]) + } +} + +func TestVerdictSeverity(t *testing.T) { + if verdictBlocking.severity() != "high" || verdictMinor.severity() != "small" || verdictUnknown.severity() != "trivial" { + t.Error("verdict.severity mapping changed unexpectedly") + } +} diff --git a/cmd/gadfly/recheck.go b/cmd/gadfly/recheck.go index 0255744..d23502c 100644 --- a/cmd/gadfly/recheck.go +++ b/cmd/gadfly/recheck.go @@ -49,6 +49,11 @@ Output rules: - Do NOT invent new findings; this is a verification gate, not a fresh review. - Do NOT include meta-commentary about the verification process or which findings you dropped — output only the final, corrected review markdown. +- The draft ends with a fenced ` + "`gadfly-findings`" + ` JSON block. Regenerate it + so it lists ONLY the findings that SURVIVED your verification, in the same schema + ({"file","line","severity","confidence","title"}; severity one of + critical/high/medium/small/trivial, confidence one of high/medium/low). If every + finding was dropped, emit an empty array ` + "`[]`" + `. Keep the block last. - When done investigating, STOP calling tools and reply with the review.` // recheckEnabled reports whether the verification pass should run. On unless diff --git a/scripts/system-prompt.txt b/scripts/system-prompt.txt index 28960c8..ff4ebe8 100644 --- a/scripts/system-prompt.txt +++ b/scripts/system-prompt.txt @@ -43,3 +43,21 @@ Output rules: - Only report issues you are reasonably confident are real after checking. If the diff is clean, say so plainly rather than inventing nits. - When you are done investigating, STOP calling tools and reply with the final review. + +Machine-readable findings — AFTER the prose review, append ONE fenced code block, +tagged `gadfly-findings`, holding a JSON array of the SAME findings you described above +(this block is consumed by tooling and hidden from the rendered comment): + +```gadfly-findings +[ + {"file": "path/to/file.go", "line": 123, "severity": "high", "confidence": "high", "title": "one-line summary of the issue"} +] +``` + +- One object per real finding, in the same order as your prose. `file`/`line` must be a + concrete location you verified (the line the issue is at). `severity` is one of + `critical`, `high`, `medium`, `small`, `trivial`. `confidence` is your post-verification + confidence the issue is real: one of `high`, `medium`, `low`. +- Include ONLY genuine problems — never verification notes ("confirmed X is safe at f:line"), + and never an "Outside my lens:" aside. If your lens is clean, emit an empty array `[]`. +- This block is in ADDITION to the prose; do not drop the human-readable findings.