feat: gadfly-reports — findings store + scoreboard daemon
Build & push image / build-and-push (push) Successful in 1m13s
CI / test (push) Successful in 10m39s

SQLite-backed HTTP store for Gadfly review findings, per-review run timings, and human/Claude grades, with a points-free per-model scoreboard. Pure fact store: it computes no points or rankings (the dashboard maps severity->points client-side and retunes without re-scoring). Findings are content-addressed by location so cross-model reports collapse for consensus; one grade per finding, latest wins. Pure-Go SQLite (CGO-free) + Docker image CI + tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-26 23:55:24 -04:00
parent 52dce5eb2f
commit ddcf42a3ce
16 changed files with 1269 additions and 27 deletions
+447
View File
@@ -0,0 +1,447 @@
package main
import (
"crypto/sha256"
"database/sql"
"encoding/hex"
"fmt"
"sort"
"strings"
"time"
_ "modernc.org/sqlite"
)
// gadfly-reports stores only RAW review facts: which model reported which finding, how
// long each model's review took, and a human/Claude grade (is_real + severity +
// usefulness). It deliberately does NOT compute points or rankings — the
// dashboard owns the scoring curve (severity -> points, value-per-minute), so it
// can be retuned without re-scoring or migrating stored data. The severity
// vocabulary below is the only scoring-related contract.
// validSeverities enumerates every severity a grade is allowed to attach to a
// finding that was judged real. Membership is an exact, lowercase match. The
// store gives these names no weight of its own; converting a severity into
// points is left entirely to the client.
var validSeverities = map[string]bool{
	"critical": true,
	"high":     true,
	"medium":   true,
	"small":    true,
	"trivial":  true,
}
// schema is the DDL that Open executes each time a database is opened. Every
// statement is guarded by IF NOT EXISTS, so re-applying it to an existing
// database leaves current tables, indexes and the view untouched.
//
// Layout:
//   - runs: one row per run_id, holding the timing/token/cost facts of a run.
//   - findings: one row per content-addressed finding id.
//   - reports: which run/model reported which finding; at most one row per
//     (finding_id, run_id).
//   - grades: append-only grade history per finding.
//   - latest_grades: view selecting, per finding, the grade with the highest id.
//
// No REFERENCES clauses are declared, so the database itself does not enforce
// that reports or grades point at an existing finding or run.
const schema = `
CREATE TABLE IF NOT EXISTS runs (
run_id TEXT PRIMARY KEY,
repo TEXT NOT NULL,
pr INTEGER NOT NULL,
model TEXT NOT NULL,
provider TEXT NOT NULL,
lenses INTEGER NOT NULL DEFAULT 0,
duration_secs REAL NOT NULL DEFAULT 0,
input_tokens INTEGER,
output_tokens INTEGER,
cost_usd REAL,
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS findings (
id TEXT PRIMARY KEY,
repo TEXT NOT NULL,
pr INTEGER NOT NULL,
lens TEXT NOT NULL,
file TEXT,
line INTEGER,
title TEXT NOT NULL,
first_seen TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS reports (
id INTEGER PRIMARY KEY AUTOINCREMENT,
finding_id TEXT NOT NULL,
run_id TEXT NOT NULL,
model TEXT NOT NULL,
provider TEXT NOT NULL,
raw_severity TEXT,
detail TEXT,
created_at TEXT NOT NULL,
UNIQUE(finding_id, run_id)
);
CREATE INDEX IF NOT EXISTS idx_reports_finding ON reports(finding_id);
CREATE INDEX IF NOT EXISTS idx_reports_model ON reports(model);
CREATE TABLE IF NOT EXISTS grades (
id INTEGER PRIMARY KEY AUTOINCREMENT,
finding_id TEXT NOT NULL,
is_real INTEGER NOT NULL,
severity TEXT,
usefulness INTEGER,
notes TEXT,
grader TEXT,
created_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_grades_finding ON grades(finding_id);
-- latest_grades: the most recent grade per finding (grade history is kept; the
-- latest wins). Used by every read path so a re-grade supersedes the old one.
CREATE VIEW IF NOT EXISTS latest_grades AS
SELECT g.* FROM grades g
JOIN (SELECT finding_id, MAX(id) AS max_id FROM grades GROUP BY finding_id) m
ON g.id = m.max_id;
`
// Store wraps the database handle behind every read and write of review facts.
// Obtain one from Open and release it with Close.
type Store struct {
	db *sql.DB
}
// Open connects to the SQLite database file at path (per the original note,
// creating it if needed) and applies schema before handing back a ready Store.
//
// The connection string enables a 5000 ms busy timeout, WAL journaling and
// foreign-key enforcement through _pragma parameters. The pool is capped at a
// single connection, so every query issued through the Store is serialized and
// a result set must be closed before the next statement can run.
//
// On failure the returned Store is nil and the error is wrapped with either
// "open <path>" or "migrate".
func Open(path string) (*Store, error) {
	dsn := "file:" + path + "?_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)"

	handle, err := sql.Open("sqlite", dsn)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", path, err)
	}

	// modernc's pure-Go driver is happiest with one connection doing the writing.
	handle.SetMaxOpenConns(1)

	_, err = handle.Exec(schema)
	if err == nil {
		return &Store{db: handle}, nil
	}

	// The schema could not be applied: drop the handle and report why.
	handle.Close()
	return nil, fmt.Errorf("migrate: %w", err)
}
// Close releases the underlying database handle and returns whatever error the
// handle reports while closing.
func (s *Store) Close() error {
	return s.db.Close()
}
// now returns the current instant, converted to UTC and rendered in the
// time.RFC3339 layout (whole seconds, "Z" suffix). It is the timestamp format
// used for every *_at / first_seen column this file writes.
func now() string {
	stamp := time.Now().UTC()
	return stamp.Format(time.RFC3339)
}
// findingID derives a finding's identifier from WHERE it was raised rather than
// how it was worded: repo, PR number, lens, file and line. Two reports that
// agree on those five values — whichever model produced them, on a first review
// or a re-review — therefore map to the same finding, which is what lets
// cross-model consensus and per-model precision be measured. The title is
// deliberately left out of the key.
//
// repo, lens and file are whitespace-trimmed and lens is additionally
// lowercased before hashing. The result is the first 16 hex characters (8
// bytes) of the SHA-256 of the "|"-joined key.
func findingID(repo string, pr int, lens, file string, line int) string {
	normRepo := strings.TrimSpace(repo)
	normLens := strings.ToLower(strings.TrimSpace(lens))
	normFile := strings.TrimSpace(file)

	location := fmt.Sprintf("%s|%d|%s|%s|%d", normRepo, pr, normLens, normFile, line)
	digest := sha256.Sum256([]byte(location))

	// 8 digest bytes encode to exactly 16 hex characters.
	return hex.EncodeToString(digest[:8])
}
// Run describes a single model's review of a single PR — the unit that run.sh
// times. It is the payload AddRun persists.
type Run struct {
	// RunID identifies the run; AddRun upserts on it and rejects a blank value.
	RunID string `json:"run_id"`
	// Repo and PR name the pull request that was reviewed.
	Repo string `json:"repo"`
	PR   int    `json:"pr"`
	// Model is required by AddRun; Provider is stored alongside it as given.
	Model    string `json:"model"`
	Provider string `json:"provider"`
	// Lenses is stored as reported by the caller.
	Lenses int `json:"lenses"`
	// DurationSecs is the run's duration in seconds.
	DurationSecs float64 `json:"duration_secs"`
	// Token counts and cost are optional; their columns are nullable.
	InputTokens  *int64   `json:"input_tokens,omitempty"`
	OutputTokens *int64   `json:"output_tokens,omitempty"`
	CostUSD      *float64 `json:"cost_usd,omitempty"`
}
// AddRun stores r keyed by its run_id. Posting the same run_id again replaces
// the repo, PR, model, provider, lens count, duration, tokens and cost of the
// existing row; created_at is not part of the update, so the row keeps the
// timestamp of its first insert.
//
// An error is returned when run_id or model is blank after trimming, or when
// the database rejects the statement.
func (s *Store) AddRun(r Run) error {
	for _, required := range []string{r.RunID, r.Model} {
		if strings.TrimSpace(required) == "" {
			return fmt.Errorf("run_id and model are required")
		}
	}

	const upsert = `
INSERT INTO runs (run_id, repo, pr, model, provider, lenses, duration_secs, input_tokens, output_tokens, cost_usd, created_at)
VALUES (?,?,?,?,?,?,?,?,?,?,?)
ON CONFLICT(run_id) DO UPDATE SET
repo=excluded.repo, pr=excluded.pr, model=excluded.model, provider=excluded.provider,
lenses=excluded.lenses, duration_secs=excluded.duration_secs,
input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens, cost_usd=excluded.cost_usd`

	args := []any{
		r.RunID, r.Repo, r.PR, r.Model, r.Provider, r.Lenses, r.DurationSecs,
		r.InputTokens, r.OutputTokens, r.CostUSD, now(),
	}
	if _, err := s.db.Exec(upsert, args...); err != nil {
		return err
	}
	return nil
}
// ReportIn is a single finding exactly as one model reported it; AddReports
// accepts a batch of these.
type ReportIn struct {
	// Location of the finding. Repo, PR, Lens, File and Line together determine
	// the finding id; Lens must be non-blank.
	Repo string `json:"repo"`
	PR   int    `json:"pr"`
	Lens string `json:"lens"`
	File string `json:"file"`
	Line int    `json:"line"`
	// Title is required but does not take part in the finding id.
	Title string `json:"title"`
	// Who reported it, and in which run.
	Model    string `json:"model"`
	Provider string `json:"provider"`
	RunID    string `json:"run_id"`
	// The model's own severity label and free-form detail, stored unmodified.
	RawSeverity string `json:"raw_severity"`
	Detail      string `json:"detail"`
}
// AddReports records a batch of findings atomically. For each input it upserts
// the content-addressed finding row and then adds this run's report of it.
//
// The returned slice holds the finding id for every input, in input order. On
// any error the slice is nil and nothing from the batch is persisted.
//
// Behavior worth knowing:
//   - Every report needs a non-blank lens and title. The whole batch is
//     validated before a transaction is opened, so a bad item never costs a
//     write.
//   - The first report of a finding fixes its stored title and first_seen;
//     later reports of the same id leave the finding row alone.
//   - A second report of the same finding within the same run is a no-op
//     (UNIQUE finding_id, run_id).
//   - repo, lens and file are stored in the same normalized form findingID
//     hashes (trimmed; lens lowercased), so a stored row always agrees with
//     its own id.
func (s *Store) AddReports(in []ReportIn) ([]string, error) {
	for i, r := range in {
		if strings.TrimSpace(r.Title) == "" || strings.TrimSpace(r.Lens) == "" {
			return nil, fmt.Errorf("report %d: lens and title are required", i)
		}
	}

	tx, err := s.db.Begin()
	if err != nil {
		return nil, fmt.Errorf("begin: %w", err)
	}
	// Rollback after a successful Commit is a harmless no-op; on every early
	// return it discards the partial batch.
	defer tx.Rollback()

	ts := now()
	ids := make([]string, len(in))
	for i, r := range in {
		repo := strings.TrimSpace(r.Repo)
		lens := strings.ToLower(strings.TrimSpace(r.Lens))
		file := strings.TrimSpace(r.File)

		id := findingID(r.Repo, r.PR, r.Lens, r.File, r.Line)
		ids[i] = id

		if _, err := tx.Exec(`
INSERT INTO findings (id, repo, pr, lens, file, line, title, first_seen)
VALUES (?,?,?,?,?,?,?,?) ON CONFLICT(id) DO NOTHING`,
			id, repo, r.PR, lens, file, r.Line, r.Title, ts); err != nil {
			return nil, fmt.Errorf("report %d: upsert finding: %w", i, err)
		}
		if _, err := tx.Exec(`
INSERT INTO reports (finding_id, run_id, model, provider, raw_severity, detail, created_at)
VALUES (?,?,?,?,?,?,?) ON CONFLICT(finding_id, run_id) DO NOTHING`,
			id, r.RunID, r.Model, r.Provider, r.RawSeverity, r.Detail, ts); err != nil {
			return nil, fmt.Errorf("report %d: insert report: %w", i, err)
		}
	}

	// Only hand back ids once the batch is durably committed.
	if err := tx.Commit(); err != nil {
		return nil, fmt.Errorf("commit: %w", err)
	}
	return ids, nil
}
// Grade is one triage verdict on a finding. It carries no points: the client
// translates severity into points itself.
type Grade struct {
	// FindingID names the finding being graded; it must already exist.
	FindingID string `json:"finding_id"`
	// IsReal is the verdict: true for a confirmed finding, false for a false
	// positive.
	IsReal bool `json:"is_real"`
	// Severity must be one of validSeverities when IsReal is true and is
	// discarded when IsReal is false.
	Severity string `json:"severity,omitempty"`
	// Usefulness is optional; when set it must lie in 1..5.
	Usefulness *int `json:"usefulness,omitempty"`
	// Notes and Grader are optional free text; empty values are stored as NULL.
	Notes  string `json:"notes,omitempty"`
	Grader string `json:"grader,omitempty"`
}
// AddGrade appends g to the grade history of its finding. Earlier grades are
// kept; read paths consult only the most recent one.
//
// Checks run in this order, and the first failure is returned:
//  1. finding_id must be non-blank;
//  2. the finding must already exist;
//  3. for a real finding, the trimmed, lowercased severity must be one of
//     validSeverities (for a false positive any severity is dropped);
//  4. usefulness, when supplied, must lie in 1..5.
//
// Empty severity, notes and grader are written as NULL.
func (s *Store) AddGrade(g Grade) error {
	if strings.TrimSpace(g.FindingID) == "" {
		return fmt.Errorf("finding_id is required")
	}

	var known bool
	lookup := s.db.QueryRow(`SELECT EXISTS(SELECT 1 FROM findings WHERE id=?)`, g.FindingID)
	if err := lookup.Scan(&known); err != nil {
		return err
	}
	if !known {
		return fmt.Errorf("unknown finding_id %q", g.FindingID)
	}

	severity := strings.ToLower(strings.TrimSpace(g.Severity))
	switch {
	case !g.IsReal:
		// A false positive carries no severity at all.
		severity = ""
	case !validSeverities[severity]:
		return fmt.Errorf("severity %q invalid for a real finding (want one of: %s)", g.Severity, strings.Join(sortedSeverities(), ", "))
	}

	if u := g.Usefulness; u != nil {
		if *u < 1 || *u > 5 {
			return fmt.Errorf("usefulness must be 1..5, got %d", *u)
		}
	}

	const insert = `
INSERT INTO grades (finding_id, is_real, severity, usefulness, notes, grader, created_at)
VALUES (?,?,?,?,?,?,?)`
	_, err := s.db.Exec(insert,
		g.FindingID, g.IsReal, nullStr(severity), g.Usefulness, nullStr(g.Notes), nullStr(g.Grader), now())
	return err
}
// ExportRow is the flat, dashboard-facing shape of one report: the report
// itself joined with its finding, the timing of its run, and the finding's
// latest grade. Until a finding has been graded, Graded is false and the grade
// fields stay nil/empty.
type ExportRow struct {
	// Finding.
	FindingID string `json:"finding_id"`
	Repo      string `json:"repo"`
	PR        int    `json:"pr"`
	Lens      string `json:"lens"`
	File      string `json:"file,omitempty"`
	Line      int    `json:"line,omitempty"`
	Title     string `json:"title"`

	// Report.
	Model       string `json:"model"`
	Provider    string `json:"provider,omitempty"`
	RunID       string `json:"run_id"`
	RawSeverity string `json:"raw_severity,omitempty"`
	ReportedAt  string `json:"reported_at"`

	// Run timing; DurationSecs is 0 when no run row matches RunID.
	DurationSecs float64 `json:"duration_secs"`
	InputTokens  *int64  `json:"input_tokens,omitempty"`
	OutputTokens *int64  `json:"output_tokens,omitempty"`

	// Latest grade, if any.
	Graded     bool   `json:"graded"`
	IsReal     *bool  `json:"is_real,omitempty"`
	Severity   string `json:"severity,omitempty"`
	Usefulness *int   `json:"usefulness,omitempty"`
	Notes      string `json:"notes,omitempty"`
	Grader     string `json:"grader,omitempty"`
	GradedAt   string `json:"graded_at,omitempty"`
}
// Export lists every report, each joined with its finding, the timing of its
// run (when a matching run row exists) and the finding's latest grade (when
// one exists). Rows are ordered by report creation time, then report id. No
// weighting happens here; the dashboard works from these raw rows.
//
// With no reports the returned slice is nil. Nullable columns map to the zero
// value or nil of their field; grade fields are filled in only when a latest
// grade is present.
func (s *Store) Export() ([]ExportRow, error) {
	const query = `
SELECT r.finding_id, f.repo, f.pr, f.lens, f.file, f.line, f.title,
r.model, r.provider, r.run_id, r.raw_severity, r.created_at,
COALESCE(ru.duration_secs, 0), ru.input_tokens, ru.output_tokens,
lg.is_real, lg.severity, lg.usefulness, lg.notes, lg.grader, lg.created_at
FROM reports r
JOIN findings f ON f.id = r.finding_id
LEFT JOIN runs ru ON ru.run_id = r.run_id
LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
ORDER BY r.created_at, r.id`

	rows, err := s.db.Query(query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var exported []ExportRow
	for rows.Next() {
		var (
			row                               ExportRow
			file, rawSeverity                 sql.NullString
			line                              sql.NullInt64
			isReal                            sql.NullBool
			severity, notes, grader, gradedAt sql.NullString
			usefulness                        sql.NullInt64
		)
		// Destinations are listed in the exact order of the SELECT columns.
		dest := []any{
			&row.FindingID, &row.Repo, &row.PR, &row.Lens, &file, &line, &row.Title,
			&row.Model, &row.Provider, &row.RunID, &rawSeverity, &row.ReportedAt,
			&row.DurationSecs, &row.InputTokens, &row.OutputTokens,
			&isReal, &severity, &usefulness, &notes, &grader, &gradedAt,
		}
		if err := rows.Scan(dest...); err != nil {
			return nil, err
		}

		row.File = file.String
		row.Line = int(line.Int64)
		row.RawSeverity = rawSeverity.String

		// is_real is NOT NULL in grades, so a NULL here means "no grade yet".
		if !isReal.Valid {
			exported = append(exported, row)
			continue
		}

		verdict := isReal.Bool
		row.Graded = true
		row.IsReal = &verdict
		row.Severity = severity.String
		row.Notes = notes.String
		row.Grader = grader.String
		row.GradedAt = gradedAt.String
		if usefulness.Valid {
			score := int(usefulness.Int64)
			row.Usefulness = &score
		}
		exported = append(exported, row)
	}
	return exported, rows.Err()
}
// ModelStat is one model's row on the scoreboard. It is deliberately free of
// points: it exposes raw minutes, raw token totals and a histogram of confirmed
// findings by severity, leaving points and value-per-minute/token weighting to
// the client.
type ModelStat struct {
	Model    string `json:"model"`
	Provider string `json:"provider,omitempty"`

	// Effort, summed over the model's runs.
	Runs         int     `json:"runs"`
	Minutes      float64 `json:"minutes"`
	InputTokens  int64   `json:"input_tokens"`
	OutputTokens int64   `json:"output_tokens"`

	// Distinct findings the model reported, split by latest-grade state.
	Findings      int `json:"findings"`
	Confirmed     int `json:"confirmed"`
	FalsePositive int `json:"false_positive"`
	Ungraded      int `json:"ungraded"`

	// BySeverity counts confirmed findings per severity.
	BySeverity map[string]int `json:"by_severity"`
}
// Scoreboard rolls runs, reports and latest grades up into one ModelStat per
// model, sorted by model name. Every finding count is DISTINCT by finding, so a
// model that re-reports the same finding across runs is counted once.
//
// A model's Provider comes from the first runs row encountered for it; a model
// that appears only in reports (no run posted) has an empty Provider. A model
// with runs under several providers still yields a single entry whose run
// totals are summed across them.
//
// The three queries run one after another, each in its own helper, so every
// result set is closed before the next query starts. Any query, scan or
// iteration error aborts the rollup and is returned with a nil slice.
func (s *Store) Scoreboard() ([]ModelStat, error) {
	stats := map[string]*ModelStat{}
	get := func(model, provider string) *ModelStat {
		m, ok := stats[model]
		if !ok {
			m = &ModelStat{Model: model, Provider: provider, BySeverity: map[string]int{}}
			stats[model] = m
		}
		return m
	}

	if err := s.scoreboardRuns(get); err != nil {
		return nil, err
	}
	if err := s.scoreboardFindings(get); err != nil {
		return nil, err
	}
	if err := s.scoreboardSeverities(get); err != nil {
		return nil, err
	}

	out := make([]ModelStat, 0, len(stats))
	for _, m := range stats {
		out = append(out, *m)
	}
	sort.Slice(out, func(i, j int) bool { return out[i].Model < out[j].Model })
	return out, nil
}

// scoreboardRuns adds run counts, minutes and token totals to each model.
func (s *Store) scoreboardRuns(get func(model, provider string) *ModelStat) error {
	rows, err := s.db.Query(`
SELECT model, provider, COUNT(*), COALESCE(SUM(duration_secs),0),
COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0)
FROM runs GROUP BY model, provider`)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		var model, provider string
		var runs int
		var dur float64
		var in, out int64
		if err := rows.Scan(&model, &provider, &runs, &dur, &in, &out); err != nil {
			return err
		}
		m := get(model, provider)
		m.Runs += runs
		m.Minutes += dur / 60 // stored in seconds, reported in minutes
		m.InputTokens += in
		m.OutputTokens += out
	}
	// Next returning false may mean failure, not exhaustion.
	return rows.Err()
}

// scoreboardFindings sets each model's distinct finding counts, split by the
// state of the latest grade (confirmed, false positive, ungraded).
func (s *Store) scoreboardFindings(get func(model, provider string) *ModelStat) error {
	rows, err := s.db.Query(`
SELECT r.model,
COUNT(DISTINCT r.finding_id),
COUNT(DISTINCT CASE WHEN lg.is_real=1 THEN r.finding_id END),
COUNT(DISTINCT CASE WHEN lg.is_real=0 THEN r.finding_id END),
COUNT(DISTINCT CASE WHEN lg.is_real IS NULL THEN r.finding_id END)
FROM reports r LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
GROUP BY r.model`)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		var model string
		var total, confirmed, fp, ungraded int
		if err := rows.Scan(&model, &total, &confirmed, &fp, &ungraded); err != nil {
			return err
		}
		m := get(model, "")
		m.Findings, m.Confirmed, m.FalsePositive, m.Ungraded = total, confirmed, fp, ungraded
	}
	return rows.Err()
}

// scoreboardSeverities fills each model's histogram of confirmed findings per
// severity (distinct findings).
func (s *Store) scoreboardSeverities(get func(model, provider string) *ModelStat) error {
	rows, err := s.db.Query(`
SELECT r.model, lg.severity, COUNT(DISTINCT r.finding_id)
FROM reports r JOIN latest_grades lg ON lg.finding_id = r.finding_id
WHERE lg.is_real=1 AND lg.severity IS NOT NULL
GROUP BY r.model, lg.severity`)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		var model, sev string
		var n int
		if err := rows.Scan(&model, &sev, &n); err != nil {
			return err
		}
		get(model, "").BySeverity[sev] = n
	}
	return rows.Err()
}
// sortedSeverities returns the keys of validSeverities in ascending
// lexicographic order, giving error messages a stable listing despite Go's
// randomized map iteration.
func sortedSeverities() []string {
	names := make([]string, len(validSeverities))
	next := 0
	for name := range validSeverities {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
// nullStr converts a Go string into a SQL argument: the empty string becomes
// nil (stored as NULL) and any other string is passed through untouched.
func nullStr(s string) any {
	if len(s) > 0 {
		return s
	}
	return nil
}