feat: gadfly-reports — findings store + scoreboard daemon
SQLite-backed HTTP store for Gadfly review findings, per-review run timings, and human/Claude grades, with a points-free per-model scoreboard. Pure fact store: it computes no points or rankings (the dashboard maps severity->points client-side and retunes without re-scoring). Findings are content-addressed by location so cross-model reports collapse for consensus; one grade per finding, latest wins. Pure-Go SQLite (CGO-free) + Docker image CI + tests. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,447 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"database/sql"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// gadfly-reports stores only RAW review facts: which model reported which finding, how
|
||||
// long each model's review took, and a human/Claude grade (is_real + severity +
|
||||
// usefulness). It deliberately does NOT compute points or rankings — the
|
||||
// dashboard owns the scoring curve (severity -> points, value-per-minute), so it
|
||||
// can be retuned without re-scoring or migrating stored data. The severity
|
||||
// vocabulary below is the only scoring-related contract.
|
||||
|
||||
// validSeverities is the closed vocabulary of severities a grade may attach to
// a finding judged REAL. The store never weighs them; the client is free to map
// each one to whatever point value it wants (e.g. trivial=1 … critical=20).
var validSeverities = map[string]bool{
	"critical": true,
	"high":     true,
	"medium":   true,
	"small":    true,
	"trivial":  true,
}
|
||||
|
||||
// schema is the idempotent DDL applied on every Open: four fact tables (runs,
// findings, reports, grades), their lookup indexes, and the latest_grades view.
//
// reports.finding_id and grades.finding_id reference findings(id) so that the
// foreign_keys(on) pragma set by Open actually enforces something: a report or
// grade can never point at a finding that does not exist. reports.run_id is
// left unconstrained on purpose — the export query LEFT JOINs runs, so a report
// may be stored before (or without) its run row.
const schema = `
CREATE TABLE IF NOT EXISTS runs (
  run_id        TEXT PRIMARY KEY,
  repo          TEXT NOT NULL,
  pr            INTEGER NOT NULL,
  model         TEXT NOT NULL,
  provider      TEXT NOT NULL,
  lenses        INTEGER NOT NULL DEFAULT 0,
  duration_secs REAL NOT NULL DEFAULT 0,
  input_tokens  INTEGER,
  output_tokens INTEGER,
  cost_usd      REAL,
  created_at    TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS findings (
  id         TEXT PRIMARY KEY,
  repo       TEXT NOT NULL,
  pr         INTEGER NOT NULL,
  lens       TEXT NOT NULL,
  file       TEXT,
  line       INTEGER,
  title      TEXT NOT NULL,
  first_seen TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS reports (
  id           INTEGER PRIMARY KEY AUTOINCREMENT,
  finding_id   TEXT NOT NULL REFERENCES findings(id),
  run_id       TEXT NOT NULL,
  model        TEXT NOT NULL,
  provider     TEXT NOT NULL,
  raw_severity TEXT,
  detail       TEXT,
  created_at   TEXT NOT NULL,
  UNIQUE(finding_id, run_id)
);
CREATE INDEX IF NOT EXISTS idx_reports_finding ON reports(finding_id);
CREATE INDEX IF NOT EXISTS idx_reports_model ON reports(model);

CREATE TABLE IF NOT EXISTS grades (
  id         INTEGER PRIMARY KEY AUTOINCREMENT,
  finding_id TEXT NOT NULL REFERENCES findings(id),
  is_real    INTEGER NOT NULL,
  severity   TEXT,
  usefulness INTEGER,
  notes      TEXT,
  grader     TEXT,
  created_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_grades_finding ON grades(finding_id);

-- latest_grades: the most recent grade per finding (grade history is kept; the
-- latest wins). Used by every read path so a re-grade supersedes the old one.
CREATE VIEW IF NOT EXISTS latest_grades AS
  SELECT g.* FROM grades g
  JOIN (SELECT finding_id, MAX(id) AS max_id FROM grades GROUP BY finding_id) m
    ON g.id = m.max_id;
`
|
||||
|
||||
// Store is the fact store; every method operates on the SQLite handle it wraps.
type Store struct {
	db *sql.DB
}
|
||||
|
||||
// Open opens (creating if needed) the SQLite database at path and applies the
|
||||
// schema. WAL + a busy timeout keep the single-writer daemon honest under the
|
||||
// occasional concurrent reader.
|
||||
func Open(path string) (*Store, error) {
|
||||
db, err := sql.Open("sqlite", "file:"+path+"?_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open %s: %w", path, err)
|
||||
}
|
||||
// modernc's pure-Go driver is happiest with a single writer connection.
|
||||
db.SetMaxOpenConns(1)
|
||||
if _, err := db.Exec(schema); err != nil {
|
||||
db.Close()
|
||||
return nil, fmt.Errorf("migrate: %w", err)
|
||||
}
|
||||
return &Store{db: db}, nil
|
||||
}
|
||||
|
||||
// Close closes the underlying database handle and reports any error from it.
func (s *Store) Close() error {
	return s.db.Close()
}
|
||||
|
||||
// now renders the current instant, converted to UTC, as an RFC 3339 string.
// Every created_at / first_seen column is stamped with this format.
func now() string {
	stamp := time.Now().UTC()
	return stamp.Format(time.RFC3339)
}
|
||||
|
||||
// findingID derives a finding's identity from WHERE it is, never from how it is
// worded: repo, PR number, lens, file and line. Two models flagging the same
// spot (or one model flagging it again on a re-review) therefore land on the
// same finding and differ only in their reports, which is what makes
// cross-model consensus and per-model precision measurable. The title is
// deliberately left out of the key.
//
// repo, lens and file are whitespace-trimmed and lens is lower-cased before
// hashing. The result is the first 16 hex characters of the SHA-256 digest of
// "repo|pr|lens|file|line".
func findingID(repo string, pr int, lens, file string, line int) string {
	normRepo := strings.TrimSpace(repo)
	normLens := strings.ToLower(strings.TrimSpace(lens))
	normFile := strings.TrimSpace(file)

	hasher := sha256.New()
	fmt.Fprintf(hasher, "%s|%d|%s|%s|%d", normRepo, pr, normLens, normFile, line)

	digest := hex.EncodeToString(hasher.Sum(nil))
	return digest[:16]
}
|
||||
|
||||
// Run describes a single model reviewing a single PR — the unit that run.sh
// times. Token counts and cost are optional and are omitted from JSON when
// absent.
type Run struct {
	// Identity of the run and what it reviewed.
	RunID string `json:"run_id"`
	Repo  string `json:"repo"`
	PR    int    `json:"pr"`

	// Who performed the review.
	Model    string `json:"model"`
	Provider string `json:"provider"`

	// Raw effort measurements.
	Lenses       int     `json:"lenses"`
	DurationSecs float64 `json:"duration_secs"`

	// Optional usage figures; nil means "not reported".
	InputTokens  *int64   `json:"input_tokens,omitempty"`
	OutputTokens *int64   `json:"output_tokens,omitempty"`
	CostUSD      *float64 `json:"cost_usd,omitempty"`
}
|
||||
|
||||
// AddRun upserts a run by run_id (a re-posted run overwrites timing/tokens).
|
||||
func (s *Store) AddRun(r Run) error {
|
||||
if strings.TrimSpace(r.RunID) == "" || strings.TrimSpace(r.Model) == "" {
|
||||
return fmt.Errorf("run_id and model are required")
|
||||
}
|
||||
_, err := s.db.Exec(`
|
||||
INSERT INTO runs (run_id, repo, pr, model, provider, lenses, duration_secs, input_tokens, output_tokens, cost_usd, created_at)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?)
|
||||
ON CONFLICT(run_id) DO UPDATE SET
|
||||
repo=excluded.repo, pr=excluded.pr, model=excluded.model, provider=excluded.provider,
|
||||
lenses=excluded.lenses, duration_secs=excluded.duration_secs,
|
||||
input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens, cost_usd=excluded.cost_usd`,
|
||||
r.RunID, r.Repo, r.PR, r.Model, r.Provider, r.Lenses, r.DurationSecs,
|
||||
r.InputTokens, r.OutputTokens, r.CostUSD, now())
|
||||
return err
|
||||
}
|
||||
|
||||
// ReportIn is the wire shape of one finding exactly as one model reported it.
type ReportIn struct {
	// Location of the finding; together with Lens these determine its id.
	Repo string `json:"repo"`
	PR   int    `json:"pr"`
	Lens string `json:"lens"`
	File string `json:"file"`
	Line int    `json:"line"`

	// Human-readable headline; not part of the finding's identity.
	Title string `json:"title"`

	// Which model, via which provider, in which run, produced this report.
	Model    string `json:"model"`
	Provider string `json:"provider"`
	RunID    string `json:"run_id"`

	// The model's own severity label and free-form explanation, stored verbatim.
	RawSeverity string `json:"raw_severity"`
	Detail      string `json:"detail"`
}
|
||||
|
||||
// AddReports records a batch of findings: each upserts its (content-addressed)
|
||||
// finding row and adds this model's report of it. Returns the finding id per
|
||||
// input (same order). A model re-reporting the same finding in the same run is a
|
||||
// no-op (UNIQUE finding_id,run_id).
|
||||
func (s *Store) AddReports(in []ReportIn) ([]string, error) {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
ts := now()
|
||||
ids := make([]string, len(in))
|
||||
for i, r := range in {
|
||||
if strings.TrimSpace(r.Title) == "" || strings.TrimSpace(r.Lens) == "" {
|
||||
return nil, fmt.Errorf("report %d: lens and title are required", i)
|
||||
}
|
||||
id := findingID(r.Repo, r.PR, r.Lens, r.File, r.Line)
|
||||
ids[i] = id
|
||||
if _, err := tx.Exec(`
|
||||
INSERT INTO findings (id, repo, pr, lens, file, line, title, first_seen)
|
||||
VALUES (?,?,?,?,?,?,?,?) ON CONFLICT(id) DO NOTHING`,
|
||||
id, r.Repo, r.PR, strings.ToLower(strings.TrimSpace(r.Lens)), r.File, r.Line, r.Title, ts); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if _, err := tx.Exec(`
|
||||
INSERT INTO reports (finding_id, run_id, model, provider, raw_severity, detail, created_at)
|
||||
VALUES (?,?,?,?,?,?,?) ON CONFLICT(finding_id, run_id) DO NOTHING`,
|
||||
id, r.RunID, r.Model, r.Provider, r.RawSeverity, r.Detail, ts); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return ids, tx.Commit()
|
||||
}
|
||||
|
||||
// Grade is one triage verdict on a finding. When IsReal is true, Severity is
// mandatory and must come from validSeverities; when IsReal is false, any
// Severity supplied is discarded. The store assigns no points — mapping
// severity to points is the client's job.
type Grade struct {
	// The finding being judged.
	FindingID string `json:"finding_id"`

	// The verdict itself.
	IsReal   bool   `json:"is_real"`
	Severity string `json:"severity,omitempty"`

	// Optional 1..5 usefulness rating; nil means "not rated".
	Usefulness *int `json:"usefulness,omitempty"`

	// Optional free-form context and who issued the grade.
	Notes  string `json:"notes,omitempty"`
	Grader string `json:"grader,omitempty"`
}
|
||||
|
||||
// AddGrade appends a grade (history is kept; latest wins).
|
||||
func (s *Store) AddGrade(g Grade) error {
|
||||
if strings.TrimSpace(g.FindingID) == "" {
|
||||
return fmt.Errorf("finding_id is required")
|
||||
}
|
||||
var exists bool
|
||||
if err := s.db.QueryRow(`SELECT EXISTS(SELECT 1 FROM findings WHERE id=?)`, g.FindingID).Scan(&exists); err != nil {
|
||||
return err
|
||||
}
|
||||
if !exists {
|
||||
return fmt.Errorf("unknown finding_id %q", g.FindingID)
|
||||
}
|
||||
sev := strings.ToLower(strings.TrimSpace(g.Severity))
|
||||
if g.IsReal {
|
||||
if !validSeverities[sev] {
|
||||
return fmt.Errorf("severity %q invalid for a real finding (want one of: %s)", g.Severity, strings.Join(sortedSeverities(), ", "))
|
||||
}
|
||||
} else {
|
||||
sev = "" // a false positive carries no severity
|
||||
}
|
||||
if g.Usefulness != nil && (*g.Usefulness < 1 || *g.Usefulness > 5) {
|
||||
return fmt.Errorf("usefulness must be 1..5, got %d", *g.Usefulness)
|
||||
}
|
||||
_, err := s.db.Exec(`
|
||||
INSERT INTO grades (finding_id, is_real, severity, usefulness, notes, grader, created_at)
|
||||
VALUES (?,?,?,?,?,?,?)`,
|
||||
g.FindingID, g.IsReal, nullStr(sev), g.Usefulness, nullStr(g.Notes), nullStr(g.Grader), now())
|
||||
return err
|
||||
}
|
||||
|
||||
// ExportRow is the flat, denormalised record a dashboard consumes: one report
// combined with its finding, the timing of the run that produced it, and the
// finding's latest grade. Until a finding has been graded, Graded is false and
// the grade fields stay nil/empty.
type ExportRow struct {
	// The finding (shared by every report of the same location).
	FindingID string `json:"finding_id"`
	Repo      string `json:"repo"`
	PR        int    `json:"pr"`
	Lens      string `json:"lens"`
	File      string `json:"file,omitempty"`
	Line      int    `json:"line,omitempty"`
	Title     string `json:"title"`

	// The individual report.
	Model       string `json:"model"`
	Provider    string `json:"provider,omitempty"`
	RunID       string `json:"run_id"`
	RawSeverity string `json:"raw_severity,omitempty"`
	ReportedAt  string `json:"reported_at"`

	// Timing and usage of the run; zero/nil when the run row is missing.
	DurationSecs float64 `json:"duration_secs"`
	InputTokens  *int64  `json:"input_tokens,omitempty"`
	OutputTokens *int64  `json:"output_tokens,omitempty"`

	// The latest grade, if any.
	Graded     bool   `json:"graded"`
	IsReal     *bool  `json:"is_real,omitempty"`
	Severity   string `json:"severity,omitempty"`
	Usefulness *int   `json:"usefulness,omitempty"`
	Notes      string `json:"notes,omitempty"`
	Grader     string `json:"grader,omitempty"`
	GradedAt   string `json:"graded_at,omitempty"`
}
|
||||
|
||||
// Export returns every report joined with finding, run timing, and latest grade,
|
||||
// oldest first. The dashboard does all weighting from these raw rows.
|
||||
func (s *Store) Export() ([]ExportRow, error) {
|
||||
rows, err := s.db.Query(`
|
||||
SELECT r.finding_id, f.repo, f.pr, f.lens, f.file, f.line, f.title,
|
||||
r.model, r.provider, r.run_id, r.raw_severity, r.created_at,
|
||||
COALESCE(ru.duration_secs, 0), ru.input_tokens, ru.output_tokens,
|
||||
lg.is_real, lg.severity, lg.usefulness, lg.notes, lg.grader, lg.created_at
|
||||
FROM reports r
|
||||
JOIN findings f ON f.id = r.finding_id
|
||||
LEFT JOIN runs ru ON ru.run_id = r.run_id
|
||||
LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
|
||||
ORDER BY r.created_at, r.id`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var out []ExportRow
|
||||
for rows.Next() {
|
||||
var e ExportRow
|
||||
var file, rawSev, sev, notes, grader, gradedAt sql.NullString
|
||||
var line sql.NullInt64
|
||||
var isReal sql.NullBool
|
||||
var useful sql.NullInt64
|
||||
if err := rows.Scan(&e.FindingID, &e.Repo, &e.PR, &e.Lens, &file, &line, &e.Title,
|
||||
&e.Model, &e.Provider, &e.RunID, &rawSev, &e.ReportedAt,
|
||||
&e.DurationSecs, &e.InputTokens, &e.OutputTokens,
|
||||
&isReal, &sev, &useful, ¬es, &grader, &gradedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
e.File, e.Line = file.String, int(line.Int64)
|
||||
e.RawSeverity = rawSev.String
|
||||
if isReal.Valid {
|
||||
e.Graded = true
|
||||
v := isReal.Bool
|
||||
e.IsReal = &v
|
||||
e.Severity, e.Notes, e.Grader, e.GradedAt = sev.String, notes.String, grader.String, gradedAt.String
|
||||
if useful.Valid {
|
||||
u := int(useful.Int64)
|
||||
e.Usefulness = &u
|
||||
}
|
||||
}
|
||||
out = append(out, e)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// ModelStat is one model's row on the scoreboard. It is intentionally
// POINTS-FREE: it carries raw minutes and tokens plus a histogram of confirmed
// findings by severity, leaving the client to apply its own weights for points
// and value-per-minute/token.
type ModelStat struct {
	// Who this row describes.
	Model    string `json:"model"`
	Provider string `json:"provider,omitempty"`

	// Effort totals summed over the model's runs.
	Runs         int     `json:"runs"`
	Minutes      float64 `json:"minutes"`
	InputTokens  int64   `json:"input_tokens"`
	OutputTokens int64   `json:"output_tokens"`

	// Distinct findings the model reported, split by latest-grade state.
	Findings      int `json:"findings"`
	Confirmed     int `json:"confirmed"`
	FalsePositive int `json:"false_positive"`
	Ungraded      int `json:"ungraded"`

	// Confirmed findings per severity.
	BySeverity map[string]int `json:"by_severity"`
}
|
||||
|
||||
// Scoreboard rolls runs + reports + latest grades up per model. All counts of
|
||||
// findings are DISTINCT by finding (a model re-reporting across runs counts once).
|
||||
func (s *Store) Scoreboard() ([]ModelStat, error) {
|
||||
stats := map[string]*ModelStat{}
|
||||
get := func(model, provider string) *ModelStat {
|
||||
m, ok := stats[model]
|
||||
if !ok {
|
||||
m = &ModelStat{Model: model, Provider: provider, BySeverity: map[string]int{}}
|
||||
stats[model] = m
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// Runs: minutes + tokens + run counts.
|
||||
rrows, err := s.db.Query(`
|
||||
SELECT model, provider, COUNT(*), COALESCE(SUM(duration_secs),0),
|
||||
COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0)
|
||||
FROM runs GROUP BY model, provider`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for rrows.Next() {
|
||||
var model, provider string
|
||||
var runs int
|
||||
var dur float64
|
||||
var in, out int64
|
||||
if err := rrows.Scan(&model, &provider, &runs, &dur, &in, &out); err != nil {
|
||||
rrows.Close()
|
||||
return nil, err
|
||||
}
|
||||
m := get(model, provider)
|
||||
m.Runs += runs
|
||||
m.Minutes += dur / 60
|
||||
m.InputTokens += in
|
||||
m.OutputTokens += out
|
||||
}
|
||||
rrows.Close()
|
||||
|
||||
// Findings: distinct per model, split by latest-grade state.
|
||||
frows, err := s.db.Query(`
|
||||
SELECT r.model,
|
||||
COUNT(DISTINCT r.finding_id),
|
||||
COUNT(DISTINCT CASE WHEN lg.is_real=1 THEN r.finding_id END),
|
||||
COUNT(DISTINCT CASE WHEN lg.is_real=0 THEN r.finding_id END),
|
||||
COUNT(DISTINCT CASE WHEN lg.is_real IS NULL THEN r.finding_id END)
|
||||
FROM reports r LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
|
||||
GROUP BY r.model`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for frows.Next() {
|
||||
var model string
|
||||
var total, confirmed, fp, ungraded int
|
||||
if err := frows.Scan(&model, &total, &confirmed, &fp, &ungraded); err != nil {
|
||||
frows.Close()
|
||||
return nil, err
|
||||
}
|
||||
m := get(model, "")
|
||||
m.Findings, m.Confirmed, m.FalsePositive, m.Ungraded = total, confirmed, fp, ungraded
|
||||
}
|
||||
frows.Close()
|
||||
|
||||
// Confirmed-by-severity histogram (distinct findings).
|
||||
srows, err := s.db.Query(`
|
||||
SELECT r.model, lg.severity, COUNT(DISTINCT r.finding_id)
|
||||
FROM reports r JOIN latest_grades lg ON lg.finding_id = r.finding_id
|
||||
WHERE lg.is_real=1 AND lg.severity IS NOT NULL
|
||||
GROUP BY r.model, lg.severity`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for srows.Next() {
|
||||
var model, sev string
|
||||
var n int
|
||||
if err := srows.Scan(&model, &sev, &n); err != nil {
|
||||
srows.Close()
|
||||
return nil, err
|
||||
}
|
||||
get(model, "").BySeverity[sev] = n
|
||||
}
|
||||
srows.Close()
|
||||
|
||||
out := make([]ModelStat, 0, len(stats))
|
||||
for _, m := range stats {
|
||||
out = append(out, *m)
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].Model < out[j].Model })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func sortedSeverities() []string {
|
||||
out := make([]string, 0, len(validSeverities))
|
||||
for s := range validSeverities {
|
||||
out = append(out, s)
|
||||
}
|
||||
sort.Strings(out)
|
||||
return out
|
||||
}
|
||||
|
||||
// nullStr turns the empty string into nil and returns any other string
// unchanged. AddGrade passes its result as a query argument so that blank
// optional text is stored as SQL NULL instead of ''.
func nullStr(s string) any {
	if len(s) > 0 {
		return s
	}
	return nil
}
|
||||
Reference in New Issue
Block a user