ddcf42a3ce
SQLite-backed HTTP store for Gadfly review findings, per-review run timings, and human/Claude grades, with a points-free per-model scoreboard. Pure fact store: it computes no points or rankings (the dashboard maps severity->points client-side and retunes without re-scoring). Findings are content-addressed by location so cross-model reports collapse for consensus; one grade per finding, latest wins. Pure-Go SQLite (CGO-free) + Docker image CI + tests. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
448 lines
15 KiB
Go
448 lines
15 KiB
Go
package main
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"database/sql"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
_ "modernc.org/sqlite"
|
|
)
|
|
|
|
// gadfly-reports stores only RAW review facts: which model reported which finding, how
|
|
// long each model's review took, and a human/Claude grade (is_real + severity +
|
|
// usefulness). It deliberately does NOT compute points or rankings — the
|
|
// dashboard owns the scoring curve (severity -> points, value-per-minute), so it
|
|
// can be retuned without re-scoring or migrating stored data. The severity
|
|
// vocabulary below is the only scoring-related contract.
|
|
|
|
// validSeverities enumerates the only severity labels a grade may attach to a
// finding judged real. The store never turns these into points; that mapping is
// left to the client (e.g. trivial=1 … critical=20).
var validSeverities = map[string]bool{
	"trivial":  true,
	"small":    true,
	"medium":   true,
	"high":     true,
	"critical": true,
}
|
|
|
|
// schema is the idempotent DDL applied on every Open. It declares four fact
// tables (runs, findings, reports, grades), their lookup indexes, and the
// latest_grades view that read paths join against so a re-grade supersedes
// earlier grades without deleting them.
const schema = `
CREATE TABLE IF NOT EXISTS runs (
	run_id        TEXT PRIMARY KEY,
	repo          TEXT NOT NULL,
	pr            INTEGER NOT NULL,
	model         TEXT NOT NULL,
	provider      TEXT NOT NULL,
	lenses        INTEGER NOT NULL DEFAULT 0,
	duration_secs REAL NOT NULL DEFAULT 0,
	input_tokens  INTEGER,
	output_tokens INTEGER,
	cost_usd      REAL,
	created_at    TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS findings (
	id         TEXT PRIMARY KEY,
	repo       TEXT NOT NULL,
	pr         INTEGER NOT NULL,
	lens       TEXT NOT NULL,
	file       TEXT,
	line       INTEGER,
	title      TEXT NOT NULL,
	first_seen TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS reports (
	id           INTEGER PRIMARY KEY AUTOINCREMENT,
	finding_id   TEXT NOT NULL,
	run_id       TEXT NOT NULL,
	model        TEXT NOT NULL,
	provider     TEXT NOT NULL,
	raw_severity TEXT,
	detail       TEXT,
	created_at   TEXT NOT NULL,
	UNIQUE(finding_id, run_id)
);
CREATE INDEX IF NOT EXISTS idx_reports_finding ON reports(finding_id);
CREATE INDEX IF NOT EXISTS idx_reports_model ON reports(model);

CREATE TABLE IF NOT EXISTS grades (
	id         INTEGER PRIMARY KEY AUTOINCREMENT,
	finding_id TEXT NOT NULL,
	is_real    INTEGER NOT NULL,
	severity   TEXT,
	usefulness INTEGER,
	notes      TEXT,
	grader     TEXT,
	created_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_grades_finding ON grades(finding_id);

-- latest_grades: the most recent grade per finding (grade history is kept; the
-- latest wins). Used by every read path so a re-grade supersedes the old one.
CREATE VIEW IF NOT EXISTS latest_grades AS
SELECT g.* FROM grades g
JOIN (SELECT finding_id, MAX(id) AS max_id FROM grades GROUP BY finding_id) m
	ON g.id = m.max_id;
`
|
|
|
|
// Store wraps the SQLite database handle behind which all review facts live.
type Store struct {
	db *sql.DB
}
|
|
|
|
// Open opens (creating if needed) the SQLite database at path and applies the
|
|
// schema. WAL + a busy timeout keep the single-writer daemon honest under the
|
|
// occasional concurrent reader.
|
|
func Open(path string) (*Store, error) {
|
|
db, err := sql.Open("sqlite", "file:"+path+"?_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(on)")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open %s: %w", path, err)
|
|
}
|
|
// modernc's pure-Go driver is happiest with a single writer connection.
|
|
db.SetMaxOpenConns(1)
|
|
if _, err := db.Exec(schema); err != nil {
|
|
db.Close()
|
|
return nil, fmt.Errorf("migrate: %w", err)
|
|
}
|
|
return &Store{db: db}, nil
|
|
}
|
|
|
|
// Close closes the underlying database handle and returns whatever error that
// produces.
func (s *Store) Close() error {
	return s.db.Close()
}
|
|
|
|
// now returns the current instant in UTC formatted as an RFC 3339 string; it is
// the timestamp format used for every *_at / first_seen column.
func now() string {
	utc := time.Now().UTC()
	return utc.Format(time.RFC3339)
}
|
|
|
|
// findingID derives a finding's id from WHERE it is (repo, PR, lens, file,
// line) rather than from how it is worded. Two models flagging the same spot —
// or one model flagging it again on a re-review — therefore land on the same
// finding row, which is what makes consensus and per-model precision
// measurable. The title deliberately plays no part.
//
// repo, lens and file are whitespace-trimmed and lens is lower-cased before
// hashing. The result is the first 16 hex characters of the SHA-256 of the
// "repo|pr|lens|file|line" key.
func findingID(repo string, pr int, lens, file string, line int) string {
	repo = strings.TrimSpace(repo)
	lens = strings.ToLower(strings.TrimSpace(lens))
	file = strings.TrimSpace(file)

	digest := sha256.Sum256([]byte(fmt.Sprintf("%s|%d|%s|%s|%d", repo, pr, lens, file, line)))
	// Eight digest bytes encode to exactly the 16 hex characters of the id.
	return hex.EncodeToString(digest[:8])
}
|
|
|
|
// Run describes a single model's review of a single PR — the unit that run.sh
// times. It is the JSON payload accepted by AddRun and maps one-to-one onto
// the runs table.
type Run struct {
	// Identity of the run and the PR it reviewed. RunID is the upsert key.
	RunID string `json:"run_id"`
	Repo  string `json:"repo"`
	PR    int    `json:"pr"`

	// Who produced the review.
	Model    string `json:"model"`
	Provider string `json:"provider"`

	// Review shape and wall-clock duration in seconds.
	Lenses       int     `json:"lenses"`
	DurationSecs float64 `json:"duration_secs"`

	// Optional usage figures; a nil pointer is omitted from JSON.
	InputTokens  *int64   `json:"input_tokens,omitempty"`
	OutputTokens *int64   `json:"output_tokens,omitempty"`
	CostUSD      *float64 `json:"cost_usd,omitempty"`
}
|
|
|
|
// AddRun upserts a run by run_id (a re-posted run overwrites timing/tokens).
|
|
func (s *Store) AddRun(r Run) error {
|
|
if strings.TrimSpace(r.RunID) == "" || strings.TrimSpace(r.Model) == "" {
|
|
return fmt.Errorf("run_id and model are required")
|
|
}
|
|
_, err := s.db.Exec(`
|
|
INSERT INTO runs (run_id, repo, pr, model, provider, lenses, duration_secs, input_tokens, output_tokens, cost_usd, created_at)
|
|
VALUES (?,?,?,?,?,?,?,?,?,?,?)
|
|
ON CONFLICT(run_id) DO UPDATE SET
|
|
repo=excluded.repo, pr=excluded.pr, model=excluded.model, provider=excluded.provider,
|
|
lenses=excluded.lenses, duration_secs=excluded.duration_secs,
|
|
input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens, cost_usd=excluded.cost_usd`,
|
|
r.RunID, r.Repo, r.PR, r.Model, r.Provider, r.Lenses, r.DurationSecs,
|
|
r.InputTokens, r.OutputTokens, r.CostUSD, now())
|
|
return err
|
|
}
|
|
|
|
// ReportIn is a single finding exactly as one model reported it: the input
// element of AddReports.
type ReportIn struct {
	// Location of the finding; these five fields determine its finding id.
	Repo string `json:"repo"`
	PR   int    `json:"pr"`
	Lens string `json:"lens"`
	File string `json:"file"`
	Line int    `json:"line"`

	// Title is required but does not participate in the finding id.
	Title string `json:"title"`

	// Who reported it, and in which run.
	Model    string `json:"model"`
	Provider string `json:"provider"`
	RunID    string `json:"run_id"`

	// The model's own severity label and free-form explanation, stored verbatim.
	RawSeverity string `json:"raw_severity"`
	Detail      string `json:"detail"`
}
|
|
|
|
// AddReports records a batch of findings: each upserts its (content-addressed)
|
|
// finding row and adds this model's report of it. Returns the finding id per
|
|
// input (same order). A model re-reporting the same finding in the same run is a
|
|
// no-op (UNIQUE finding_id,run_id).
|
|
func (s *Store) AddReports(in []ReportIn) ([]string, error) {
|
|
tx, err := s.db.Begin()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer tx.Rollback()
|
|
|
|
ts := now()
|
|
ids := make([]string, len(in))
|
|
for i, r := range in {
|
|
if strings.TrimSpace(r.Title) == "" || strings.TrimSpace(r.Lens) == "" {
|
|
return nil, fmt.Errorf("report %d: lens and title are required", i)
|
|
}
|
|
id := findingID(r.Repo, r.PR, r.Lens, r.File, r.Line)
|
|
ids[i] = id
|
|
if _, err := tx.Exec(`
|
|
INSERT INTO findings (id, repo, pr, lens, file, line, title, first_seen)
|
|
VALUES (?,?,?,?,?,?,?,?) ON CONFLICT(id) DO NOTHING`,
|
|
id, r.Repo, r.PR, strings.ToLower(strings.TrimSpace(r.Lens)), r.File, r.Line, r.Title, ts); err != nil {
|
|
return nil, err
|
|
}
|
|
if _, err := tx.Exec(`
|
|
INSERT INTO reports (finding_id, run_id, model, provider, raw_severity, detail, created_at)
|
|
VALUES (?,?,?,?,?,?,?) ON CONFLICT(finding_id, run_id) DO NOTHING`,
|
|
id, r.RunID, r.Model, r.Provider, r.RawSeverity, r.Detail, ts); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return ids, tx.Commit()
|
|
}
|
|
|
|
// Grade is one triage verdict on a finding, as accepted by AddGrade. A real
// finding must carry a severity from validSeverities; for a false positive the
// severity is discarded. The store attaches no points to a grade — the client
// maps severity to points.
type Grade struct {
	// FindingID names the finding being graded.
	FindingID string `json:"finding_id"`

	// The verdict itself.
	IsReal   bool   `json:"is_real"`
	Severity string `json:"severity,omitempty"`

	// Optional extras: AddGrade accepts a usefulness of 1..5 when present.
	Usefulness *int   `json:"usefulness,omitempty"`
	Notes      string `json:"notes,omitempty"`
	Grader     string `json:"grader,omitempty"`
}
|
|
|
|
// AddGrade appends a grade (history is kept; latest wins).
|
|
func (s *Store) AddGrade(g Grade) error {
|
|
if strings.TrimSpace(g.FindingID) == "" {
|
|
return fmt.Errorf("finding_id is required")
|
|
}
|
|
var exists bool
|
|
if err := s.db.QueryRow(`SELECT EXISTS(SELECT 1 FROM findings WHERE id=?)`, g.FindingID).Scan(&exists); err != nil {
|
|
return err
|
|
}
|
|
if !exists {
|
|
return fmt.Errorf("unknown finding_id %q", g.FindingID)
|
|
}
|
|
sev := strings.ToLower(strings.TrimSpace(g.Severity))
|
|
if g.IsReal {
|
|
if !validSeverities[sev] {
|
|
return fmt.Errorf("severity %q invalid for a real finding (want one of: %s)", g.Severity, strings.Join(sortedSeverities(), ", "))
|
|
}
|
|
} else {
|
|
sev = "" // a false positive carries no severity
|
|
}
|
|
if g.Usefulness != nil && (*g.Usefulness < 1 || *g.Usefulness > 5) {
|
|
return fmt.Errorf("usefulness must be 1..5, got %d", *g.Usefulness)
|
|
}
|
|
_, err := s.db.Exec(`
|
|
INSERT INTO grades (finding_id, is_real, severity, usefulness, notes, grader, created_at)
|
|
VALUES (?,?,?,?,?,?,?)`,
|
|
g.FindingID, g.IsReal, nullStr(sev), g.Usefulness, nullStr(g.Notes), nullStr(g.Grader), now())
|
|
return err
|
|
}
|
|
|
|
// ExportRow is the flat, dashboard-facing shape produced by Export: one report
// joined with its finding, its run's timing, and the finding's latest grade.
// Until a finding has been graded, Graded is false and the grade fields are
// nil/empty.
type ExportRow struct {
	// The finding.
	FindingID string `json:"finding_id"`
	Repo      string `json:"repo"`
	PR        int    `json:"pr"`
	Lens      string `json:"lens"`
	File      string `json:"file,omitempty"`
	Line      int    `json:"line,omitempty"`
	Title     string `json:"title"`

	// The report of that finding.
	Model       string `json:"model"`
	Provider    string `json:"provider,omitempty"`
	RunID       string `json:"run_id"`
	RawSeverity string `json:"raw_severity,omitempty"`
	ReportedAt  string `json:"reported_at"`

	// Timing and token usage of the run that produced the report.
	DurationSecs float64 `json:"duration_secs"`
	InputTokens  *int64  `json:"input_tokens,omitempty"`
	OutputTokens *int64  `json:"output_tokens,omitempty"`

	// The latest grade, if any.
	Graded     bool   `json:"graded"`
	IsReal     *bool  `json:"is_real,omitempty"`
	Severity   string `json:"severity,omitempty"`
	Usefulness *int   `json:"usefulness,omitempty"`
	Notes      string `json:"notes,omitempty"`
	Grader     string `json:"grader,omitempty"`
	GradedAt   string `json:"graded_at,omitempty"`
}
|
|
|
|
// Export returns every report joined with finding, run timing, and latest grade,
|
|
// oldest first. The dashboard does all weighting from these raw rows.
|
|
func (s *Store) Export() ([]ExportRow, error) {
|
|
rows, err := s.db.Query(`
|
|
SELECT r.finding_id, f.repo, f.pr, f.lens, f.file, f.line, f.title,
|
|
r.model, r.provider, r.run_id, r.raw_severity, r.created_at,
|
|
COALESCE(ru.duration_secs, 0), ru.input_tokens, ru.output_tokens,
|
|
lg.is_real, lg.severity, lg.usefulness, lg.notes, lg.grader, lg.created_at
|
|
FROM reports r
|
|
JOIN findings f ON f.id = r.finding_id
|
|
LEFT JOIN runs ru ON ru.run_id = r.run_id
|
|
LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
|
|
ORDER BY r.created_at, r.id`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
var out []ExportRow
|
|
for rows.Next() {
|
|
var e ExportRow
|
|
var file, rawSev, sev, notes, grader, gradedAt sql.NullString
|
|
var line sql.NullInt64
|
|
var isReal sql.NullBool
|
|
var useful sql.NullInt64
|
|
if err := rows.Scan(&e.FindingID, &e.Repo, &e.PR, &e.Lens, &file, &line, &e.Title,
|
|
&e.Model, &e.Provider, &e.RunID, &rawSev, &e.ReportedAt,
|
|
&e.DurationSecs, &e.InputTokens, &e.OutputTokens,
|
|
&isReal, &sev, &useful, ¬es, &grader, &gradedAt); err != nil {
|
|
return nil, err
|
|
}
|
|
e.File, e.Line = file.String, int(line.Int64)
|
|
e.RawSeverity = rawSev.String
|
|
if isReal.Valid {
|
|
e.Graded = true
|
|
v := isReal.Bool
|
|
e.IsReal = &v
|
|
e.Severity, e.Notes, e.Grader, e.GradedAt = sev.String, notes.String, grader.String, gradedAt.String
|
|
if useful.Valid {
|
|
u := int(useful.Int64)
|
|
e.Usefulness = &u
|
|
}
|
|
}
|
|
out = append(out, e)
|
|
}
|
|
return out, rows.Err()
|
|
}
|
|
|
|
// ModelStat is one model's row on the scoreboard. It deliberately carries no
// points: only raw minutes, token totals, finding counts and a histogram of
// confirmed findings by severity, leaving weights and value-per-minute/token
// to the client.
type ModelStat struct {
	Model    string `json:"model"`
	Provider string `json:"provider,omitempty"`

	// Totals over the model's runs.
	Runs         int     `json:"runs"`
	Minutes      float64 `json:"minutes"`
	InputTokens  int64   `json:"input_tokens"`
	OutputTokens int64   `json:"output_tokens"`

	// Distinct findings the model reported, split by latest-grade state.
	Findings      int `json:"findings"`
	Confirmed     int `json:"confirmed"`
	FalsePositive int `json:"false_positive"`
	Ungraded      int `json:"ungraded"`

	// BySeverity counts confirmed findings per severity.
	BySeverity map[string]int `json:"by_severity"`
}
|
|
|
|
// Scoreboard rolls runs + reports + latest grades up per model. All counts of
|
|
// findings are DISTINCT by finding (a model re-reporting across runs counts once).
|
|
func (s *Store) Scoreboard() ([]ModelStat, error) {
|
|
stats := map[string]*ModelStat{}
|
|
get := func(model, provider string) *ModelStat {
|
|
m, ok := stats[model]
|
|
if !ok {
|
|
m = &ModelStat{Model: model, Provider: provider, BySeverity: map[string]int{}}
|
|
stats[model] = m
|
|
}
|
|
return m
|
|
}
|
|
|
|
// Runs: minutes + tokens + run counts.
|
|
rrows, err := s.db.Query(`
|
|
SELECT model, provider, COUNT(*), COALESCE(SUM(duration_secs),0),
|
|
COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0)
|
|
FROM runs GROUP BY model, provider`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for rrows.Next() {
|
|
var model, provider string
|
|
var runs int
|
|
var dur float64
|
|
var in, out int64
|
|
if err := rrows.Scan(&model, &provider, &runs, &dur, &in, &out); err != nil {
|
|
rrows.Close()
|
|
return nil, err
|
|
}
|
|
m := get(model, provider)
|
|
m.Runs += runs
|
|
m.Minutes += dur / 60
|
|
m.InputTokens += in
|
|
m.OutputTokens += out
|
|
}
|
|
rrows.Close()
|
|
|
|
// Findings: distinct per model, split by latest-grade state.
|
|
frows, err := s.db.Query(`
|
|
SELECT r.model,
|
|
COUNT(DISTINCT r.finding_id),
|
|
COUNT(DISTINCT CASE WHEN lg.is_real=1 THEN r.finding_id END),
|
|
COUNT(DISTINCT CASE WHEN lg.is_real=0 THEN r.finding_id END),
|
|
COUNT(DISTINCT CASE WHEN lg.is_real IS NULL THEN r.finding_id END)
|
|
FROM reports r LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
|
|
GROUP BY r.model`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for frows.Next() {
|
|
var model string
|
|
var total, confirmed, fp, ungraded int
|
|
if err := frows.Scan(&model, &total, &confirmed, &fp, &ungraded); err != nil {
|
|
frows.Close()
|
|
return nil, err
|
|
}
|
|
m := get(model, "")
|
|
m.Findings, m.Confirmed, m.FalsePositive, m.Ungraded = total, confirmed, fp, ungraded
|
|
}
|
|
frows.Close()
|
|
|
|
// Confirmed-by-severity histogram (distinct findings).
|
|
srows, err := s.db.Query(`
|
|
SELECT r.model, lg.severity, COUNT(DISTINCT r.finding_id)
|
|
FROM reports r JOIN latest_grades lg ON lg.finding_id = r.finding_id
|
|
WHERE lg.is_real=1 AND lg.severity IS NOT NULL
|
|
GROUP BY r.model, lg.severity`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for srows.Next() {
|
|
var model, sev string
|
|
var n int
|
|
if err := srows.Scan(&model, &sev, &n); err != nil {
|
|
srows.Close()
|
|
return nil, err
|
|
}
|
|
get(model, "").BySeverity[sev] = n
|
|
}
|
|
srows.Close()
|
|
|
|
out := make([]ModelStat, 0, len(stats))
|
|
for _, m := range stats {
|
|
out = append(out, *m)
|
|
}
|
|
sort.Slice(out, func(i, j int) bool { return out[i].Model < out[j].Model })
|
|
return out, nil
|
|
}
|
|
|
|
func sortedSeverities() []string {
|
|
out := make([]string, 0, len(validSeverities))
|
|
for s := range validSeverities {
|
|
out = append(out, s)
|
|
}
|
|
sort.Strings(out)
|
|
return out
|
|
}
|
|
|
|
// nullStr converts an empty string into nil so that it is bound as SQL NULL;
// any non-empty string is passed through unchanged.
func nullStr(s string) any {
	if len(s) > 0 {
		return s
	}
	return nil
}
|