feat: PR filter — compare models on the same set of PRs
Build & push image / build-and-push (push) Successful in 13s
CI / test (push) Successful in 9m51s

UI: a repo#pr multi-select (labeled with how many models ran each PR)
scopes the whole table — runs, minutes, findings, points — to the chosen
PRs, so a model with 2 runs can be fairly compared against one with 60.
API: GET /scoreboard accepts ?repo= and ?pr= (repeatable or comma-list).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-02 22:55:43 -04:00
parent 2f003dd132
commit 1af115fdf1
6 changed files with 202 additions and 19 deletions
+47 -7
View File
@@ -379,9 +379,42 @@ type ModelStat struct {
BySeverity map[string]int `json:"by_severity"` // confirmed findings per severity
}
// ScoreboardFilter restricts the scoreboard to one repo and/or a chosen set of
// PRs, which lets models with very different run counts be compared on exactly
// the same work (for example, only the PRs that every model reviewed). The
// zero value applies no filtering at all.
type ScoreboardFilter struct {
	Repo string // when non-empty, the repo column must equal this value
	PRs  []int  // when non-empty, the pr column must be one of these numbers
}

// conds builds the SQL predicates, and the bind arguments that go with them,
// for this filter. repoCol and prCol name the columns to test, so the same
// filter can be applied to tables or joins that expose repo/pr under different
// qualified names. Values are always passed as "?" placeholders, never spliced
// into the SQL text. Both results are empty when the filter is the zero value.
func (f ScoreboardFilter) conds(repoCol, prCol string) ([]string, []any) {
	var (
		clauses []string
		params  []any
	)

	if f.Repo != "" {
		clauses = append(clauses, repoCol+" = ?")
		params = append(params, f.Repo)
	}

	if n := len(f.PRs); n > 0 {
		// One "?" per PR number, with the arguments appended in the same order.
		marks := make([]string, n)
		for i, pr := range f.PRs {
			marks[i] = "?"
			params = append(params, pr)
		}
		clauses = append(clauses, prCol+" IN ("+strings.Join(marks, ",")+")")
	}

	return clauses, params
}
// whereClause joins conds with " AND " behind a leading " WHERE " so the result
// can be concatenated straight onto a query. With no conditions it returns the
// empty string, leaving the query unfiltered.
func whereClause(conds []string) string {
	var clause string
	if len(conds) > 0 {
		clause = " WHERE " + strings.Join(conds, " AND ")
	}
	return clause
}
// Scoreboard rolls runs + reports + latest grades up per model. All counts of
// findings are DISTINCT by finding (a model re-reporting across runs counts once).
func (s *Store) Scoreboard() ([]ModelStat, error) {
func (s *Store) Scoreboard(f ScoreboardFilter) ([]ModelStat, error) {
stats := map[string]*ModelStat{}
get := func(model, provider string) *ModelStat {
m, ok := stats[model]
@@ -393,10 +426,11 @@ func (s *Store) Scoreboard() ([]ModelStat, error) {
}
// Runs: minutes + tokens + run counts.
runConds, runArgs := f.conds("repo", "pr")
rrows, err := s.db.Query(`
SELECT model, provider, COUNT(*), COALESCE(SUM(duration_secs),0),
COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0)
FROM runs GROUP BY model, provider`)
FROM runs`+whereClause(runConds)+` GROUP BY model, provider`, runArgs...)
if err != nil {
return nil, err
}
@@ -418,14 +452,17 @@ FROM runs GROUP BY model, provider`)
rrows.Close()
// Findings: distinct per model, split by latest-grade state.
findConds, findArgs := f.conds("fi.repo", "fi.pr")
frows, err := s.db.Query(`
SELECT r.model,
COUNT(DISTINCT r.finding_id),
COUNT(DISTINCT CASE WHEN lg.is_real=1 THEN r.finding_id END),
COUNT(DISTINCT CASE WHEN lg.is_real=0 THEN r.finding_id END),
COUNT(DISTINCT CASE WHEN lg.is_real IS NULL THEN r.finding_id END)
FROM reports r LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
GROUP BY r.model`)
FROM reports r
JOIN findings fi ON fi.id = r.finding_id
LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id`+whereClause(findConds)+`
GROUP BY r.model`, findArgs...)
if err != nil {
return nil, err
}
@@ -442,11 +479,14 @@ GROUP BY r.model`)
frows.Close()
// Confirmed-by-severity histogram (distinct findings).
sevConds, sevArgs := f.conds("fi.repo", "fi.pr")
srows, err := s.db.Query(`
SELECT r.model, lg.severity, COUNT(DISTINCT r.finding_id)
FROM reports r JOIN latest_grades lg ON lg.finding_id = r.finding_id
WHERE lg.is_real=1 AND lg.severity IS NOT NULL
GROUP BY r.model, lg.severity`)
FROM reports r
JOIN findings fi ON fi.id = r.finding_id
JOIN latest_grades lg ON lg.finding_id = r.finding_id`+
whereClause(append(sevConds, "lg.is_real=1", "lg.severity IS NOT NULL"))+`
GROUP BY r.model, lg.severity`, sevArgs...)
if err != nil {
return nil, err
}