feat: PR filter — compare models on the same set of PRs

UI: a repo#pr multi-select (each option labeled with how many models ran that PR) scopes the whole table — runs, minutes, findings, points — to the chosen PRs, so a model with 2 runs can be fairly compared against one with 60.

API: GET /scoreboard accepts ?repo= and ?pr= (repeatable or comma-separated list).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 22:55:43 -04:00
parent 2f003dd132
commit 1af115fdf1
6 changed files with 202 additions and 19 deletions
@@ -379,9 +379,42 @@ type ModelStat struct {
 	BySeverity    map[string]int `json:"by_severity"` // confirmed findings per severity
 }

+// ScoreboardFilter restricts the scoreboard to one repo and/or a set of PR
+// numbers, so models with very different run counts can be compared on the same
+// work (e.g. only the PRs every model reviewed). The zero value filters nothing.
+type ScoreboardFilter struct {
+	Repo string // exact repo to match; "" leaves the repo unconstrained
+	PRs  []int  // PR numbers to keep; empty leaves the PR unconstrained
+}
+
+// conds returns the filter as SQL conditions plus their bind args (same order),
+// for the given column names: "repo"/"pr" on runs, "fi.repo"/"fi.pr" via findings. Names are spliced in unescaped.
+func (f ScoreboardFilter) conds(repoCol, prCol string) ([]string, []any) {
+	var conds []string // stays nil when nothing is set, so whereClause emits no WHERE
+	var args []any
+	if f.Repo != "" {
+		conds = append(conds, repoCol+" = ?")
+		args = append(args, f.Repo)
+	}
+	if len(f.PRs) > 0 { // guard also keeps the Repeat count (len-1) non-negative
+		conds = append(conds, prCol+" IN (?"+strings.Repeat(",?", len(f.PRs)-1)+")") // one ? per PR
+		for _, p := range f.PRs {
+			args = append(args, p)
+		}
+	}
+	return conds, args // PR match is independent of Repo: with Repo unset, a PR number matches in any repo
+}
+
+func whereClause(conds []string) string { // renders conds as one SQL WHERE clause, AND-joined
+	if len(conds) == 0 {
+		return "" // nothing to filter on: emit no clause so the query stays unrestricted
+	}
+	return " WHERE " + strings.Join(conds, " AND ") // leading space lets callers append directly after the FROM/JOIN text
+}
+
 // Scoreboard rolls runs + reports + latest grades up per model. All counts of
 // findings are DISTINCT by finding (a model re-reporting across runs counts once).
-func (s *Store) Scoreboard() ([]ModelStat, error) {
+func (s *Store) Scoreboard(f ScoreboardFilter) ([]ModelStat, error) {
 	stats := map[string]*ModelStat{}
 	get := func(model, provider string) *ModelStat {
 		m, ok := stats[model]
@@ -393,10 +426,11 @@ func (s *Store) Scoreboard() ([]ModelStat, error) {
 	}

 	// Runs: minutes + tokens + run counts.
+	runConds, runArgs := f.conds("repo", "pr")
 	rrows, err := s.db.Query(`
 SELECT model, provider, COUNT(*), COALESCE(SUM(duration_secs),0),
       COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0)
-FROM runs GROUP BY model, provider`)
+FROM runs`+whereClause(runConds)+` GROUP BY model, provider`, runArgs...)
 	if err != nil {
 		return nil, err
 	}
@@ -418,14 +452,17 @@ FROM runs GROUP BY model, provider`)
 	rrows.Close()

 	// Findings: distinct per model, split by latest-grade state.
+	findConds, findArgs := f.conds("fi.repo", "fi.pr")
 	frows, err := s.db.Query(`
 SELECT r.model,
       COUNT(DISTINCT r.finding_id),
       COUNT(DISTINCT CASE WHEN lg.is_real=1 THEN r.finding_id END),
       COUNT(DISTINCT CASE WHEN lg.is_real=0 THEN r.finding_id END),
       COUNT(DISTINCT CASE WHEN lg.is_real IS NULL THEN r.finding_id END)
-FROM reports r LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
-GROUP BY r.model`)
+FROM reports r
+JOIN findings fi ON fi.id = r.finding_id
+LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id`+whereClause(findConds)+`
+GROUP BY r.model`, findArgs...)
 	if err != nil {
 		return nil, err
 	}
@@ -442,11 +479,14 @@ GROUP BY r.model`)
 	frows.Close()

 	// Confirmed-by-severity histogram (distinct findings).
+	sevConds, sevArgs := f.conds("fi.repo", "fi.pr")
 	srows, err := s.db.Query(`
 SELECT r.model, lg.severity, COUNT(DISTINCT r.finding_id)
-FROM reports r JOIN latest_grades lg ON lg.finding_id = r.finding_id
-WHERE lg.is_real=1 AND lg.severity IS NOT NULL
-GROUP BY r.model, lg.severity`)
+FROM reports r
+JOIN findings fi ON fi.id = r.finding_id
+JOIN latest_grades lg ON lg.finding_id = r.finding_id`+
+		whereClause(append(sevConds, "lg.is_real=1", "lg.severity IS NOT NULL"))+`
+GROUP BY r.model, lg.severity`, sevArgs...)
 	if err != nil {
 		return nil, err
 	}