feat: PR filter — compare models on the same set of PRs

UI: a repo#pr multi-select (labeled with how many models ran each PR) scopes the whole table — runs, minutes, findings, points — to the chosen PRs, so a model with 2 runs can be fairly compared against one with 60.

API: GET /scoreboard accepts ?repo= and ?pr= (pr is repeatable or a comma-list).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 22:55:43 -04:00
parent 2f003dd132
commit 1af115fdf1
6 changed files with 202 additions and 19 deletions
@@ -106,7 +106,7 @@ against reviews that take minutes.
 | `POST /findings/{id}/grade` | `{is_real, severity?, usefulness?, notes?, grader?}` | record a triage grade |
 | `GET /export` | — | flat report×finding×run×latest-grade rows — the dashboard feed |
 | `GET /runs` | — | list all runs (timing/tokens), oldest first |
-| `GET /scoreboard` | — | points-free per-model rollup |
+| `GET /scoreboard` | `?repo=<repo>`, `?pr=<n>` (`pr` is repeatable or a comma-list, e.g. `?pr=10,11`; without `repo` it matches that PR number in every repo) | points-free per-model rollup, optionally narrowed to specific PRs so models are compared on the same work |

 `POST /runs` body: `{run_id, repo, pr, model, provider, lenses, duration_secs, input_tokens?, output_tokens?, cost_usd?}`
 (re-posting the same `run_id` updates it).
@@ -138,6 +138,11 @@ ungraded, points, **points-per-minute**, points-per-run, by-severity — with **
 (date range, repo, provider, model, lens, grade/severity), free-text search, and a click-to-scope
 findings detail table.

+Comparisons can be scoped to **specific PRs**: a multi-select lists every `repo#pr` with how many
+models ran it (`steve/x#12 · 3/5 models`) — pick the PRs you want and the entire table (runs,
+minutes, findings, points) counts only those, so a model with 2 runs can be compared against one
+with 60 on exactly the work you choose.
+
 True to the store's "no points" rule, **scoring lives in the browser**: the page has an editable
 points curve (default `trivial=1, small=3, medium=5, high=8, critical=20`) and computes
 `points = Σ weight[severity]·count` and `value/min = points / minutes` on the fly — retune it without
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"log"
 	"net/http"
+	"strconv"
 	"strings"
 )

@@ -21,7 +22,9 @@ import (
 //	GET  /healthz     liveness                                         (public)
 //	GET  /runs        list all runs (timing/tokens), oldest first
 //	GET  /export      flat report×finding×grade rows (the dashboard feed)
-//	GET  /scoreboard  points-free per-model rollup
+//	GET  /scoreboard  points-free per-model rollup; ?repo= and ?pr= (pr is
+//	                  repeatable or a comma-list) narrow it to specific PRs so
+//	                  models are compared on the same work
 //	POST /runs                     upsert one run (model review of a PR; timing/tokens)
 //	POST /reports                  record a batch of findings + this model's reports
 //	POST /findings/{id}/grade      record a triage grade (is_real, severity, …)
@@ -79,8 +82,24 @@ func newServer(store *Store, token string) http.Handler {
 		writeJSON(w, http.StatusOK, rows)
 	})

-	mux.HandleFunc("GET /scoreboard", func(w http.ResponseWriter, _ *http.Request) {
-		stats, err := store.Scoreboard()
+	mux.HandleFunc("GET /scoreboard", func(w http.ResponseWriter, r *http.Request) {
+		f := ScoreboardFilter{Repo: r.URL.Query().Get("repo")}
+		// pr is repeatable and accepts comma lists: ?pr=1&pr=2 or ?pr=1,2
+		for _, v := range r.URL.Query()["pr"] {
+			for part := range strings.SplitSeq(v, ",") {
+				part = strings.TrimSpace(part)
+				if part == "" {
+					continue
+				}
+				n, err := strconv.Atoi(part)
+				if err != nil {
+					writeErr(w, http.StatusBadRequest, errors.New("invalid pr number: "+part))
+					return
+				}
+				f.PRs = append(f.PRs, n)
+			}
+		}
+		stats, err := store.Scoreboard(f)
 		if err != nil {
 			writeErr(w, http.StatusInternalServerError, err)
 			return
@@ -72,6 +72,33 @@ func TestServerEndToEnd(t *testing.T) {
 	}
 }

+// TestScoreboardQueryFilter: ?repo= and ?pr= narrow the scoreboard; a bad pr is a 400.
+func TestScoreboardQueryFilter(t *testing.T) {
+	srv := testServer(t, "")
+
+	// Three runs by one model: PRs 1 and 2 in repo "r", plus PR 1 in repo "other".
+	post(t, srv, "", "/runs", Run{RunID: "r1", Repo: "r", PR: 1, Model: "m", Provider: "p", DurationSecs: 60})
+	post(t, srv, "", "/runs", Run{RunID: "r2", Repo: "r", PR: 2, Model: "m", Provider: "p", DurationSecs: 60})
+	post(t, srv, "", "/runs", Run{RunID: "r3", Repo: "other", PR: 1, Model: "m", Provider: "p", DurationSecs: 60})
+
+	// fetchBoard GETs path and decodes the scoreboard. A body that does not
+	// decode fails the test here, instead of leaving the slice empty and
+	// tripping the length checks below with a misleading message.
+	fetchBoard := func(path string) []ModelStat {
+		t.Helper()
+		resp := mustGet(t, srv, "", path)
+		defer resp.Body.Close()
+		var board []ModelStat
+		if err := json.NewDecoder(resp.Body).Decode(&board); err != nil {
+			t.Fatalf("GET %s: decoding scoreboard: %v", path, err)
+		}
+		return board
+	}
+
+	board := fetchBoard("/scoreboard?repo=r&pr=1,2")
+	if len(board) != 1 || board[0].Runs != 2 {
+		t.Fatalf("filtered scoreboard: %+v, want 2 runs (repo 'other' excluded)", board)
+	}
+
+	// pr without repo is not repo-scoped: PR 1 matches in both repos.
+	board = fetchBoard("/scoreboard?pr=1")
+	if len(board) != 1 || board[0].Runs != 2 {
+		t.Fatalf("pr-only filter: %+v, want 2 runs (both repos' PR 1)", board)
+	}
+
+	resp := mustGet(t, srv, "", "/scoreboard?pr=abc")
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Errorf("GET /scoreboard?pr=abc = %d, want 400", resp.StatusCode)
+	}
+}
+
 // TestServerAuth: a set token gates writes but leaves /healthz open.
 func TestServerAuth(t *testing.T) {
 	srv := testServer(t, "secret")
@@ -379,9 +379,42 @@ type ModelStat struct {
 	BySeverity    map[string]int `json:"by_severity"` // confirmed findings per severity
 }

+// ScoreboardFilter narrows the scoreboard to a repo and/or a set of PRs, so
+// models with very different run counts can be compared on the exact same work
+// (e.g. only the PRs every model reviewed). Zero value = no filtering.
+//
+// Repo and PRs are independent conditions that are ANDed together: PR numbers
+// are not paired with a repo, so with Repo empty a PR number matches in every
+// repo that has it.
+type ScoreboardFilter struct {
+	// Repo is matched exactly; "" keeps every repo.
+	Repo string
+	// PRs lists the PR numbers to keep; empty keeps every PR.
+	PRs  []int
+}
+
+// conds turns the filter into SQL predicates plus their bind args, in the same
+// order as the placeholders. repoCol/prCol name the columns to test, because
+// runs carry repo/pr directly while reports reach them through findings.
+// An empty filter yields no predicates and no args.
+func (f ScoreboardFilter) conds(repoCol, prCol string) ([]string, []any) {
+	var clauses []string
+	var params []any
+	if f.Repo != "" {
+		clauses = append(clauses, repoCol+" = ?")
+		params = append(params, f.Repo)
+	}
+	if n := len(f.PRs); n > 0 {
+		// One placeholder per PR: "?" followed by n-1 copies of ",?".
+		clauses = append(clauses, prCol+" IN (?"+strings.Repeat(",?", n-1)+")")
+		for i := range f.PRs {
+			params = append(params, f.PRs[i])
+		}
+	}
+	return clauses, params
+}
+
+// whereClause renders conds as " WHERE a AND b" (leading space included), or
+// "" when there are no conditions, so it can be spliced straight after FROM/JOIN.
+func whereClause(conds []string) string {
+	if len(conds) > 0 {
+		return " WHERE " + strings.Join(conds, " AND ")
+	}
+	return ""
+}
+
 // Scoreboard rolls runs + reports + latest grades up per model. All counts of
 // findings are DISTINCT by finding (a model re-reporting across runs counts once).
-func (s *Store) Scoreboard() ([]ModelStat, error) {
+func (s *Store) Scoreboard(f ScoreboardFilter) ([]ModelStat, error) {
 	stats := map[string]*ModelStat{}
 	get := func(model, provider string) *ModelStat {
 		m, ok := stats[model]
@@ -393,10 +426,11 @@ func (s *Store) Scoreboard() ([]ModelStat, error) {
 	}

 	// Runs: minutes + tokens + run counts.
+	runConds, runArgs := f.conds("repo", "pr")
 	rrows, err := s.db.Query(`
 SELECT model, provider, COUNT(*), COALESCE(SUM(duration_secs),0),
       COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0)
-FROM runs GROUP BY model, provider`)
+FROM runs`+whereClause(runConds)+` GROUP BY model, provider`, runArgs...)
 	if err != nil {
 		return nil, err
 	}
@@ -418,14 +452,17 @@ FROM runs GROUP BY model, provider`)
 	rrows.Close()

 	// Findings: distinct per model, split by latest-grade state.
+	findConds, findArgs := f.conds("fi.repo", "fi.pr")
 	frows, err := s.db.Query(`
 SELECT r.model,
       COUNT(DISTINCT r.finding_id),
       COUNT(DISTINCT CASE WHEN lg.is_real=1 THEN r.finding_id END),
       COUNT(DISTINCT CASE WHEN lg.is_real=0 THEN r.finding_id END),
       COUNT(DISTINCT CASE WHEN lg.is_real IS NULL THEN r.finding_id END)
-FROM reports r LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id
-GROUP BY r.model`)
+FROM reports r
+JOIN findings fi ON fi.id = r.finding_id
+LEFT JOIN latest_grades lg ON lg.finding_id = r.finding_id`+whereClause(findConds)+`
+GROUP BY r.model`, findArgs...)
 	if err != nil {
 		return nil, err
 	}
@@ -442,11 +479,14 @@ GROUP BY r.model`)
 	frows.Close()

 	// Confirmed-by-severity histogram (distinct findings).
+	sevConds, sevArgs := f.conds("fi.repo", "fi.pr")
 	srows, err := s.db.Query(`
 SELECT r.model, lg.severity, COUNT(DISTINCT r.finding_id)
-FROM reports r JOIN latest_grades lg ON lg.finding_id = r.finding_id
-WHERE lg.is_real=1 AND lg.severity IS NOT NULL
-GROUP BY r.model, lg.severity`)
+FROM reports r
+JOIN findings fi ON fi.id = r.finding_id
+JOIN latest_grades lg ON lg.finding_id = r.finding_id`+
+		whereClause(append(sevConds, "lg.is_real=1", "lg.severity IS NOT NULL"))+`
+GROUP BY r.model, lg.severity`, sevArgs...)
 	if err != nil {
 		return nil, err
 	}
@@ -46,7 +46,7 @@ func TestConsensusAndGrade(t *testing.T) {
 		t.Fatal(err)
 	}

-	board, err := s.Scoreboard()
+	board, err := s.Scoreboard(ScoreboardFilter{})
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -88,7 +88,7 @@ func TestLatestGradeWins(t *testing.T) {
 	if err := s.AddGrade(Grade{FindingID: id, IsReal: false}); err != nil { // re-graded as a false positive
 		t.Fatal(err)
 	}
-	board, _ := s.Scoreboard()
+	board, _ := s.Scoreboard(ScoreboardFilter{})
 	m := board[0]
 	if m.Confirmed != 0 || m.FalsePositive != 1 || m.BySeverity["critical"] != 0 {
 		t.Errorf("after re-grade: confirmed=%d fp=%d critical=%d, want 0/1/0", m.Confirmed, m.FalsePositive, m.BySeverity["critical"])
@@ -116,6 +116,58 @@ func TestGradeValidation(t *testing.T) {
 	}
 }

+// TestScoreboardFilter: narrowing to repo/PRs drops runs and findings outside
+// the selection, so a model with many extra runs is compared on the same work.
+func TestScoreboardFilter(t *testing.T) {
+	s := testStore(t)
+
+	// fable reviewed only PR 1; veteran reviewed PRs 1 and 2 (and another repo).
+	for _, r := range []Run{
+		{RunID: "f-1", Repo: "steve/x", PR: 1, Model: "fable", Provider: "p", DurationSecs: 60},
+		{RunID: "v-1", Repo: "steve/x", PR: 1, Model: "veteran", Provider: "p", DurationSecs: 120},
+		{RunID: "v-2", Repo: "steve/x", PR: 2, Model: "veteran", Provider: "p", DurationSecs: 120},
+		{RunID: "v-3", Repo: "steve/y", PR: 1, Model: "veteran", Provider: "p", DurationSecs: 120},
+	} {
+		if err := s.AddRun(r); err != nil {
+			t.Fatal(err)
+		}
+	}
+	// The first two reports share repo/pr/lens/file/line. The assertions below
+	// (both models confirmed=1 after grading only ids[0]) rely on them resolving
+	// to the same finding — presumably ids[0] == ids[1]; verify against AddReports.
+	ids, err := s.AddReports([]ReportIn{
+		{Repo: "steve/x", PR: 1, Lens: "security", File: "a.go", Line: 1, Title: "shared", Model: "fable", Provider: "p", RunID: "f-1"},
+		{Repo: "steve/x", PR: 1, Lens: "security", File: "a.go", Line: 1, Title: "shared", Model: "veteran", Provider: "p", RunID: "v-1"},
+		{Repo: "steve/x", PR: 2, Lens: "security", File: "b.go", Line: 2, Title: "pr2 only", Model: "veteran", Provider: "p", RunID: "v-2"},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Grade the PR-1 finding "high" and the PR-2-only finding "critical", so any
+	// leak of PR 2 into the filtered board shows up as a critical count.
+	if err := s.AddGrade(Grade{FindingID: ids[0], IsReal: true, Severity: "high"}); err != nil {
+		t.Fatal(err)
+	}
+	if err := s.AddGrade(Grade{FindingID: ids[2], IsReal: true, Severity: "critical"}); err != nil {
+		t.Fatal(err)
+	}
+
+	// Scope the board to steve/x PR 1 only.
+	board, err := s.Scoreboard(ScoreboardFilter{Repo: "steve/x", PRs: []int{1}})
+	if err != nil {
+		t.Fatal(err)
+	}
+	byModel := map[string]ModelStat{}
+	for _, m := range board {
+		byModel[m.Model] = m
+	}
+	// Only run v-1 (120s) should survive for veteran: 1 run, 2 minutes.
+	v := byModel["veteran"]
+	if v.Runs != 1 || v.Minutes != 2 {
+		t.Errorf("veteran runs=%d minutes=%v, want 1 run / 2 min (PR 2 and steve/y excluded)", v.Runs, v.Minutes)
+	}
+	if v.Findings != 1 || v.Confirmed != 1 || v.BySeverity["critical"] != 0 || v.BySeverity["high"] != 1 {
+		t.Errorf("veteran findings=%d confirmed=%d by_severity=%v, want only the PR-1 finding", v.Findings, v.Confirmed, v.BySeverity)
+	}
+	// fable's single PR-1 run and finding are inside the selection, so they stay.
+	fbl := byModel["fable"]
+	if fbl.Runs != 1 || fbl.Findings != 1 || fbl.Confirmed != 1 {
+		t.Errorf("fable runs=%d findings=%d confirmed=%d, want 1/1/1", fbl.Runs, fbl.Findings, fbl.Confirmed)
+	}
+}
+
 // TestFindingIDLocationKeyed: id depends on location, not wording; line matters.
 func TestFindingIDLocationKeyed(t *testing.T) {
 	a := findingID("r", 1, "security", "a.go", 10)
@@ -21,6 +21,7 @@
  input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
  input[type=number] { width:64px; }
  input[type=date] { width:140px; }
+  select[multiple] { min-width:200px; }
  input.search { width:220px; }
  button { cursor:pointer; }
  button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
@@ -64,6 +65,7 @@
      <div class="f"><label>from</label><input type="date" id="from"></div>
      <div class="f"><label>to</label><input type="date" id="to"></div>
      <div class="f"><label>repo</label><select id="repo"></select></div>
+      <div class="f"><label>PRs (⌘/ctrl-click for several)</label><select id="pr" multiple size="4" title="Limit the whole comparison to these PRs — every model is scored only on runs/findings from them. The option label shows how many models ran each PR."></select></div>
      <div class="f"><label>provider</label><select id="provider"></select></div>
      <div class="f"><label>model</label><select id="model"></select></div>
      <div class="f"><label>lens</label><select id="lens"></select></div>
@@ -167,7 +169,35 @@ function opt(sel, vals, label){
  for (const v of vals){ const o = document.createElement("option"); o.value = v; o.textContent = v; sel.appendChild(o); }
  if (vals.includes(cur)) sel.value = cur;
 }
+function prKey(o){ return o.repo + "#" + o.pr; }
+// The PR facet lists every repo#pr with how many models ran it, so it's obvious
+// which PRs are a fair head-to-head (e.g. "steve/x#12 · 5/5 models").
+function buildPRFacet(){
+  const allModels = uniq(RUNS.map(r=>r.model));
+  const byPR = new Map();
+  for (const r of [...RUNS, ...ROWS]){
+    const k = prKey(r);
+    if (!byPR.has(k)) byPR.set(k, new Set());
+    if (r.model) byPR.get(k).add(r.model);
+  }
+  const sel = document.getElementById("pr");
+  const cur = new Set([...sel.selectedOptions].map(o=>o.value));
+  sel.innerHTML = "";
+  const keys = [...byPR.keys()].sort((a,b)=>{
+    const [ra,pa] = splitPR(a), [rb,pb] = splitPR(b);
+    return ra===rb ? pb-pa : ra.localeCompare(rb); // newest PR first within a repo
+  });
+  for (const k of keys){
+    const o = document.createElement("option");
+    o.value = k;
+    o.textContent = `${k} · ${byPR.get(k).size}/${allModels.length} models`;
+    if (cur.has(k)) o.selected = true;
+    sel.appendChild(o);
+  }
+}
+function splitPR(k){ const i = k.lastIndexOf("#"); return [k.slice(0,i), +k.slice(i+1)]; }
 function buildFacets(){
+  buildPRFacet();
  opt(document.getElementById("repo"), uniq([...RUNS.map(r=>r.repo), ...ROWS.map(r=>r.repo)]), "all repos");
  opt(document.getElementById("provider"), uniq([...RUNS.map(r=>r.provider), ...ROWS.map(r=>r.provider)]), "all providers");
  opt(document.getElementById("model"), uniq([...RUNS.map(r=>r.model), ...ROWS.map(r=>r.model)]), "all models");
@@ -196,6 +226,7 @@ function filters(){
  return {
    from: document.getElementById("from").value,
    to: document.getElementById("to").value,
+    prs: new Set([...document.getElementById("pr").selectedOptions].map(o=>o.value)),
    repo: document.getElementById("repo").value,
    provider: document.getElementById("provider").value,
    model: document.getElementById("model").value,
@@ -205,10 +236,13 @@ function filters(){
  };
 }
 function dateOK(ts, f){ const d = (ts||"").slice(0,10); return (!f.from || d >= f.from) && (!f.to || d <= f.to); }
-// run-level filters only (date/repo/provider/model) — severity/lens/search are finding-level.
+// prOK gates a run/row on the PR multi-select: no selection = every PR counts,
+// regardless of which models ran it.
+function prOK(o, f){ return !f.prs.size || f.prs.has(prKey(o)); }
+// run-level filters only (date/repo/provider/model/pr) — severity/lens/search are finding-level.
 function runMatch(r, f){
  return dateOK(r.created_at, f) && (!f.repo || r.repo===f.repo) &&
-         (!f.provider || r.provider===f.provider) && (!f.model || r.model===f.model);
+         (!f.provider || r.provider===f.provider) && (!f.model || r.model===f.model) && prOK(r, f);
 }
 function gradeMatch(row, g){
  if (!g) return true;
@@ -225,6 +259,7 @@ function rowMatch(row, f){
  if (f.lens && row.lens!==f.lens) return false;
  if (!gradeMatch(row, f.grade)) return false;
  if (f.q && !((row.title||"")+" "+(row.file||"")+" "+(row.repo||"")).toLowerCase().includes(f.q)) return false;
+  if (!prOK(row, f)) return false;
  return true;
 }

@@ -238,11 +273,12 @@ function aggregate(f){
  const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
    findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()}); return M.get(m); };

-  for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
+  const prsSeen = new Set();
+  for (const r of RUNS){ if(!runMatch(r,f)) continue; prsSeen.add(prKey(r)); const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
    m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }

  const rows = ROWS.filter(r => rowMatch(r, f) && !HIDDEN.has(r.model));
-  for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
+  for (const r of rows){ prsSeen.add(prKey(r)); const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
    m.findings.add(r.finding_id);
    if (r.graded && r.is_real === true){ m.confirmed.set(r.finding_id, r.severity || ""); }
    else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
@@ -269,7 +305,7 @@ function aggregate(f){
      ptsPerRun: m.runs>0 ? points/m.runs : null,
      confirmedPct: findings>0 ? confirmed/findings*100 : null };
  }).filter(m => (m.runs>0 || m.findings>0) && !HIDDEN.has(m.model));
-  return { models: out, rows };
+  return { models: out, rows, prsSeen };
 }

 const COLS = [
@@ -286,7 +322,7 @@ const COLS = [

 function render(){
  const f = filters();
-  const { models, rows } = aggregate(f);
+  const { models, rows, prsSeen } = aggregate(f);
  models.sort((a,b)=>{
    let x=a[sortKey], y=b[sortKey];
    if (sortKey==="model"||sortKey==="provider"){ x=x||""; y=y||""; return sortAsc ? x.localeCompare(y) : y.localeCompare(x); }
@@ -346,8 +382,11 @@ function render(){
  }

  const tot = models.reduce((a,m)=>({runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points}), {runs:0,min:0,find:0,conf:0,pts:0});
+  const prNote = f.prs.size
+    ? ` · <b>scoped to ${prsSeen.size} PR${prsSeen.size===1?"":"s"}</b>` : "";
  document.getElementById("summary").innerHTML =
    `${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts` +
+    prNote +
    (selModel ? ` · <b>scoped to ${selModel}</b> <span class="pill" onclick="event.stopPropagation();selModel=null;render()">clear</span>` : "");

  // detail
@@ -373,6 +412,7 @@ function esc(s){ return (s==null?"":String(s)).replace(/[&<>]/g, m=>({"&":"&amp;
 function resetFilters(){
  for (const id of ["from","to","q"]) document.getElementById(id).value="";
  for (const id of ["repo","provider","model","lens","grade"]) document.getElementById(id).value="";
+  for (const o of document.getElementById("pr").options) o.selected = false;
  selModel = null; render();
 }