feat: PR filter — compare models on the same set of PRs
Build & push image / build-and-push (push) Successful in 13s
CI / test (push) Successful in 9m51s

UI: a repo#pr multi-select (labeled with how many models ran each PR)
scopes the whole table — runs, minutes, findings, points — to the chosen
PRs, so a model with 2 runs can be fairly compared against one with 60.
API: GET /scoreboard accepts ?repo= and ?pr= (repeatable or comma-list).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-02 22:55:43 -04:00
parent 2f003dd132
commit 1af115fdf1
6 changed files with 202 additions and 19 deletions
+46 -6
View File
@@ -21,6 +21,7 @@
input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
input[type=number] { width:64px; }
input[type=date] { width:140px; }
select[multiple] { min-width:200px; }
input.search { width:220px; }
button { cursor:pointer; }
button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
@@ -64,6 +65,7 @@
<div class="f"><label>from</label><input type="date" id="from"></div>
<div class="f"><label>to</label><input type="date" id="to"></div>
<div class="f"><label>repo</label><select id="repo"></select></div>
<div class="f"><label>PRs (⌘/ctrl-click for several)</label><select id="pr" multiple size="4" title="Limit the whole comparison to these PRs — every model is scored only on runs/findings from them. The option label shows how many models ran each PR."></select></div>
<div class="f"><label>provider</label><select id="provider"></select></div>
<div class="f"><label>model</label><select id="model"></select></div>
<div class="f"><label>lens</label><select id="lens"></select></div>
@@ -167,7 +169,35 @@ function opt(sel, vals, label){
for (const v of vals){ const o = document.createElement("option"); o.value = v; o.textContent = v; sel.appendChild(o); }
if (vals.includes(cur)) sel.value = cur;
}
// Canonical "repo#pr" identifier for a run or a finding row.
function prKey(o){
  const { repo, pr } = o;
  return `${repo}#${pr}`;
}
// The PR facet lists every repo#pr with how many models ran it, so it's obvious
// which PRs are a fair head-to-head (e.g. "steve/x#12 · 5/5 models").
// Rebuilds the #pr <select> from RUNS and ROWS, re-selecting whatever options
// were selected before the rebuild.
function buildPRFacet(){
  // The per-PR count (numerator) and the overall model count (denominator) are
  // collected from the same records with the same truthy-model guard, so the
  // label can never read "more models than exist" when a model appears only in
  // ROWS (findings) and not in RUNS.
  const allModels = new Set();
  const byPR = new Map();
  for (const r of [...RUNS, ...ROWS]){
    const k = prKey(r);
    if (!byPR.has(k)) byPR.set(k, new Set());
    if (r.model){
      byPR.get(k).add(r.model);
      allModels.add(r.model);
    }
  }
  const sel = document.getElementById("pr");
  // Remember the current selection before the options are thrown away.
  const cur = new Set([...sel.selectedOptions].map(o=>o.value));
  sel.innerHTML = "";
  const keys = [...byPR.keys()].sort((a,b)=>{
    const [ra,pa] = splitPR(a), [rb,pb] = splitPR(b);
    return ra===rb ? pb-pa : ra.localeCompare(rb); // newest PR first within a repo
  });
  for (const k of keys){
    const o = document.createElement("option");
    o.value = k;
    o.textContent = `${k} · ${byPR.get(k).size}/${allModels.size} models`;
    if (cur.has(k)) o.selected = true;
    sel.appendChild(o);
  }
}
// Split a "repo#pr" key at its last "#" into [repo, PR number].
// The PR part is converted with Number(), so a non-numeric suffix yields NaN.
function splitPR(k){
  const hashAt = k.lastIndexOf("#");
  const repo = k.slice(0, hashAt);
  const pr = Number(k.slice(hashAt + 1));
  return [repo, pr];
}
function buildFacets(){
buildPRFacet();
opt(document.getElementById("repo"), uniq([...RUNS.map(r=>r.repo), ...ROWS.map(r=>r.repo)]), "all repos");
opt(document.getElementById("provider"), uniq([...RUNS.map(r=>r.provider), ...ROWS.map(r=>r.provider)]), "all providers");
opt(document.getElementById("model"), uniq([...RUNS.map(r=>r.model), ...ROWS.map(r=>r.model)]), "all models");
@@ -196,6 +226,7 @@ function filters(){
return {
from: document.getElementById("from").value,
to: document.getElementById("to").value,
prs: new Set([...document.getElementById("pr").selectedOptions].map(o=>o.value)),
repo: document.getElementById("repo").value,
provider: document.getElementById("provider").value,
model: document.getElementById("model").value,
@@ -205,10 +236,13 @@ function filters(){
};
}
// True when the first 10 characters of `ts` (its date part) fall within
// [f.from, f.to]. An empty bound is treated as open; a missing `ts` is
// compared as the empty string.
function dateOK(ts, f){
  const day = (ts || "").slice(0, 10);
  if (f.from && !(day >= f.from)) return false;
  if (f.to && !(day <= f.to)) return false;
  return true;
}
// run-level filters only (date/repo/provider/model) — severity/lens/search are finding-level.
// Gate for the PR multi-select. With nothing selected every run/row passes,
// whichever models ran its PR; otherwise the record's repo#pr key must be one
// of the selected values.
function prOK(o, f){
  const chosen = f.prs;
  if (!chosen.size) return true;
  return chosen.has(prKey(o));
}
// run-level filters only (date/repo/provider/model/pr) — severity/lens/search are finding-level.
function runMatch(r, f){
return dateOK(r.created_at, f) && (!f.repo || r.repo===f.repo) &&
(!f.provider || r.provider===f.provider) && (!f.model || r.model===f.model);
(!f.provider || r.provider===f.provider) && (!f.model || r.model===f.model) && prOK(r, f);
}
function gradeMatch(row, g){
if (!g) return true;
@@ -225,6 +259,7 @@ function rowMatch(row, f){
if (f.lens && row.lens!==f.lens) return false;
if (!gradeMatch(row, f.grade)) return false;
if (f.q && !((row.title||"")+" "+(row.file||"")+" "+(row.repo||"")).toLowerCase().includes(f.q)) return false;
if (!prOK(row, f)) return false;
return true;
}
@@ -238,11 +273,12 @@ function aggregate(f){
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()}); return M.get(m); };
for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
const prsSeen = new Set();
for (const r of RUNS){ if(!runMatch(r,f)) continue; prsSeen.add(prKey(r)); const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
const rows = ROWS.filter(r => rowMatch(r, f) && !HIDDEN.has(r.model));
for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
for (const r of rows){ prsSeen.add(prKey(r)); const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
m.findings.add(r.finding_id);
if (r.graded && r.is_real === true){ m.confirmed.set(r.finding_id, r.severity || ""); }
else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
@@ -269,7 +305,7 @@ function aggregate(f){
ptsPerRun: m.runs>0 ? points/m.runs : null,
confirmedPct: findings>0 ? confirmed/findings*100 : null };
}).filter(m => (m.runs>0 || m.findings>0) && !HIDDEN.has(m.model));
return { models: out, rows };
return { models: out, rows, prsSeen };
}
const COLS = [
@@ -286,7 +322,7 @@ const COLS = [
function render(){
const f = filters();
const { models, rows } = aggregate(f);
const { models, rows, prsSeen } = aggregate(f);
models.sort((a,b)=>{
let x=a[sortKey], y=b[sortKey];
if (sortKey==="model"||sortKey==="provider"){ x=x||""; y=y||""; return sortAsc ? x.localeCompare(y) : y.localeCompare(x); }
@@ -346,8 +382,11 @@ function render(){
}
const tot = models.reduce((a,m)=>({runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points}), {runs:0,min:0,find:0,conf:0,pts:0});
const prNote = f.prs.size
? ` · <b>scoped to ${prsSeen.size} PR${prsSeen.size===1?"":"s"}</b>` : "";
document.getElementById("summary").innerHTML =
`${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts` +
prNote +
(selModel ? ` · <b>scoped to ${selModel}</b> <span class="pill" onclick="event.stopPropagation();selModel=null;render()">clear</span>` : "");
// detail
@@ -373,6 +412,7 @@ function esc(s){ return (s==null?"":String(s)).replace(/[&<>]/g, m=>({"&":"&amp;
// Put every filter control back to its empty state, drop the per-model scope,
// and redraw the table.
function resetFilters(){
  const byId = (id) => document.getElementById(id);
  // Date and search inputs first, then the single-value selects.
  ["from","to","q"].forEach((id) => { byId(id).value = ""; });
  ["repo","provider","model","lens","grade"].forEach((id) => { byId(id).value = ""; });
  // The PR control is a multi-select, so each option is deselected individually.
  Array.from(byId("pr").options).forEach((o) => { o.selected = false; });
  selModel = null;
  render();
}