Files
gadfly-reports/ui.html
T
steve 1af115fdf1
Build & push image / build-and-push (push) Successful in 13s
CI / test (push) Successful in 9m51s
feat: PR filter — compare models on the same set of PRs
UI: a repo#pr multi-select (labeled with how many models ran each PR)
scopes the whole table — runs, minutes, findings, points — to the chosen
PRs, so a model with 2 runs can be fairly compared against one with 60.
API: GET /scoreboard accepts ?repo= and ?pr= (repeatable or comma-list).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 22:56:49 -04:00

426 lines
22 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>gadfly-reports · model performance</title>
<style>
/* Design tokens: the dark palette every rule below reads through var(--…). */
:root { --bg:#0f1115; --panel:#171a21; --line:#262b36; --fg:#e6e9ef; --mut:#9aa4b2; --acc:#7aa2f7; --good:#6ee7a0; --bad:#f7768e; --warn:#e0af68; }
* { box-sizing:border-box; }
body { margin:0; background:var(--bg); color:var(--fg); font:14px/1.45 system-ui,-apple-system,Segoe UI,Roboto,sans-serif; }
/* Header bar: a wrapping flex row; .spacer pushes the controls to the right. */
header { display:flex; align-items:center; gap:12px; padding:12px 16px; border-bottom:1px solid var(--line); flex-wrap:wrap; }
h1 { font-size:16px; margin:0; font-weight:600; }
h1 .fly { font-size:18px; }
.mut { color:var(--mut); }
.spacer { flex:1; }
main { padding:16px; }
/* Panels and filter layout: .row is a wrapping flex line, .f is one label-over-control column. */
.panel { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:12px; margin-bottom:14px; }
.row { display:flex; flex-wrap:wrap; gap:10px 14px; align-items:flex-end; }
.f { display:flex; flex-direction:column; gap:3px; }
.f label { font-size:11px; text-transform:uppercase; letter-spacing:.04em; color:var(--mut); }
/* Form controls share one dark skin; widths are tuned per control type. */
input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
input[type=number] { width:64px; }
input[type=date] { width:140px; }
select[multiple] { min-width:200px; }
input.search { width:220px; }
button { cursor:pointer; }
button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
button.link { background:none; border:none; color:var(--acc); padding:0; text-decoration:underline; }
/* Tables: cells are right-aligned by default; the first column and .l cells are left-aligned. */
table { width:100%; border-collapse:collapse; font-variant-numeric:tabular-nums; }
th, td { text-align:right; padding:6px 9px; border-bottom:1px solid var(--line); white-space:nowrap; }
th:first-child, td:first-child, th.l, td.l { text-align:left; }
/* Header cells are sticky and styled as clickable (this applies to every th, including the drill-down table). */
th { color:var(--mut); font-weight:600; cursor:pointer; user-select:none; position:sticky; top:0; background:var(--panel); }
/* Sort indicator on the active column: ▾ by default, ▴ when the header also has .asc. */
th.active::after { content:" ▾"; color:var(--acc); }
th.active.asc::after { content:" ▴"; }
tbody tr:hover { background:#1d212b; }
/* Selected model row; !important keeps it above the hover background. */
tr.sel { background:#23304d !important; }
/* Badges, pills and status colours. */
.sev { display:inline-block; min-width:14px; padding:0 4px; border-radius:4px; font-size:11px; }
.pill { font-size:11px; padding:1px 6px; border:1px solid var(--line); border-radius:999px; color:var(--mut); cursor:pointer; }
.good { color:var(--good); } .bad { color:var(--bad); } .warn { color:var(--warn); }
.num { font-variant-numeric:tabular-nums; }
.tok { display:inline-flex; gap:6px; align-items:center; }
#err { color:var(--bad); }
details > summary { cursor:pointer; color:var(--mut); }
.small { font-size:12px; }
code { background:#0c0e12; padding:1px 5px; border-radius:4px; }
</style>
</head>
<body>
<header>
  <h1><span class="fly">🪰📋</span> gadfly-reports <span class="mut">· model performance</span></h1>
  <span class="spacer"></span>
  <!-- Live region: load() writes "loading…" and then the run/report counts here. -->
  <span id="status" class="mut small" role="status"></span>
  <!-- Token box: hidden until needToken() reveals it after a 401 from the API. -->
  <div class="tok" id="tokbox" style="display:none">
    <!-- aria-label gives the field an accessible name; a placeholder alone is not a label. -->
    <input id="token" type="password" placeholder="store bearer token" size="22" aria-label="store bearer token">
    <button type="button" class="primary" onclick="saveToken()">connect</button>
  </div>
  <button type="button" onclick="load()">↻ refresh</button>
</header>
<main>
<!-- Error line written by load(); role="alert" makes assistive tech announce a failure when it appears. -->
<div id="err" role="alert"></div>
<div class="panel">
  <!-- Filter row: each label is tied to its control with for/id so clicking the label focuses the
       control and screen readers announce the field name. -->
  <div class="row">
    <div class="f"><label for="from">from</label><input type="date" id="from"></div>
    <div class="f"><label for="to">to</label><input type="date" id="to"></div>
    <div class="f"><label for="repo">repo</label><select id="repo"></select></div>
    <div class="f"><label for="pr">PRs (⌘/ctrl-click for several)</label><select id="pr" multiple size="4" title="Limit the whole comparison to these PRs — every model is scored only on runs/findings from them. The option label shows how many models ran each PR."></select></div>
    <div class="f"><label for="provider">provider</label><select id="provider"></select></div>
    <div class="f"><label for="model">model</label><select id="model"></select></div>
    <div class="f"><label for="lens">lens</label><select id="lens"></select></div>
    <div class="f"><label for="grade">grade / severity</label><select id="grade"></select></div>
    <div class="f"><label for="q">search (title/file)</label><input class="search" id="q" placeholder="substring…"></div>
    <!-- The blank label only keeps the reset button on the same baseline as the labelled fields. -->
    <div class="f"><label>&nbsp;</label><button type="button" class="link" onclick="resetFilters()">reset</button></div>
  </div>
  <!-- Points curve and multipliers: read on every render by curve(), fpMult(), soloBonus() and soloErr().
       The visible captions are plain spans, so each input also carries an aria-label. -->
  <div class="row" style="margin-top:10px">
    <div class="f" style="flex-direction:row;align-items:center;gap:8px">
      <label style="text-transform:none">points curve (client-side):</label>
      <span class="small mut">trivial</span><input type="number" id="p_trivial" value="1" aria-label="points for trivial">
      <span class="small mut">small</span><input type="number" id="p_small" value="3" aria-label="points for small">
      <span class="small mut">medium</span><input type="number" id="p_medium" value="5" aria-label="points for medium">
      <span class="small mut">high</span><input type="number" id="p_high" value="8" aria-label="points for high">
      <span class="small mut">critical</span><input type="number" id="p_critical" value="20" aria-label="points for critical">
      <span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" aria-label="false-positive penalty multiplier" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
      <span class="small mut" style="margin-left:18px">solo-find bonus ×</span><input type="number" id="solo_bonus" value="1.5" step="0.5" min="1" aria-label="solo-find bonus multiplier" title="A confirmed finding that NO other model reported scores this × its severity points — rewarding a model for catching what the swarm missed. 1 = no bonus.">
      <span class="small mut" style="margin-left:18px">solo-error penalty ×</span><input type="number" id="solo_err" value="1.5" step="0.5" min="1" aria-label="solo-error penalty multiplier" title="A false positive that NO other model made (a unique wrong claim) multiplies its FP penalty by this — noisier than a shared mistake. 1 = no extra penalty.">
    </div>
  </div>
</div>
<div class="panel">
<!-- Scoreboard. render() fills all of it: #summary gets the totals line, #hidden the restore
     pills for hidden models (kept display:none while nothing is hidden), #mhead the sortable
     column headers and #mbody one row per model. -->
<div id="summary" class="small mut" style="margin-bottom:8px"></div>
<div id="hidden" class="small mut" style="margin-bottom:8px;display:none"></div>
<table id="models">
<thead><tr id="mhead"></tr></thead>
<tbody id="mbody"></tbody>
</table>
</div>
<div class="panel">
  <!-- Drill-down: render() writes the report count into #detcount and up to 1000 finding
       rows into #fbody. The header row is static, so its cells declare scope="col". -->
  <details id="detwrap">
    <summary><span id="detcount">findings</span> — drill down (click a model row above to scope)</summary>
    <table style="margin-top:10px">
      <thead><tr>
        <th class="l" scope="col">reported</th><th class="l" scope="col">repo</th><th scope="col">pr</th><th class="l" scope="col">lens</th>
        <th class="l" scope="col">file:line</th><th class="l" scope="col">title</th><th class="l" scope="col">model</th>
        <th class="l" scope="col">grade</th><th class="l" scope="col">by</th>
      </tr></thead>
      <tbody id="fbody"></tbody>
    </table>
  </details>
</div>
</main>
<script>
// Severity grades. They are the keys of the points curve (one p_<name> input each)
// and of the per-model severity counters.
const SEVS = [
  "trivial",
  "small",
  "medium",
  "high",
  "critical",
];
// Badge background colour for each severity grade.
const SEVCOLOR = {
  trivial: "#3b4252",
  small: "#2e4d3a",
  medium: "#4d4a2e",
  high: "#5a3b2e",
  critical: "#5a2e3a",
};
// Data from the last successful load(): RUNS comes from /runs, ROWS from /export.
let RUNS = [];
let ROWS = [];
// Scoreboard view state: active sort column, sort direction, and the model the
// drill-down is currently scoped to (null = all models).
let sortKey = "ptsPerMin";
let sortAsc = false;
let selModel = null;
// Persistently-excluded models (e.g. retired ones like m1). Hidden from the
// scoreboard, totals, and drill-down; persisted in localStorage across reloads.
// Solo-ness is still computed against ALL models (hiding is a view filter, not a
// rescoring), so hiding one model never fakes another's solo finds.

// Read the hidden set from localStorage key "grt-hidden". Best-effort: a missing
// key, unreadable storage, corrupt JSON or a non-array value all yield an empty set.
function loadHidden(){
  try {
    const stored = JSON.parse(localStorage.getItem("grt-hidden") || "[]");
    // Only an array is a valid payload. Without this check a stored JSON string
    // such as "abc" would be spread into one bogus "model" per character.
    return Array.isArray(stored) ? new Set(stored) : new Set();
  } catch {
    return new Set();
  }
}
let HIDDEN = loadHidden();
// Persist the current hidden set as a sorted JSON array.
function saveHidden(){
  localStorage.setItem("grt-hidden", JSON.stringify([...HIDDEN].sort()));
}
// Hide model `m`; if the drill-down was scoped to it, drop that scope too.
function hideModel(m){
  HIDDEN.add(m);
  if (selModel===m) selModel=null;
  saveHidden();
  render();
}
// Restore a single hidden model.
function showModel(m){
  HIDDEN.delete(m);
  saveHidden();
  render();
}
// Restore every hidden model.
function showAllModels(){
  HIDDEN.clear();
  saveHidden();
  render();
}
// Resolve the bearer token. A non-empty ?token= query parameter wins and is also
// written to localStorage key "grt"; otherwise the stored value (or "") is returned.
function token(){
  const fromQuery = new URL(location.href).searchParams.get("token");
  if (!fromQuery) {
    return localStorage.getItem("grt") || "";
  }
  localStorage.setItem("grt", fromQuery);
  return fromQuery;
}
// Store the trimmed contents of the #token input under "grt", then reload the data.
function saveToken(){
  const entered = document.getElementById("token").value.trim();
  localStorage.setItem("grt", entered);
  load();
}
// Reveal the token entry box in the header.
function needToken(){
  const box = document.getElementById("tokbox");
  box.style.display = "flex";
}
// GET `path` and resolve to its parsed JSON body. An Authorization: Bearer header
// is sent only when token() returns a non-empty value. A 401 reveals the token box
// before throwing; any other non-OK status throws with the path and status code.
async function api(path){
  const bearer = token();
  const headers = bearer ? { "Authorization":"Bearer "+bearer } : {};
  const resp = await fetch(path, { headers });
  if (resp.status === 401) {
    needToken();
    throw new Error("401 — set a valid token");
  }
  if (!resp.ok) {
    throw new Error(path + " → " + resp.status);
  }
  return resp.json();
}
// Fetch /runs and /export in parallel, replace RUNS/ROWS, rebuild the facet
// dropdowns and re-render. Any failure is shown in #err and the status is cleared.
async function load(){
  const errBox = document.getElementById("err");
  const statusBox = document.getElementById("status");
  errBox.textContent = "";
  statusBox.textContent = "loading…";
  try {
    const [runs, rows] = await Promise.all([api("/runs"), api("/export")]);
    RUNS = runs || [];
    ROWS = rows || [];
    // Both requests were authorised, so the token box is no longer needed.
    document.getElementById("tokbox").style.display = "none";
    buildFacets();
    render();
    statusBox.textContent = RUNS.length + " runs · " + ROWS.length + " reports";
  } catch (e) {
    errBox.textContent = String(e.message || e);
    statusBox.textContent = "";
  }
}
// Distinct truthy entries of `vals`, sorted with the default Array sort.
function uniq(vals){
  const seen = new Set();
  for (const v of vals) {
    if (v) seen.add(v);
  }
  return Array.from(seen).sort();
}
// Rebuild the <select> `sel`: one empty-valued option captioned `label`, then one
// option per entry of `vals`. The previous selection is restored when it is still
// among `vals`.
function opt(sel, vals, label){
  const previous = sel.value;
  const makeOption = (value, text) => {
    const node = document.createElement("option");
    node.value = value;
    node.textContent = text;
    return node;
  };
  sel.innerHTML = "";
  sel.appendChild(makeOption("", label));
  vals.forEach(v => { sel.appendChild(makeOption(v, v)); });
  if (vals.includes(previous)) {
    sel.value = previous;
  }
}
// Key identifying a pull request: "<repo>#<pr>", built from a run or report row.
function prKey(o){
  return `${o.repo}#${o.pr}`;
}
// The PR facet lists every repo#pr with how many models ran it, so it's obvious
// which PRs are a fair head-to-head (e.g. "steve/x#12 · 5/5 models").
// Options are rebuilt from RUNS and ROWS; whatever was selected stays selected.
function buildPRFacet(){
  const records = [...RUNS, ...ROWS];
  // Denominator: every model seen in RUNS or ROWS, the same population the per-PR
  // sets below (and the model facet) are drawn from. Counting RUNS alone let a
  // label read e.g. "3/2 models" when a model only appeared in ROWS.
  const allModels = uniq(records.map(r=>r.model));
  const byPR = new Map();
  for (const r of records){
    const k = prKey(r);
    if (!byPR.has(k)) byPR.set(k, new Set());
    if (r.model) byPR.get(k).add(r.model);
  }
  const sel = document.getElementById("pr");
  const cur = new Set([...sel.selectedOptions].map(o=>o.value));
  sel.innerHTML = "";
  const keys = [...byPR.keys()].sort((a,b)=>{
    const [ra,pa] = splitPR(a), [rb,pb] = splitPR(b);
    if (ra !== rb) return ra.localeCompare(rb);
    // Newest PR first within a repo. A non-numeric PR parses to NaN; treat it as 0
    // so the comparator never returns NaN.
    return (pb||0) - (pa||0);
  });
  for (const k of keys){
    const o = document.createElement("option");
    o.value = k;
    o.textContent = `${k} · ${byPR.get(k).size}/${allModels.length} models`;
    if (cur.has(k)) o.selected = true;
    sel.appendChild(o);
  }
}
// Split a "<repo>#<pr>" key at its LAST "#" into [repo, pr-as-number].
function splitPR(k){
  const hashAt = k.lastIndexOf("#");
  const repo = k.slice(0, hashAt);
  const pr = Number(k.slice(hashAt + 1));
  return [repo, pr];
}
// Rebuild every filter dropdown from the loaded data. Repo, provider and model
// draw their values from both RUNS and ROWS; lens comes from ROWS only; the grade
// list is fixed (three grade states followed by each severity).
function buildFacets(){
  const byId = id => document.getElementById(id);
  const fromBoth = key => uniq([...RUNS.map(r=>r[key]), ...ROWS.map(r=>r[key])]);
  buildPRFacet();
  opt(byId("repo"), fromBoth("repo"), "all repos");
  opt(byId("provider"), fromBoth("provider"), "all providers");
  opt(byId("model"), fromBoth("model"), "all models");
  opt(byId("lens"), uniq(ROWS.map(r=>r.lens)), "all lenses");
  opt(byId("grade"), ["ungraded","false-positive","confirmed", ...SEVS], "any grade");
}
// Read the points curve from the p_<severity> inputs: { severity: points }.
// An empty or unparsable input counts as 0 points.
function curve(){
  return Object.fromEntries(SEVS.map(sev => {
    const pts = parseFloat(document.getElementById("p_"+sev).value);
    return [sev, pts || 0];
  }));
}
// False-positive multiplier from #fp_mult; an unparsable value means 0 (no penalty).
function fpMult(){
  const parsed = parseFloat(document.getElementById("fp_mult").value);
  if (Number.isNaN(parsed)) return 0;
  return parsed;
}
// Solo-find bonus multiplier from #solo_bonus; an unparsable value means 1 (no bonus).
function soloBonus(){
  const parsed = parseFloat(document.getElementById("solo_bonus").value);
  if (Number.isNaN(parsed)) return 1;
  return parsed;
}
// Solo-error multiplier from #solo_err; an unparsable value means 1 (no extra penalty).
function soloErr(){
  const parsed = parseFloat(document.getElementById("solo_err").value);
  if (Number.isNaN(parsed)) return 1;
  return parsed;
}
// A false positive carries no graded severity, so it is penalised by the severity
// the MODEL claimed (its lens verdict, raw_severity) mapped onto the curve keys:
// the louder the wrong cry, the bigger the penalty. Matching is a case-insensitive
// substring test, checked in the order listed; the first hit wins.
function rawToSevKey(raw){
  const verdict = (raw||"").toLowerCase();
  const mapping = [
    ["blocking", "high"],
    ["minor", "small"],
    ["no material", "trivial"],
  ];
  for (const [needle, key] of mapping) {
    if (verdict.includes(needle)) return key;
  }
  return "medium"; // unknown / "Reviewed"
}
// Snapshot the filter controls into one object. `prs` is the Set of selected
// repo#pr option values (empty = no PR scoping); `q` is trimmed and lower-cased.
function filters(){
  const val = id => document.getElementById(id).value;
  const chosenPRs = Array.from(document.getElementById("pr").selectedOptions, o => o.value);
  return {
    from: val("from"),
    to: val("to"),
    prs: new Set(chosenPRs),
    repo: val("repo"),
    provider: val("provider"),
    model: val("model"),
    lens: val("lens"),
    grade: val("grade"),
    q: val("q").trim().toLowerCase(),
  };
}
// True when the first 10 characters of timestamp `ts` fall within the inclusive
// f.from..f.to range, compared as strings. An empty bound is not applied.
function dateOK(ts, f){
  const day = (ts||"").slice(0,10);
  if (f.from && !(day >= f.from)) return false;
  if (f.to && !(day <= f.to)) return false;
  return true;
}
// prOK applies the PR multi-select to a run or row. With nothing selected every
// PR passes, regardless of which models ran it; otherwise the record's repo#pr
// key must be one of the selected ones.
function prOK(o, f){
  if (!f.prs.size) return true;
  return f.prs.has(prKey(o));
}
// Run-level filters only (date/repo/provider/model/pr). Severity, lens and search
// are finding-level and never exclude a run.
function runMatch(r, f){
  if (!dateOK(r.created_at, f)) return false;
  if (f.repo && r.repo !== f.repo) return false;
  if (f.provider && r.provider !== f.provider) return false;
  if (f.model && r.model !== f.model) return false;
  return prOK(r, f);
}
// Does `row` satisfy grade filter `g`? An empty filter accepts everything;
// "ungraded", "false-positive" and "confirmed" test the grading state; any other
// value is treated as a specific severity that a confirmed finding must carry.
function gradeMatch(row, g){
  if (!g) return true;
  switch (g) {
    case "ungraded":
      return !row.graded;
    case "false-positive":
      return row.graded && row.is_real === false;
    case "confirmed":
      return row.graded && row.is_real === true;
    default:
      return row.graded && row.is_real === true && row.severity === g; // a specific severity
  }
}
// Finding-level filter: a report row must pass every active filter. Checks run in
// this order: date, repo, provider, model, lens, grade, search, PR scope. The
// search is a substring match over "title file repo", lower-cased.
function rowMatch(row, f){
  const haystack = () =>
    ((row.title||"")+" "+(row.file||"")+" "+(row.repo||"")).toLowerCase();
  return Boolean(
    dateOK(row.reported_at, f) &&
    (!f.repo || row.repo === f.repo) &&
    (!f.provider || row.provider === f.provider) &&
    (!f.model || row.model === f.model) &&
    (!f.lens || row.lens === f.lens) &&
    gradeMatch(row, f.grade) &&
    (!f.q || haystack().includes(f.q)) &&
    prOK(row, f)
  );
}
// Build the scoreboard for filter object `f` (the shape returned by filters()).
// Returns { models, rows, prsSeen }:
//   models  : one summary per non-hidden model with at least one matching run or finding
//   rows    : the ROWS entries that pass rowMatch and whose model is not hidden
//   prsSeen : Set of repo#pr keys touched by any run passing runMatch (hidden models
//             included) or by any kept row
// Runs are gated by runMatch only, so lens/grade/search narrow the findings but not
// the run counts, minutes or token totals.
function aggregate(f){
const c = curve();
// GLOBAL reporter set per finding (ignores filters) — a finding is "solo" when
// exactly one model ever reported it, so the model filter can't fake solo-ness.
const reporters = new Map();
for (const r of ROWS){ if(!reporters.has(r.finding_id)) reporters.set(r.finding_id, new Set()); reporters.get(r.finding_id).add(r.model); }
// Per-model accumulators keyed by model name. Findings are tracked by finding_id in
// Sets/Maps, so repeated rows for the same finding count once per model.
const M = new Map();
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()}); return M.get(m); };
const prsSeen = new Set();
// Runs: count, minutes (duration_secs / 60) and token totals. Each run that carries a
// provider overwrites the model's provider, so the last such run wins.
for (const r of RUNS){ if(!runMatch(r,f)) continue; prsSeen.add(prKey(r)); const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
const rows = ROWS.filter(r => rowMatch(r, f) && !HIDDEN.has(r.model));
// Findings: a row's provider only fills a still-empty provider. Each finding_id is
// bucketed as confirmed (value = graded severity), false positive (value = curve key
// of the severity the model claimed) or ungraded.
for (const r of rows){ prsSeen.add(prKey(r)); const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
m.findings.add(r.finding_id);
if (r.graded && r.is_real === true){ m.confirmed.set(r.finding_id, r.severity || ""); }
else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
else { m.ungraded.add(r.finding_id); }
}
const fpm = fpMult(), sb = soloBonus(), se = soloErr();
// Score each model. A severity missing from the curve contributes 0 points.
const out = [...M.values()].map(m => {
const sevCounts = Object.fromEntries(SEVS.map(s=>[s,0]));
let confirmedPoints = 0, solo = 0;
for (const [fid, sevv] of m.confirmed){
if (sevCounts[sevv] !== undefined) sevCounts[sevv]++;
const isSolo = (reporters.get(fid)?.size || 1) === 1; // only this model ever reported it
if (isSolo) solo++;
confirmedPoints += (c[sevv] || 0) * (isSolo ? sb : 1);
}
let fpPen = 0; for (const [fid, k] of m.fp){ const soloE = (reporters.get(fid)?.size || 1) === 1; fpPen += (c[k]||0) * fpm * (soloE ? se : 1); } // solo (unique) errors penalized extra
const points = confirmedPoints + fpPen; // NET: solo-boosted confirmed + FP penalty
const findings = m.findings.size, confirmed = m.confirmed.size;
// The rate fields are null (not 0) when their denominator is 0.
return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
inTok:m.inTok, outTok:m.outTok, findings, confirmed, solo, fp:m.fp.size, ungraded:m.ungraded.size,
sev:sevCounts, confirmedPoints, fpPen, points,
ptsPerMin: m.minutes>0 ? points/m.minutes : null,
ptsPerRun: m.runs>0 ? points/m.runs : null,
confirmedPct: findings>0 ? confirmed/findings*100 : null };
// Drop models with no matching data and models the user has hidden.
}).filter(m => (m.runs>0 || m.findings>0) && !HIDDEN.has(m.model));
return { models: out, rows, prsSeen };
}
// Scoreboard columns, left to right. Per column: k = property of a model summary
// (also the sort key), t = header text, l = left-align the cell, fmt = optional
// formatter whose return value render() inserts into the cell as HTML.
const COLS = [
  { k: "model", t: "model", l: true },
  { k: "provider", t: "provider", l: true },
  { k: "runs", t: "runs" },
  { k: "minutes", t: "min", fmt: mins => mins.toFixed(1) },
  { k: "findings", t: "findings" },
  { k: "confirmed", t: "real" },
  { k: "solo", t: "solo" },
  { k: "fp", t: "FP" },
  { k: "ungraded", t: "ungr" },
  { k: "confirmedPct", t: "real%", fmt: pct => (pct == null ? "—" : pct.toFixed(0) + "%") },
  { k: "fpPen", t: "fp pen", fmt: pen => (pen ? pen.toFixed(1) : "0") },
  { k: "points", t: "points (net)", fmt: pts => pts.toFixed(0) },
  { k: "ptsPerMin", t: "pts/min", fmt: rate => (rate == null ? "—" : rate.toFixed(2)) },
  { k: "ptsPerRun", t: "pts/run", fmt: rate => (rate == null ? "—" : rate.toFixed(1)) },
  {
    k: "sev",
    t: "by severity",
    l: true,
    // One coloured badge per severity with a non-zero count, e.g. "H2"; "—" when there are none.
    fmt: counts => {
      const badges = SEVS
        .filter(s => counts[s])
        .map(s => `<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${counts[s]}</span>`);
      return badges.join(" ") || "—";
    },
  },
];
// Redraw the page from the current filters and view state: the sortable scoreboard
// (header + one row per model), the hidden-models strip, the totals line and the
// drill-down table (capped at 1000 rows).
//
// Every value that comes from the API (model, provider, repo, pr, lens, file, line,
// title, severity, grader, reported_at) is treated as untrusted: it is either set via
// textContent or passed through esc() before it reaches innerHTML. Only markup built
// here or by the COLS formatters (numbers plus the SEVS/SEVCOLOR constants) is
// inserted as HTML.
function render(){
  const f = filters();
  const { models, rows, prsSeen } = aggregate(f);
  models.sort((a,b)=>{
    let x=a[sortKey], y=b[sortKey];
    if (sortKey==="model"||sortKey==="provider"){ x=x||""; y=y||""; return sortAsc ? x.localeCompare(y) : y.localeCompare(x); }
    if (sortKey==="sev"){
      // Severity counts are objects, so subtracting them would give NaN and leave the
      // order untouched. Compare count by count, most severe grade first.
      for (let i=SEVS.length-1; i>=0; i--){
        const d = x[SEVS[i]] - y[SEVS[i]];
        if (d) return sortAsc ? d : -d;
      }
      return 0;
    }
    // Missing rates (null) sort as -1.
    x = x==null?-1:x; y = y==null?-1:y; return sortAsc ? x-y : y-x;
  });
  // header: clicking the active column flips direction, any other column starts descending
  const hh = document.getElementById("mhead"); hh.innerHTML = "";
  for (const col of COLS){
    const th = document.createElement("th"); th.textContent = col.t; if (col.l) th.className="l";
    if (col.k===sortKey){ th.classList.add("active"); if(sortAsc) th.classList.add("asc"); }
    th.onclick = ()=>{ if(sortKey===col.k) sortAsc=!sortAsc; else { sortKey=col.k; sortAsc=false; } render(); };
    hh.appendChild(th);
  }
  // body: clicking a row toggles the drill-down scope for that model
  const mb = document.getElementById("mbody"); mb.innerHTML = "";
  for (const m of models){
    const tr = document.createElement("tr"); if (m.model===selModel) tr.className="sel";
    tr.onclick = ()=>{ selModel = (selModel===m.model? null : m.model); render(); };
    for (const col of COLS){
      const td = document.createElement("td"); if (col.l) td.className="l";
      const v = m[col.k];
      if (col.k==="model"){
        // model name + a hide control (× pill) — injection-safe via JS handler.
        td.textContent = (v==null?"—":v) + " ";
        const x = document.createElement("span");
        x.className = "pill"; x.textContent = "×"; x.title = "hide this model (persists)";
        x.onclick = (e)=>{ e.stopPropagation(); hideModel(m.model); };
        td.appendChild(x);
      } else if (col.fmt){
        // Formatter output is markup we generated ourselves.
        td.innerHTML = col.fmt(v);
      } else {
        // Raw values (e.g. provider) must never be parsed as HTML.
        td.textContent = v==null ? "—" : v;
      }
      if ((col.k==="ptsPerMin" || col.k==="ptsPerRun" || col.k==="points") && v!=null) td.classList.add(v<0 ? "bad" : "good");
      if (col.k==="fpPen" && v<0) td.classList.add("bad");
      if (col.k==="solo" && v>0) td.classList.add("good");
      if (col.k==="fp" && v>0) td.classList.add("warn");
      tr.appendChild(td);
    }
    mb.appendChild(tr);
  }
  // hidden-models panel: click a model to restore it
  const hid = document.getElementById("hidden");
  if (HIDDEN.size){
    hid.innerHTML = "";
    const lab = document.createElement("span"); lab.textContent = "hidden ("+HIDDEN.size+"): "; hid.appendChild(lab);
    for (const m of [...HIDDEN].sort()){
      const p = document.createElement("span"); p.className="pill"; p.textContent = " "+m;
      p.title = "show this model again"; p.style.marginRight="6px";
      p.onclick = ()=> showModel(m);
      hid.appendChild(p);
    }
    const all = document.createElement("button"); all.className="link"; all.textContent="show all";
    all.onclick = showAllModels; hid.appendChild(all);
    hid.style.display = "";
  } else {
    hid.style.display = "none"; hid.innerHTML = "";
  }
  // totals line
  const tot = models.reduce((a,m)=>({runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points}), {runs:0,min:0,find:0,conf:0,pts:0});
  const prNote = f.prs.size
    ? ` · <b>scoped to ${prsSeen.size} PR${prsSeen.size===1?"":"s"}</b>` : "";
  document.getElementById("summary").innerHTML =
    `${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts` +
    prNote +
    // selModel is an API-supplied model name, so it is escaped before interpolation.
    (selModel ? ` · <b>scoped to ${esc(selModel)}</b> <span class="pill" onclick="event.stopPropagation();selModel=null;render()">clear</span>` : "");
  // detail
  const det = selModel ? rows.filter(r=>r.model===selModel) : rows;
  const fb = document.getElementById("fbody"); fb.innerHTML = "";
  const cap = 1000;
  for (const r of det.slice(0, cap)){
    const tr = document.createElement("tr");
    // Only a known severity may pick a badge colour; the label itself is escaped.
    const sevColor = SEVS.includes(r.severity) ? SEVCOLOR[r.severity] : '#333';
    const grade = !r.graded ? '<span class="mut">ungraded</span>'
      : (r.is_real ? `<span class="sev" style="background:${sevColor}">${esc(r.severity||'real')}</span>` : '<span class="bad">false-pos</span>');
    tr.innerHTML =
      `<td class="l mut">${esc((r.reported_at||"").slice(0,10))}</td><td class="l">${esc(r.repo)}</td><td>${esc(r.pr||"")}</td>`+
      `<td class="l">${esc(r.lens)}</td><td class="l">${esc(r.file)}${r.line?":"+esc(r.line):""}</td>`+
      `<td class="l">${esc(r.title)}</td><td class="l">${esc(r.model)}</td><td class="l">${grade}</td>`+
      `<td class="l mut">${esc(r.grader||"")}</td>`;
    fb.appendChild(tr);
  }
  document.getElementById("detcount").textContent =
    `${det.length} finding-report${det.length===1?"":"s"}` + (det.length>cap?` (showing ${cap})`:"");
}
// HTML-escape `s` for interpolation into markup. null/undefined become ""; any other
// value is stringified first. Escapes & < > and both quote characters, so the result
// is safe in element content and inside quoted attribute values.
function esc(s){
  const ENTITY = {"&":"&amp;","<":"&lt;",">":"&gt;",'"':"&quot;","'":"&#39;"};
  return (s==null?"":String(s)).replace(/[&<>"']/g, ch=>ENTITY[ch]);
}
// Clear every filter control (dates, search, dropdowns, PR multi-select), drop the
// drill-down model scope and redraw. The points-curve inputs are left as they are.
function resetFilters(){
  const blankIds = ["from","to","q","repo","provider","model","lens","grade"];
  blankIds.forEach(id => { document.getElementById(id).value = ""; });
  Array.from(document.getElementById("pr").options).forEach(o => { o.selected = false; });
  selModel = null;
  render();
}
// Any input event from a control inside <main> (filters, points curve) triggers a
// re-render; controls outside <main>, such as the header token box, are ignored.
document.addEventListener("input", (evt) => {
  const insideMain = evt.target.closest("main");
  if (!insideMain) return;
  render();
});
// Initial data fetch on page load.
load();
</script>
</body>
</html>