Files
gadfly-reports/ui.html
T
Steve Dudenhoeffer dd8ada479e
CI / test (pull_request) Successful in 9m51s
feat(ui): hide/exclude models from the dashboard (persisted)
Each scoreboard row gets a × to hide that model — for retired ones (m1
etc.) you no longer want in the view. Hidden models drop out of the
table, totals, and the findings drill-down; the set persists in
localStorage (grt-hidden) across reloads, with a "hidden (N): …" bar of
click-to-restore chips + a "show all".

Solo-ness is still computed against ALL models (hiding is a view filter,
not a rescoring), so hiding one model never fakes another's solo finds.
README Dashboard section updated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 20:36:24 -04:00

386 lines
20 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>gadfly-reports · model performance</title>
<style>
/* Theme palette: colour custom properties referenced through var() by the rules below. */
:root { --bg:#0f1115; --panel:#171a21; --line:#262b36; --fg:#e6e9ef; --mut:#9aa4b2; --acc:#7aa2f7; --good:#6ee7a0; --bad:#f7768e; --warn:#e0af68; }
* { box-sizing:border-box; }
/* Page chrome: body, header bar, title, generic helpers. */
body { margin:0; background:var(--bg); color:var(--fg); font:14px/1.45 system-ui,-apple-system,Segoe UI,Roboto,sans-serif; }
header { display:flex; align-items:center; gap:12px; padding:12px 16px; border-bottom:1px solid var(--line); flex-wrap:wrap; }
h1 { font-size:16px; margin:0; font-weight:600; }
h1 .fly { font-size:18px; }
.mut { color:var(--mut); }
.spacer { flex:1; }
main { padding:16px; }
/* Panels and the filter layout: .row is a wrapping flex line, .f stacks a label over its control. */
.panel { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:12px; margin-bottom:14px; }
.row { display:flex; flex-wrap:wrap; gap:10px 14px; align-items:flex-end; }
.f { display:flex; flex-direction:column; gap:3px; }
.f label { font-size:11px; text-transform:uppercase; letter-spacing:.04em; color:var(--mut); }
/* Form controls share one dark style; .primary and .link are button variants. */
input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
input[type=number] { width:64px; }
input[type=date] { width:140px; }
input.search { width:220px; }
button { cursor:pointer; }
button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
button.link { background:none; border:none; color:var(--acc); padding:0; text-decoration:underline; }
/* Tables: cells are right-aligned by default; the first column and .l cells are left-aligned. */
table { width:100%; border-collapse:collapse; font-variant-numeric:tabular-nums; }
th, td { text-align:right; padding:6px 9px; border-bottom:1px solid var(--line); white-space:nowrap; }
th:first-child, td:first-child, th.l, td.l { text-align:left; }
th { color:var(--mut); font-weight:600; cursor:pointer; user-select:none; position:sticky; top:0; background:var(--panel); }
/* Sort indicator on the active header: ▾ by default, ▴ when the header also has .asc. */
th.active::after { content:" ▾"; color:var(--acc); }
th.active.asc::after { content:" ▴"; }
tbody tr:hover { background:#1d212b; }
/* .sel marks the selected row; !important keeps it above the hover background. */
tr.sel { background:#23304d !important; }
/* Small inline widgets: severity badge, clickable pill, status colours. */
.sev { display:inline-block; min-width:14px; padding:0 4px; border-radius:4px; font-size:11px; }
.pill { font-size:11px; padding:1px 6px; border:1px solid var(--line); border-radius:999px; color:var(--mut); cursor:pointer; }
.good { color:var(--good); } .bad { color:var(--bad); } .warn { color:var(--warn); }
.num { font-variant-numeric:tabular-nums; }
.tok { display:inline-flex; gap:6px; align-items:center; }
#err { color:var(--bad); }
details > summary { cursor:pointer; color:var(--mut); }
.small { font-size:12px; }
code { background:#0c0e12; padding:1px 5px; border-radius:4px; }
</style>
</head>
<body>
<header>
<h1><span class="fly">🪰📋</span> gadfly-reports <span class="mut">· model performance</span></h1>
<span class="spacer"></span>
<!-- Load progress / "N runs · M reports"; role="status" lets assistive tech announce updates. -->
<span id="status" class="mut small" role="status"></span>
<!-- Bearer-token prompt: hidden by default, revealed by needToken() after a 401 response. -->
<div class="tok" id="tokbox" style="display:none">
<!-- The placeholder alone is not an accessible name, so the same text is repeated as aria-label. -->
<input id="token" type="password" placeholder="store bearer token" size="22" aria-label="store bearer token" autocomplete="off">
<button type="button" class="primary" onclick="saveToken()">connect</button>
</div>
<button type="button" onclick="load()">↻ refresh</button>
</header>
<main>
<div id="err"></div>
<!-- Filter panel. Every control lives inside <main>, so the document-level "input" listener re-renders on each edit. -->
<div class="panel">
<div class="row">
<!-- Each label is tied to its control with for/id so clicking the label focuses the control and screen readers announce it. -->
<div class="f"><label for="from">from</label><input type="date" id="from"></div>
<div class="f"><label for="to">to</label><input type="date" id="to"></div>
<div class="f"><label for="repo">repo</label><select id="repo"></select></div>
<div class="f"><label for="provider">provider</label><select id="provider"></select></div>
<div class="f"><label for="model">model</label><select id="model"></select></div>
<div class="f"><label for="lens">lens</label><select id="lens"></select></div>
<div class="f"><label for="grade">grade / severity</label><select id="grade"></select></div>
<!-- rowMatch() searches title, file AND repo, so the label names all three. -->
<div class="f"><label for="q">search (title/file/repo)</label><input class="search" id="q" placeholder="substring…"></div>
<div class="f"><label>&nbsp;</label><button type="button" class="link" onclick="resetFilters()">reset</button></div>
</div>
<!-- Scoring knobs read by curve(), fpMult(), soloBonus() and soloErr(). The visible captions are plain spans, so each input carries an aria-label. -->
<div class="row" style="margin-top:10px">
<div class="f" style="flex-direction:row;align-items:center;gap:8px">
<label style="text-transform:none">points curve (client-side):</label>
<span class="small mut">trivial</span><input type="number" id="p_trivial" value="1" aria-label="points for trivial severity">
<span class="small mut">small</span><input type="number" id="p_small" value="3" aria-label="points for small severity">
<span class="small mut">medium</span><input type="number" id="p_medium" value="5" aria-label="points for medium severity">
<span class="small mut">high</span><input type="number" id="p_high" value="8" aria-label="points for high severity">
<span class="small mut">critical</span><input type="number" id="p_critical" value="20" aria-label="points for critical severity">
<span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" aria-label="false-positive penalty multiplier" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
<span class="small mut" style="margin-left:18px">solo-find bonus ×</span><input type="number" id="solo_bonus" value="1.5" step="0.5" min="1" aria-label="solo-find bonus multiplier" title="A confirmed finding that NO other model reported scores this × its severity points — rewarding a model for catching what the swarm missed. 1 = no bonus.">
<span class="small mut" style="margin-left:18px">solo-error penalty ×</span><input type="number" id="solo_err" value="1.5" step="0.5" min="1" aria-label="solo-error penalty multiplier" title="A false positive that NO other model made (a unique wrong claim) multiplies its FP penalty by this — noisier than a shared mistake. 1 = no extra penalty.">
</div>
</div>
</div>
<!-- Scoreboard panel: render() fills #summary (totals), #hidden (restore chips), #mhead and #mbody. -->
<div class="panel">
<div id="summary" class="small mut" style="margin-bottom:8px"></div>
<div id="hidden" class="small mut" style="margin-bottom:8px;display:none"></div>
<table id="models">
<thead><tr id="mhead"></tr></thead>
<tbody id="mbody"></tbody>
</table>
</div>
<!-- Findings drill-down: render() writes the count into #detcount and the rows into #fbody. -->
<div class="panel">
<details id="detwrap">
<summary><span id="detcount">findings</span> — drill down (click a model row above to scope)</summary>
<table style="margin-top:10px">
<thead><tr>
<th class="l">reported</th><th class="l">repo</th><th>pr</th><th class="l">lens</th>
<th class="l">file:line</th><th class="l">title</th><th class="l">model</th>
<th class="l">grade</th><th class="l">by</th>
</tr></thead>
<tbody id="fbody"></tbody>
</table>
</details>
</div>
</main>
<script>
// Severity keys. They name the points-curve inputs (p_<sev>), the per-model
// severity counters, and the specific-severity options of the grade filter.
const SEVS = ["trivial","small","medium","high","critical"];
// Badge background colour per severity key.
const SEVCOLOR = { trivial:"#3b4252", small:"#2e4d3a", medium:"#4d4a2e", high:"#5a3b2e", critical:"#5a2e3a" };
// Latest payloads of GET /runs and GET /export; load() replaces both wholesale.
let RUNS = [], ROWS = [];
// Scoreboard sort column (a COLS key) and direction, plus the model the
// drill-down is scoped to (null = every visible model).
let sortKey = "ptsPerMin", sortAsc = false, selModel = null;
// Persistently-excluded models (e.g. retired ones like m1). Hidden from the
// scoreboard, totals, and drill-down; persisted in localStorage across reloads.
// Solo-ness is still computed against ALL models (hiding is a view filter, not a
// rescoring), so hiding one model never fakes another's solo finds.

// Read the persisted hidden set from localStorage key "grt-hidden".
// Returns an empty Set when the key is missing, the JSON is corrupt, storage is
// unavailable, or the stored value is not an array (a stored string would
// otherwise be spread into a Set of single characters).
function loadHidden(){
  try {
    const saved = JSON.parse(localStorage.getItem("grt-hidden") || "[]");
    return new Set(Array.isArray(saved) ? saved : []);
  } catch {
    return new Set();
  }
}
let HIDDEN = loadHidden();
// Persist HIDDEN as a sorted JSON array. Storage failures (quota exceeded,
// storage disabled) are logged instead of thrown so that callers still reach
// their render() call; the hidden set then only lasts for this page view.
function saveHidden(){
  try {
    localStorage.setItem("grt-hidden", JSON.stringify([...HIDDEN].sort()));
  } catch (e) {
    console.warn("could not persist hidden models:", e);
  }
}
// Hide model m: add it to HIDDEN, drop the drill-down scope if it pointed at m,
// then persist and redraw.
function hideModel(m){
  HIDDEN.add(m);
  if (selModel === m) {
    selModel = null;
  }
  saveHidden();
  render();
}
// Un-hide a single model, then persist and redraw.
function showModel(m){
  HIDDEN.delete(m);
  saveHidden();
  render();
}
// Un-hide every model, then persist and redraw.
function showAllModels(){
  HIDDEN.clear();
  saveHidden();
  render();
}
// Return the bearer token for API calls.
// A "?token=…" query parameter wins: it is copied into localStorage ("grt") and
// then removed from the address bar. Consuming it once matters for two reasons:
//  - otherwise every call would overwrite the stored token with the URL one, so
//    a token entered through the connect box could never replace a stale URL token;
//  - the credential no longer lingers in the visible URL / history entry.
// Without a query token the stored value is returned ("" when none is stored).
function token(){
  const url = new URL(location.href);
  const fromQuery = url.searchParams.get("token");
  if (!fromQuery) return localStorage.getItem("grt") || "";
  localStorage.setItem("grt", fromQuery);
  try {
    url.searchParams.delete("token");
    history.replaceState(history.state, "", url.pathname + url.search + url.hash);
  } catch {
    // The address bar could not be rewritten; the token is still stored and returned.
  }
  return fromQuery;
}
// Store the trimmed contents of the #token field as the bearer token and reload the data.
function saveToken(){
  const field = document.getElementById("token");
  localStorage.setItem("grt", field.value.trim());
  load();
}
// Reveal the token prompt in the header.
function needToken(){
  const box = document.getElementById("tokbox");
  box.style.display = "flex";
}
// GET `path` and resolve to its parsed JSON body.
// Sends "Authorization: Bearer <token>" when a token is available.
// Throws on 401 (after revealing the token prompt) and on any other non-OK status.
async function api(path){
  const bearer = token();
  const headers = {};
  if (bearer) {
    headers["Authorization"] = "Bearer " + bearer;
  }
  const resp = await fetch(path, { headers });
  if (resp.status === 401) {
    needToken();
    throw new Error("401 — set a valid token");
  }
  if (resp.ok) {
    return resp.json();
  }
  throw new Error(path + " → " + resp.status);
}
// Fetch /runs and /export in parallel, replace RUNS/ROWS, rebuild the filter
// dropdowns and redraw. Any failure is shown in #err and the status text is cleared.
async function load(){
  const errBox = document.getElementById("err");
  errBox.textContent = "";
  const statusBox = document.getElementById("status");
  statusBox.textContent = "loading…";
  try {
    const [runs, rows] = await Promise.all([api("/runs"), api("/export")]);
    RUNS = runs || [];
    ROWS = rows || [];
    // A successful load means the token (if any) works, so hide the prompt again.
    document.getElementById("tokbox").style.display = "none";
    buildFacets();
    render();
    statusBox.textContent = `${RUNS.length} runs · ${ROWS.length} reports`;
  } catch (e) {
    errBox.textContent = String(e.message || e);
    statusBox.textContent = "";
  }
}
// Distinct truthy values of `vals`, in default sort order.
function uniq(vals){
  const seen = new Set();
  for (const v of vals) {
    if (v) seen.add(v);
  }
  return Array.from(seen).sort();
}
// Rebuild the <select> `sel`: an empty-valued first option captioned `label`,
// followed by one option per entry of `vals`. The previous selection is kept
// when it is still among `vals`.
function opt(sel, vals, label){
  const previous = sel.value;
  const makeOption = (value, text) => {
    const o = document.createElement("option");
    o.value = value;
    o.textContent = text;
    return o;
  };
  sel.innerHTML = "";
  sel.appendChild(makeOption("", label));
  vals.forEach(v => sel.appendChild(makeOption(v, v)));
  if (vals.includes(previous)) {
    sel.value = previous;
  }
}
// Populate the filter dropdowns from the loaded data. repo/provider/model draw
// on both RUNS and ROWS; lens comes from ROWS only; grade is a fixed list.
function buildFacets(){
  const el = id => document.getElementById(id);
  const fromBoth = key => uniq(RUNS.map(r => r[key]).concat(ROWS.map(r => r[key])));
  opt(el("repo"), fromBoth("repo"), "all repos");
  opt(el("provider"), fromBoth("provider"), "all providers");
  opt(el("model"), fromBoth("model"), "all models");
  opt(el("lens"), uniq(ROWS.map(r => r.lens)), "all lenses");
  opt(el("grade"), ["ungraded", "false-positive", "confirmed"].concat(SEVS), "any grade");
}
// Current points curve: severity key -> value of its p_<sev> input
// (0 when the field is empty, unparsable, or zero).
function curve(){
  return Object.fromEntries(
    SEVS.map(sev => [sev, parseFloat(document.getElementById("p_" + sev).value) || 0])
  );
}
// Parse the numeric input with the given id; `fallback` when it does not parse.
function readNumber(id, fallback){
  const n = parseFloat(document.getElementById(id).value);
  return Number.isNaN(n) ? fallback : n;
}
// False-positive penalty multiplier (0 when unparsable).
function fpMult(){ return readNumber("fp_mult", 0); }
// Solo-find bonus multiplier (1 = no bonus when unparsable).
function soloBonus(){ return readNumber("solo_bonus", 1); }
// Solo-error penalty multiplier (1 = no extra penalty when unparsable).
function soloErr(){ return readNumber("solo_err", 1); }
// False positives carry no graded severity, so they are penalized by what the
// MODEL claimed: its lens verdict (raw_severity) is mapped onto a curve key.
// The first matching substring wins, checked case-insensitively in this order:
// "blocking" -> high, "minor" -> small, "no material" -> trivial.
// Anything else (unknown / "Reviewed" / empty) maps to medium.
function rawToSevKey(raw){
  const verdict = (raw||"").toLowerCase();
  const table = [
    ["blocking", "high"],
    ["minor", "small"],
    ["no material", "trivial"],
  ];
  const hit = table.find(([needle]) => verdict.includes(needle));
  return hit ? hit[1] : "medium";
}
// Snapshot of the filter controls. `q` is trimmed and lower-cased for
// case-insensitive substring search; the other fields are raw control values.
function filters(){
  const valueOf = id => document.getElementById(id).value;
  return {
    from: valueOf("from"),
    to: valueOf("to"),
    repo: valueOf("repo"),
    provider: valueOf("provider"),
    model: valueOf("model"),
    lens: valueOf("lens"),
    grade: valueOf("grade"),
    q: valueOf("q").trim().toLowerCase(),
  };
}
// True when the first 10 characters of timestamp `ts` fall inside the inclusive
// [f.from, f.to] range, compared as strings. An empty bound is not applied.
function dateOK(ts, f){
  const day = (ts||"").slice(0,10);
  if (f.from && !(day >= f.from)) return false;
  if (f.to && !(day <= f.to)) return false;
  return true;
}
// Run-level filtering only: date, repo, provider and model. Severity, lens and
// search apply to findings, not runs, and are ignored here.
function runMatch(r, f){
  if (!dateOK(r.created_at, f)) return false;
  if (f.repo && r.repo !== f.repo) return false;
  if (f.provider && r.provider !== f.provider) return false;
  if (f.model && r.model !== f.model) return false;
  return true;
}
// Does `row` satisfy grade filter `g`?
//   falsy g          -> every row
//   "ungraded"       -> rows not graded yet
//   "false-positive" -> graded rows with is_real === false
//   "confirmed"      -> graded rows with is_real === true
//   anything else    -> confirmed rows whose severity equals g
function gradeMatch(row, g){
  switch (g || "") {
    case "":
      return true;
    case "ungraded":
      return !row.graded;
    case "false-positive":
      return row.graded && row.is_real === false;
    case "confirmed":
      return row.graded && row.is_real === true;
    default:
      return row.graded && row.is_real === true && row.severity === g;
  }
}
// Finding-level filtering: date, exact-match facets (repo/provider/model/lens),
// grade, then a substring search over "title file repo" (lower-cased).
function rowMatch(row, f){
  if (!dateOK(row.reported_at, f)) return false;
  const facets = ["repo", "provider", "model", "lens"];
  if (facets.some(k => f[k] && row[k] !== f[k])) return false;
  if (!gradeMatch(row, f.grade)) return false;
  if (!f.q) return true;
  const haystack = `${row.title||""} ${row.file||""} ${row.repo||""}`.toLowerCase();
  return haystack.includes(f.q);
}
// Build the scoreboard for filter state `f`.
// Returns { models, rows }: `models` holds one stats object per visible model
// that has at least one matching run or finding; `rows` holds the finding rows
// that pass the filters and do not belong to a hidden model.
function aggregate(f){
  const pointsFor = curve();

  // finding_id -> every model that ever reported it. Built from ALL rows,
  // ignoring filters and hiding, so neither can make a shared finding look solo.
  const reporters = new Map();
  for (const row of ROWS){
    let who = reporters.get(row.finding_id);
    if (who === undefined){
      who = new Set();
      reporters.set(row.finding_id, who);
    }
    who.add(row.model);
  }
  const reportedByOne = fid => (reporters.get(fid)?.size || 1) === 1;

  // Per-model accumulator, created on first use.
  const byModel = new Map();
  const bucket = name => {
    let b = byModel.get(name);
    if (b === undefined){
      b = {model:name, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
        findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()};
      byModel.set(name, b);
    }
    return b;
  };

  // Run totals: count, wall-clock minutes and token usage.
  for (const run of RUNS){
    if (!runMatch(run, f)) continue;
    const b = bucket(run.model);
    b.runs += 1;
    b.minutes += (run.duration_secs||0)/60;
    b.inTok += run.input_tokens||0;
    b.outTok += run.output_tokens||0;
    if (run.provider) b.provider = run.provider;
  }

  // Finding rows, sorted into confirmed / false-positive / ungraded per model.
  const rows = ROWS.filter(row => rowMatch(row, f) && !HIDDEN.has(row.model));
  for (const row of rows){
    const b = bucket(row.model);
    if (row.provider) b.provider = b.provider || row.provider;
    b.findings.add(row.finding_id);
    if (!row.graded){
      b.ungraded.add(row.finding_id);
    } else if (row.is_real === true){
      b.confirmed.set(row.finding_id, row.severity || "");
    } else if (row.is_real === false){
      // No graded severity on a false positive: keep the claimed one.
      b.fp.set(row.finding_id, rawToSevKey(row.raw_severity));
    } else {
      b.ungraded.add(row.finding_id);
    }
  }

  const fpm = fpMult(), sb = soloBonus(), se = soloErr();
  const models = [];
  for (const b of byModel.values()){
    const sev = {};
    for (const s of SEVS) sev[s] = 0;

    let confirmedPoints = 0, solo = 0;
    for (const [fid, sevKey] of b.confirmed){
      if (sev[sevKey] !== undefined) sev[sevKey]++;
      const alone = reportedByOne(fid);
      if (alone) solo++;
      confirmedPoints += (pointsFor[sevKey] || 0) * (alone ? sb : 1);
    }

    // Unique (solo) errors are penalized extra.
    let fpPen = 0;
    for (const [fid, claimed] of b.fp){
      fpPen += (pointsFor[claimed]||0) * fpm * (reportedByOne(fid) ? se : 1);
    }

    // NET: solo-boosted confirmed points plus the false-positive penalty.
    const points = confirmedPoints + fpPen;
    const findings = b.findings.size, confirmed = b.confirmed.size;
    models.push({ model:b.model, provider:b.provider, runs:b.runs, minutes:b.minutes,
      inTok:b.inTok, outTok:b.outTok, findings, confirmed, solo, fp:b.fp.size, ungraded:b.ungraded.size,
      sev, confirmedPoints, fpPen, points,
      ptsPerMin: b.minutes>0 ? points/b.minutes : null,
      ptsPerRun: b.runs>0 ? points/b.runs : null,
      confirmedPct: findings>0 ? confirmed/findings*100 : null });
  }
  const visible = models.filter(m => (m.runs>0 || m.findings>0) && !HIDDEN.has(m.model));
  return { models: visible, rows };
}
// Scoreboard column definitions, in display order.
//   k   - property of a per-model stats object (also the sort key)
//   t   - header caption
//   l   - true = left-aligned cell
//   fmt - optional formatter; render() assigns its result to the cell as HTML
const COLS = [
{k:"model", t:"model", l:true}, {k:"provider", t:"provider", l:true},
{k:"runs", t:"runs"}, {k:"minutes", t:"min", fmt:v=>v.toFixed(1)},
{k:"findings", t:"findings"}, {k:"confirmed", t:"real"}, {k:"solo", t:"solo"}, {k:"fp", t:"FP"}, {k:"ungraded", t:"ungr"},
// The ratio columns are null when their denominator is 0 and then show a dash.
{k:"confirmedPct", t:"real%", fmt:v=>v==null?"—":v.toFixed(0)+"%"},
{k:"fpPen", t:"fp pen", fmt:v=>v?v.toFixed(1):"0"},
{k:"points", t:"points (net)", fmt:v=>v.toFixed(0)},
{k:"ptsPerMin", t:"pts/min", fmt:v=>v==null?"—":v.toFixed(2)},
{k:"ptsPerRun", t:"pts/run", fmt:v=>v==null?"—":v.toFixed(1)},
// One coloured badge per severity with a non-zero count: upper-cased initial + count.
{k:"sev", t:"by severity", l:true, fmt:sev=>SEVS.filter(s=>sev[s]).map(s=>`<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${sev[s]}</span>`).join(" ")||"—"},
];
// Comparator for scoreboard rows under the current sortKey / sortAsc.
//  - model/provider: locale string compare (missing values count as "").
//  - sev: compare the per-severity counts from the last SEVS entry down to the
//    first (the counts object itself cannot be subtracted).
//  - everything else: numeric; rows whose value is null (shown as a dash) always
//    sort last, in either direction, instead of being mixed in as -1.
function compareModels(a, b){
  const dir = sortAsc ? 1 : -1;
  if (sortKey === "model" || sortKey === "provider"){
    return dir * (a[sortKey] || "").localeCompare(b[sortKey] || "");
  }
  if (sortKey === "sev"){
    for (let i = SEVS.length - 1; i >= 0; i--){
      const d = a.sev[SEVS[i]] - b.sev[SEVS[i]];
      if (d) return dir * d;
    }
    return 0;
  }
  const x = a[sortKey], y = b[sortKey];
  if (x == null || y == null) return (x == null) - (y == null);
  return dir * (x - y);
}

// Draw the scoreboard header (click to sort) and one row per model (click to scope).
function renderScoreboard(models){
  const head = document.getElementById("mhead"); head.innerHTML = "";
  for (const col of COLS){
    const th = document.createElement("th"); th.textContent = col.t; if (col.l) th.className = "l";
    if (col.k === sortKey){ th.classList.add("active"); if (sortAsc) th.classList.add("asc"); }
    // Clicking the active column flips direction; any other column starts descending.
    th.onclick = () => { if (sortKey === col.k) sortAsc = !sortAsc; else { sortKey = col.k; sortAsc = false; } render(); };
    head.appendChild(th);
  }
  const body = document.getElementById("mbody"); body.innerHTML = "";
  for (const m of models){
    const tr = document.createElement("tr"); if (m.model === selModel) tr.className = "sel";
    tr.onclick = () => { selModel = (selModel === m.model ? null : m.model); render(); };
    for (const col of COLS){
      const td = document.createElement("td"); if (col.l) td.className = "l";
      const v = m[col.k];
      if (col.k === "model"){
        // model name + a hide control (× pill) — injection-safe via JS handler.
        td.textContent = (v == null ? "—" : v) + " ";
        const x = document.createElement("span");
        x.className = "pill"; x.textContent = "×"; x.title = "hide this model (persists)";
        x.onclick = (e) => { e.stopPropagation(); hideModel(m.model); };
        td.appendChild(x);
      } else if (col.fmt){
        // Formatter output is built from numbers and the fixed SEVS/SEVCOLOR tables only.
        td.innerHTML = col.fmt(v);
      } else {
        // Raw values (e.g. the provider name) come from the server: never parse them as HTML.
        td.textContent = v == null ? "—" : v;
      }
      if ((col.k === "ptsPerMin" || col.k === "ptsPerRun" || col.k === "points") && v != null) td.classList.add(v < 0 ? "bad" : "good");
      if (col.k === "fpPen" && v < 0) td.classList.add("bad");
      if (col.k === "solo" && v > 0) td.classList.add("good");
      if (col.k === "fp" && v > 0) td.classList.add("warn");
      tr.appendChild(td);
    }
    body.appendChild(tr);
  }
}

// Draw the "hidden (N): …" bar of click-to-restore chips, or hide it when nothing is hidden.
function renderHiddenBar(){
  const bar = document.getElementById("hidden");
  bar.innerHTML = "";
  if (!HIDDEN.size){ bar.style.display = "none"; return; }
  const lab = document.createElement("span"); lab.textContent = "hidden (" + HIDDEN.size + "): "; bar.appendChild(lab);
  for (const m of [...HIDDEN].sort()){
    const p = document.createElement("span"); p.className = "pill"; p.textContent = " " + m;
    p.title = "show this model again"; p.style.marginRight = "6px";
    p.onclick = () => showModel(m);
    bar.appendChild(p);
  }
  const all = document.createElement("button"); all.className = "link"; all.textContent = "show all";
  all.onclick = showAllModels; bar.appendChild(all);
  bar.style.display = "";
}

// Draw the totals line, plus the "scoped to …" marker with its clear pill.
// Built from DOM nodes so a model name is never interpreted as markup.
function renderSummary(models){
  const tot = models.reduce((a, m) => ({runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points}), {runs:0,min:0,find:0,conf:0,pts:0});
  const box = document.getElementById("summary");
  box.textContent = `${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts`;
  if (!selModel) return;
  box.appendChild(document.createTextNode(" · "));
  const scoped = document.createElement("b"); scoped.textContent = "scoped to " + selModel; box.appendChild(scoped);
  box.appendChild(document.createTextNode(" "));
  const clear = document.createElement("span"); clear.className = "pill"; clear.textContent = "clear";
  clear.onclick = (e) => { e.stopPropagation(); selModel = null; render(); };
  box.appendChild(clear);
}

// Draw the findings drill-down (first 1000 rows) and its count caption.
// Every cell is filled through textContent, so report fields cannot inject markup.
function renderFindings(det){
  const fb = document.getElementById("fbody"); fb.innerHTML = "";
  const cap = 1000;
  const addCell = (tr, cls, content) => {
    const td = document.createElement("td");
    if (cls) td.className = cls;
    if (content instanceof Node) td.appendChild(content);
    else td.textContent = content == null ? "" : String(content);
    tr.appendChild(td);
  };
  const badge = (cls, text) => { const s = document.createElement("span"); s.className = cls; s.textContent = text; return s; };
  for (const r of det.slice(0, cap)){
    const tr = document.createElement("tr");
    let grade;
    if (!r.graded){
      grade = badge("mut", "ungraded");
    } else if (r.is_real){
      grade = badge("sev", r.severity || "real");
      // Only known severities get their palette colour; anything else is neutral grey.
      grade.style.background = SEVS.includes(r.severity) ? SEVCOLOR[r.severity] : "#333";
    } else {
      grade = badge("bad", "false-pos");
    }
    addCell(tr, "l mut", (r.reported_at || "").slice(0, 10));
    addCell(tr, "l", r.repo);
    addCell(tr, "", r.pr || "");
    addCell(tr, "l", r.lens);
    addCell(tr, "l", (r.file == null ? "" : String(r.file)) + (r.line ? ":" + r.line : ""));
    addCell(tr, "l", r.title);
    addCell(tr, "l", r.model);
    addCell(tr, "l", grade);
    addCell(tr, "l mut", r.grader || "");
    fb.appendChild(tr);
  }
  document.getElementById("detcount").textContent =
    `${det.length} finding-report${det.length === 1 ? "" : "s"}` + (det.length > cap ? ` (showing ${cap})` : "");
}

// Recompute the aggregates for the current filters and redraw every dynamic
// region: scoreboard, hidden-models bar, totals line and findings drill-down.
function render(){
  const { models, rows } = aggregate(filters());
  models.sort(compareModels);
  renderScoreboard(models);
  renderHiddenBar();
  renderSummary(models);
  renderFindings(selModel ? rows.filter(r => r.model === selModel) : rows);
}
// HTML-escape `s` for insertion into markup. null/undefined become "".
// Escapes & < > and both quote characters, so the result is safe inside
// quoted attribute values as well as in element content.
function esc(s){
  const entities = {"&":"&amp;", "<":"&lt;", ">":"&gt;", '"':"&quot;", "'":"&#39;"};
  return (s==null ? "" : String(s)).replace(/[&<>"']/g, ch => entities[ch]);
}
// Clear every filter control and the drill-down scope, then redraw.
function resetFilters(){
  const ids = ["from", "to", "q", "repo", "provider", "model", "lens", "grade"];
  ids.forEach(id => { document.getElementById(id).value = ""; });
  selModel = null;
  render();
}
// Any edit to a control inside <main> (filters, points curve) redraws from the
// data already in memory; inputs outside <main> are ignored.
document.addEventListener("input", (ev) => {
  const insideMain = ev.target.closest("main");
  if (insideMain) {
    render();
  }
});
// Initial fetch + render.
load();
</script>
</body>
</html>