0cb6b25f11
Adds an editable 'false-positive penalty ×' to the dashboard. A false positive carries no graded severity, so it's penalized by the severity the model CLAIMED (its lens verdict / raw_severity, mapped onto the curve: Blocking->high, Minor->small). points(net) = confirmed points + Σ penalty×points[claimed], so a model with a few good finds but many false positives nets down — even negative — and sorts to the bottom. Adds an 'fp pen' column; net points/pts-min/pts-run shown red when negative. Client-side only; the store stays point-free. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
332 lines
17 KiB
HTML
332 lines
17 KiB
HTML
<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||
<title>gadfly-reports · model performance</title>
|
||
<style>
  /* Palette and base layout. */
  :root { --bg:#0f1115; --panel:#171a21; --line:#262b36; --fg:#e6e9ef; --mut:#9aa4b2; --acc:#7aa2f7; --good:#6ee7a0; --bad:#f7768e; --warn:#e0af68; }
  * { box-sizing:border-box; }
  body { margin:0; background:var(--bg); color:var(--fg); font:14px/1.45 system-ui,-apple-system,Segoe UI,Roboto,sans-serif; }
  header { display:flex; align-items:center; gap:12px; padding:12px 16px; border-bottom:1px solid var(--line); flex-wrap:wrap; }
  h1 { font-size:16px; margin:0; font-weight:600; }
  h1 .fly { font-size:18px; }
  .mut { color:var(--mut); }
  .spacer { flex:1; }
  main { padding:16px; }
  .panel { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:12px; margin-bottom:14px; }

  /* Filter rows and form controls. */
  .row { display:flex; flex-wrap:wrap; gap:10px 14px; align-items:flex-end; }
  .f { display:flex; flex-direction:column; gap:3px; }
  .f label { font-size:11px; text-transform:uppercase; letter-spacing:.04em; color:var(--mut); }
  input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
  input[type=number] { width:64px; }
  input[type=date] { width:140px; }
  input.search { width:220px; }
  button { cursor:pointer; }
  button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
  button.link { background:none; border:none; color:var(--acc); padding:0; text-decoration:underline; }

  /* Tables. */
  table { width:100%; border-collapse:collapse; font-variant-numeric:tabular-nums; }
  th, td { text-align:right; padding:6px 9px; border-bottom:1px solid var(--line); white-space:nowrap; }
  th:first-child, td:first-child, th.l, td.l { text-align:left; }
  th { color:var(--mut); font-weight:600; user-select:none; position:sticky; top:0; background:var(--panel); }
  /* Only the model table's headers are click-to-sort; the drill-down headers are static. */
  #models th { cursor:pointer; }
  th.active::after { content:" ▾"; color:var(--acc); }
  th.active.asc::after { content:" ▴"; }
  tbody tr:hover { background:#1d212b; }
  tr.sel { background:#23304d !important; }

  /* Badges, status colours and small helpers. */
  .sev { display:inline-block; min-width:14px; padding:0 4px; border-radius:4px; font-size:11px; }
  .pill { font-size:11px; padding:1px 6px; border:1px solid var(--line); border-radius:999px; color:var(--mut); cursor:pointer; }
  .good { color:var(--good); } .bad { color:var(--bad); } .warn { color:var(--warn); }
  .num { font-variant-numeric:tabular-nums; }
  .tok { display:inline-flex; gap:6px; align-items:center; }
  #err { color:var(--bad); }
  details > summary { cursor:pointer; color:var(--mut); }
  .small { font-size:12px; }
  code { background:#0c0e12; padding:1px 5px; border-radius:4px; }
</style>
|
||
</head>
|
||
<body>
|
||
<!-- Page header: title, load status, token entry (hidden until the store answers 401) and manual refresh. -->
<header>
  <h1><span class="fly" aria-hidden="true">🪰📋</span> gadfly-reports <span class="mut">· model performance</span></h1>
  <span class="spacer"></span>
  <!-- role="status" so "loading…" and the run/report counts are announced politely. -->
  <span class="mut small" id="status" role="status"></span>
  <!-- Visibility is toggled from script through style.display, so the inline style stays. -->
  <div class="tok" id="tokbox" style="display:none">
    <!-- The placeholder alone is not an accessible name; aria-label supplies one. -->
    <input id="token" type="password" placeholder="store bearer token" size="22" autocomplete="off" aria-label="store bearer token">
    <button type="button" class="primary" onclick="saveToken()">connect</button>
  </div>
  <button type="button" onclick="load()">↻ refresh</button>
</header>
|
||
|
||
<main>
|
||
<!-- Load errors are written here from script; role="alert" makes them announced when they appear. -->
<div id="err" role="alert"></div>
|
||
|
||
<!-- Filter panel: every control here triggers a client-side re-render on input. -->
<div class="panel">
  <div class="row">
    <!-- Each label is tied to its control with for/id so clicking it focuses the control and it is announced. -->
    <div class="f"><label for="from">from</label><input type="date" id="from"></div>
    <div class="f"><label for="to">to</label><input type="date" id="to"></div>
    <div class="f"><label for="repo">repo</label><select id="repo"></select></div>
    <div class="f"><label for="provider">provider</label><select id="provider"></select></div>
    <div class="f"><label for="model">model</label><select id="model"></select></div>
    <div class="f"><label for="lens">lens</label><select id="lens"></select></div>
    <div class="f"><label for="grade">grade / severity</label><select id="grade"></select></div>
    <div class="f"><label for="q">search (title/file)</label><input class="search" id="q" placeholder="substring…"></div>
    <!-- The empty label only reserves the label row's height so the reset link lines up with the inputs. -->
    <div class="f"><label aria-hidden="true">&nbsp;</label><button type="button" class="link" onclick="resetFilters()">reset</button></div>
  </div>
  <div class="row" style="margin-top:10px">
    <!-- Points per severity plus the false-positive multiplier; the visible spans stay as the visual labels,
         aria-label gives each number input an accessible name. -->
    <div class="f" role="group" aria-labelledby="curve_label" style="flex-direction:row;align-items:center;gap:8px">
      <label id="curve_label" style="text-transform:none">points curve (client-side):</label>
      <span class="small mut">trivial</span><input type="number" id="p_trivial" value="1" aria-label="trivial points">
      <span class="small mut">small</span><input type="number" id="p_small" value="3" aria-label="small points">
      <span class="small mut">medium</span><input type="number" id="p_medium" value="5" aria-label="medium points">
      <span class="small mut">high</span><input type="number" id="p_high" value="8" aria-label="high points">
      <span class="small mut">critical</span><input type="number" id="p_critical" value="20" aria-label="critical points">
      <span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" aria-label="false-positive penalty multiplier" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
    </div>
  </div>
</div>
|
||
|
||
<!-- Per-model table: the totals line, header row and body rows are all filled in by render(). -->
<div class="panel">
  <div id="summary" class="small mut" style="margin-bottom:8px"></div>
  <table id="models">
    <thead>
      <tr id="mhead"></tr>
    </thead>
    <tbody id="mbody"></tbody>
  </table>
</div>
|
||
|
||
<!-- Drill-down: one row per finding report, filled in by render(); collapsed by default. -->
<div class="panel">
  <details id="detwrap">
    <summary><span id="detcount">findings</span> — drill down (click a model row above to scope)</summary>
    <table style="margin-top:10px">
      <thead>
        <tr>
          <!-- scope="col" ties each header to its column for assistive technology. -->
          <th class="l" scope="col">reported</th>
          <th class="l" scope="col">repo</th>
          <th scope="col">pr</th>
          <th class="l" scope="col">lens</th>
          <th class="l" scope="col">file:line</th>
          <th class="l" scope="col">title</th>
          <th class="l" scope="col">model</th>
          <th class="l" scope="col">grade</th>
          <th class="l" scope="col">by</th>
        </tr>
      </thead>
      <tbody id="fbody"></tbody>
    </table>
  </details>
</div>
|
||
</main>
|
||
|
||
<script>
|
||
// Severity grades in ascending order; also the suffixes of the p_<sev> points inputs.
const SEVS = [
  "trivial",
  "small",
  "medium",
  "high",
  "critical",
];

// Badge background colour for each severity.
const SEVCOLOR = {
  trivial: "#3b4252",
  small: "#2e4d3a",
  medium: "#4d4a2e",
  high: "#5a3b2e",
  critical: "#5a2e3a",
};

// Payloads of the last successful /runs and /export fetches.
let RUNS = [];
let ROWS = [];

// Model-table sort state, and the model whose row currently scopes the drill-down (null = none).
let sortKey = "ptsPerMin";
let sortAsc = false;
let selModel = null;
|
||
|
||
// Resolve the store bearer token. A non-empty ?token= query parameter wins and is
// saved to localStorage under "grt"; otherwise the saved value is returned, or "".
function token(){
  const fromUrl = new URL(location.href).searchParams.get("token");
  if (!fromUrl) return localStorage.getItem("grt") || "";
  localStorage.setItem("grt", fromUrl);
  return fromUrl;
}
|
||
// Store the trimmed contents of the #token input under "grt", then reload the data.
function saveToken(){
  const field = document.getElementById("token");
  localStorage.setItem("grt", field.value.trim());
  load();
}
|
||
// Reveal the token entry box in the header.
function needToken(){
  const box = document.getElementById("tokbox");
  box.style.display = "flex";
}
|
||
|
||
// GET `path`, sending the bearer token when one is set, and resolve to the parsed JSON body.
// A 401 reveals the token box and throws; any other non-OK status throws "<path> → <status>".
async function api(path){
  const bearer = token();
  const headers = {};
  if (bearer) headers["Authorization"] = "Bearer " + bearer;

  const resp = await fetch(path, { headers });
  if (resp.status === 401) {
    needToken();
    throw new Error("401 — set a valid token");
  }
  if (resp.ok) return resp.json();
  throw new Error(path + " → " + resp.status);
}
|
||
|
||
// Fetch /runs and /export in parallel, replace RUNS/ROWS, rebuild the filter
// dropdowns and re-render. Failures are shown in #err and the status line is cleared.
async function load(){
  const status = document.getElementById("status");
  const err = document.getElementById("err");
  err.textContent = "";
  status.textContent = "loading…";
  try {
    const [runs, rows] = await Promise.all([api("/runs"), api("/export")]);
    RUNS = runs || [];
    ROWS = rows || [];
    // Both requests succeeded, so the token (if any) works: hide the entry box.
    document.getElementById("tokbox").style.display = "none";
    buildFacets();
    render();
    status.textContent = RUNS.length + " runs · " + ROWS.length + " reports";
  } catch (e) {
    err.textContent = String(e.message || e);
    status.textContent = "";
  }
}
|
||
|
||
// Distinct truthy values of `vals`, in default sort order.
function uniq(vals){
  const seen = new Set();
  for (const v of vals) {
    if (v) seen.add(v);
  }
  return Array.from(seen).sort();
}
|
||
// Rebuild <select> `sel` as one empty-valued "all" option captioned `label` followed by
// one option per entry of `vals`. The previous selection is kept when it is still offered.
function opt(sel, vals, label){
  const previous = sel.value;
  const makeOption = (value, text) => {
    const o = document.createElement("option");
    o.value = value;
    o.textContent = text;
    return o;
  };

  sel.innerHTML = "";
  sel.appendChild(makeOption("", label));
  vals.forEach(v => sel.appendChild(makeOption(v, v)));
  if (vals.includes(previous)) sel.value = previous;
}
|
||
// Populate the filter dropdowns from the loaded data. repo/provider/model draw on both
// runs and report rows; lens only on report rows; grade is a fixed list plus the severities.
function buildFacets(){
  const el = id => document.getElementById(id);
  const across = key => uniq([...RUNS.map(r => r[key]), ...ROWS.map(r => r[key])]);

  opt(el("repo"), across("repo"), "all repos");
  opt(el("provider"), across("provider"), "all providers");
  opt(el("model"), across("model"), "all models");
  opt(el("lens"), uniq(ROWS.map(r => r.lens)), "all lenses");
  opt(el("grade"), ["ungraded","false-positive","confirmed", ...SEVS], "any grade");
}
|
||
// Read the points curve from the p_<severity> inputs as { severity: points }.
// A blank, unparsable or zero entry counts as 0.
function curve(){
  return Object.fromEntries(
    SEVS.map(s => [s, parseFloat(document.getElementById("p_"+s).value) || 0])
  );
}
|
||
// Current false-positive multiplier from the #fp_mult input; 0 when it does not parse.
function fpMult(){
  const parsed = parseFloat(document.getElementById("fp_mult").value);
  return Number.isNaN(parsed) ? 0 : parsed;
}
|
||
// A false positive carries no graded severity, so it is penalized by the severity
// the MODEL claimed (its lens verdict, raw_severity), mapped onto the points curve:
// the louder the wrong cry, the bigger the penalty. Matching is case-insensitive
// substring, first hit wins; anything unrecognised (e.g. "Reviewed") maps to "medium".
function rawToSevKey(raw){
  const verdict = (raw || "").toLowerCase();
  const table = [
    ["blocking", "high"],
    ["minor", "small"],
    ["no material", "trivial"],
  ];
  for (const [needle, key] of table) {
    if (verdict.includes(needle)) return key;
  }
  return "medium";
}
|
||
// Snapshot of the filter controls. The search text is trimmed and lower-cased;
// every other field is the control's raw value ("" means "no filter").
function filters(){
  const val = id => document.getElementById(id).value;
  return {
    from: val("from"),
    to: val("to"),
    repo: val("repo"),
    provider: val("provider"),
    model: val("model"),
    lens: val("lens"),
    grade: val("grade"),
    q: val("q").trim().toLowerCase(),
  };
}
|
||
// True when the first ten characters of `ts` fall inside the inclusive [f.from, f.to]
// window, compared as strings. An empty bound is ignored; a missing `ts` compares as "".
function dateOK(ts, f){
  const day = (ts || "").slice(0, 10);
  if (f.from && !(day >= f.from)) return false;
  if (f.to && !(day <= f.to)) return false;
  return true;
}
|
||
// Run-level filters only (date/repo/provider/model); severity, lens and search
// apply to findings, not runs.
function runMatch(r, f){
  if (!dateOK(r.created_at, f)) return false;
  if (f.repo && r.repo !== f.repo) return false;
  if (f.provider && r.provider !== f.provider) return false;
  if (f.model && r.model !== f.model) return false;
  return true;
}
|
||
// Does `row` satisfy grade filter `g`? "" matches everything; "ungraded",
// "false-positive" and "confirmed" test the grading state; any other value is
// treated as a severity and matches confirmed rows graded with exactly that severity.
function gradeMatch(row, g){
  if (!g) return true;
  switch (g) {
    case "ungraded":
      return !row.graded;
    case "false-positive":
      return row.graded && row.is_real === false;
    case "confirmed":
      return row.graded && row.is_real === true;
    default:
      return row.graded && row.is_real === true && row.severity === g;
  }
}
|
||
// Finding-level filter: date, repo, provider, model, lens, grade, and a
// case-insensitive substring search over "title file repo".
function rowMatch(row, f){
  const haystack = () => ((row.title||"")+" "+(row.file||"")+" "+(row.repo||"")).toLowerCase();
  // Each entry answers "does this filter reject the row?"; they run in order and stop at the first hit.
  const rejections = [
    () => !dateOK(row.reported_at, f),
    () => f.repo && row.repo !== f.repo,
    () => f.provider && row.provider !== f.provider,
    () => f.model && row.model !== f.model,
    () => f.lens && row.lens !== f.lens,
    () => !gradeMatch(row, f.grade),
    () => f.q && !haystack().includes(f.q),
  ];
  return !rejections.some(rejects => rejects());
}
|
||
|
||
// Build the per-model scoreboard for filter set `f`.
//
// Runs that pass runMatch contribute run count, minutes and token totals. Report rows
// that pass rowMatch contribute findings, de-duplicated per model by finding_id and
// split into confirmed / false positive / ungraded. Points are computed client-side:
//   points(net) = Σ curve[severity] over confirmed findings
//               + Σ curve[claimed severity] × fp multiplier over false positives
// so the net can be negative. Models with neither a run nor a finding are dropped.
//
// Returns { models, rows }: the per-model records and the filtered report rows.
function aggregate(f){
  const c = curve();
  const fpm = fpMult();
  const M = new Map();

  // Fetch the accumulator for `model`, creating it on first use.
  const get = model => {
    if (!M.has(model)) {
      M.set(model, {
        model, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
        findings:new Set(), confirmed:new Set(), fp:new Map(), ungraded:new Set(),
        sev:Object.fromEntries(SEVS.map(s => [s, new Set()])),
      });
    }
    return M.get(model);
  };

  for (const r of RUNS){
    if (!runMatch(r, f)) continue;
    const m = get(r.model);
    m.runs++;
    m.minutes += (r.duration_secs||0)/60;
    m.inTok += r.input_tokens||0;
    m.outTok += r.output_tokens||0;
    if (r.provider) m.provider = r.provider;
  }

  const rows = ROWS.filter(r => rowMatch(r, f));
  for (const r of rows){
    const m = get(r.model);
    // A provider taken from a run wins; a row only fills the gap.
    if (r.provider) m.provider = m.provider || r.provider;
    m.findings.add(r.finding_id);
    if (r.graded && r.is_real === true){
      m.confirmed.add(r.finding_id);
      // Only bucket severities the curve knows. An unexpected value used to hit
      // `undefined.add(...)` and abort the whole render; such a finding still counts
      // as confirmed but earns no points.
      if (SEVS.includes(r.severity)) m.sev[r.severity].add(r.finding_id);
    } else if (r.graded && r.is_real === false){
      // Remember the claimed severity so the penalty scales with it.
      m.fp.set(r.finding_id, rawToSevKey(r.raw_severity));
    } else {
      m.ungraded.add(r.finding_id);
    }
  }

  const out = [...M.values()].map(m => {
    const sevCounts = Object.fromEntries(SEVS.map(s => [s, m.sev[s].size]));
    const confirmedPoints = SEVS.reduce((a, s) => a + c[s]*sevCounts[s], 0);
    let fpPen = 0;
    for (const k of m.fp.values()) fpPen += (c[k]||0) * fpm; // negative when fpm<0
    const points = confirmedPoints + fpPen; // NET of the false-positive penalty
    const findings = m.findings.size, confirmed = m.confirmed.size;
    return {
      model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
      inTok:m.inTok, outTok:m.outTok, findings, confirmed, fp:m.fp.size, ungraded:m.ungraded.size,
      sev:sevCounts, confirmedPoints, fpPen, points,
      // Rates are null (rendered as "—") when there is nothing to divide by.
      ptsPerMin: m.minutes>0 ? points/m.minutes : null,
      ptsPerRun: m.runs>0 ? points/m.runs : null,
      confirmedPct: findings>0 ? confirmed/findings*100 : null,
    };
  }).filter(m => m.runs>0 || m.findings>0);

  return { models: out, rows };
}
|
||
|
||
// Column definitions for the model table, in display order.
//   k   – property of the aggregated model record
//   t   – header text
//   l   – left-align the column
//   fmt – optional formatter; its return value is inserted as HTML
const COLS = [
  { k: "model", t: "model", l: true },
  { k: "provider", t: "provider", l: true },
  { k: "runs", t: "runs" },
  { k: "minutes", t: "min", fmt: v => v.toFixed(1) },
  { k: "findings", t: "findings" },
  { k: "confirmed", t: "real" },
  { k: "fp", t: "FP" },
  { k: "ungraded", t: "ungr" },
  { k: "confirmedPct", t: "real%", fmt: v => (v == null ? "—" : v.toFixed(0) + "%") },
  { k: "fpPen", t: "fp pen", fmt: v => (v ? v.toFixed(1) : "0") },
  { k: "points", t: "points (net)", fmt: v => v.toFixed(0) },
  { k: "ptsPerMin", t: "pts/min", fmt: v => (v == null ? "—" : v.toFixed(2)) },
  { k: "ptsPerRun", t: "pts/run", fmt: v => (v == null ? "—" : v.toFixed(1)) },
  {
    k: "sev", t: "by severity", l: true,
    // One coloured badge per severity with a non-zero count, e.g. "H2"; "—" when there are none.
    fmt: sev => SEVS
      .filter(s => sev[s])
      .map(s => `<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${sev[s]}</span>`)
      .join(" ") || "—",
  },
];
|
||
|
||
// Redraw the whole page from the in-memory RUNS/ROWS and the current filter, sort
// and selection state: model table header and body, totals line, and the drill-down.
//
// Text that comes from the store (model, provider, repo, title, …) is written with
// textContent, never innerHTML, so it cannot inject markup. Only COLS formatter
// output, which is numbers or markup built from the fixed severity list, goes in as HTML.
function render(){
  const f = filters();
  const { models, rows } = aggregate(f);

  // --- sort ---
  const byNumber = (x, y) => (x === y ? 0 : (x < y ? -1 : 1));
  const ascending = (a, b) => {
    if (sortKey === "model" || sortKey === "provider") {
      return (a[sortKey] || "").localeCompare(b[sortKey] || "");
    }
    if (sortKey === "sev") {
      // Per-severity counts: compare from the most severe grade down.
      for (let i = SEVS.length - 1; i >= 0; i--) {
        const d = byNumber(a.sev[SEVS[i]], b.sev[SEVS[i]]);
        if (d) return d;
      }
      return 0;
    }
    // A missing metric ranks below every real number. Net points can be negative,
    // so a fixed -1 stand-in would outrank net-negative models.
    const x = a[sortKey] == null ? -Infinity : a[sortKey];
    const y = b[sortKey] == null ? -Infinity : b[sortKey];
    return byNumber(x, y);
  };
  models.sort((a, b) => (sortAsc ? ascending(a, b) : ascending(b, a)));

  // --- header: click sorts by that column, clicking the active column flips direction ---
  const hh = document.getElementById("mhead");
  hh.innerHTML = "";
  for (const col of COLS){
    const th = document.createElement("th");
    th.textContent = col.t;
    if (col.l) th.className = "l";
    if (col.k === sortKey){
      th.classList.add("active");
      if (sortAsc) th.classList.add("asc");
    }
    th.onclick = () => {
      if (sortKey === col.k) sortAsc = !sortAsc;
      else { sortKey = col.k; sortAsc = false; }
      render();
    };
    hh.appendChild(th);
  }

  // --- body: clicking a row scopes the drill-down to that model, clicking again clears it ---
  const mb = document.getElementById("mbody");
  mb.innerHTML = "";
  for (const m of models){
    const tr = document.createElement("tr");
    if (m.model === selModel) tr.className = "sel";
    tr.onclick = () => { selModel = (selModel === m.model ? null : m.model); render(); };
    for (const col of COLS){
      const td = document.createElement("td");
      if (col.l) td.className = "l";
      const v = m[col.k];
      if (col.fmt) td.innerHTML = col.fmt(v);
      else td.textContent = v == null ? "—" : v;
      if ((col.k === "ptsPerMin" || col.k === "ptsPerRun" || col.k === "points") && v != null) td.classList.add(v < 0 ? "bad" : "good");
      if (col.k === "fpPen" && v < 0) td.classList.add("bad");
      if (col.k === "fp" && v > 0) td.classList.add("warn");
      tr.appendChild(td);
    }
    mb.appendChild(tr);
  }

  // --- totals line ---
  const tot = models.reduce(
    (a, m) => ({ runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points }),
    { runs:0, min:0, find:0, conf:0, pts:0 }
  );
  const summary = document.getElementById("summary");
  summary.textContent =
    `${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts`;
  if (selModel){
    const scoped = document.createElement("b");
    scoped.textContent = "scoped to " + selModel;
    const clear = document.createElement("span");
    clear.className = "pill";
    clear.textContent = "clear";
    clear.onclick = e => { e.stopPropagation(); selModel = null; render(); };
    summary.append(" · ", scoped, " ", clear);
  }

  // --- drill-down: at most `cap` rows are materialised ---
  const det = selModel ? rows.filter(r => r.model === selModel) : rows;
  const fb = document.getElementById("fbody");
  fb.innerHTML = "";
  const cap = 1000;
  const cell = (text, cls) => {
    const td = document.createElement("td");
    if (cls) td.className = cls;
    td.textContent = text == null ? "" : String(text);
    return td;
  };
  for (const r of det.slice(0, cap)){
    const tr = document.createElement("tr");

    const badge = document.createElement("span");
    if (!r.graded){
      badge.className = "mut";
      badge.textContent = "ungraded";
    } else if (r.is_real){
      badge.className = "sev";
      badge.style.background = SEVS.includes(r.severity) ? SEVCOLOR[r.severity] : "#333";
      badge.textContent = r.severity || "real";
    } else {
      badge.className = "bad";
      badge.textContent = "false-pos";
    }
    const gradeCell = document.createElement("td");
    gradeCell.className = "l";
    gradeCell.appendChild(badge);

    tr.append(
      cell((r.reported_at || "").slice(0, 10), "l mut"),
      cell(r.repo, "l"),
      cell(r.pr || ""),
      cell(r.lens, "l"),
      cell((r.file == null ? "" : r.file) + (r.line ? ":" + r.line : ""), "l"),
      cell(r.title, "l"),
      cell(r.model, "l"),
      gradeCell,
      cell(r.grader || "", "l mut")
    );
    fb.appendChild(tr);
  }
  document.getElementById("detcount").textContent =
    `${det.length} finding-report${det.length === 1 ? "" : "s"}` + (det.length > cap ? ` (showing ${cap})` : "");
}
|
||
// Escape `s` for insertion into HTML text or a quoted attribute value.
// null/undefined become ""; anything else is stringified first. Each of & < > " '
// is replaced by its numeric character reference, built from the character code so
// the source holds no literal entity that an HTML-decoding tool could collapse
// back into the raw character.
function esc(s){
  const text = s == null ? "" : String(s);
  return text.replace(/[&<>"']/g, ch => "&#" + ch.charCodeAt(0) + ";");
}
|
||
|
||
// Blank every filter control, drop the model scope and re-render.
// The points curve and the false-positive multiplier are left untouched.
function resetFilters(){
  const ids = ["from","to","q","repo","provider","model","lens","grade"];
  ids.forEach(id => { document.getElementById(id).value = ""; });
  selModel = null;
  render();
}
|
||
|
||
// Any input event from inside <main> (filters, points curve, penalty multiplier)
// re-renders from the data already loaded; nothing is refetched.
document.addEventListener("input", function (e) {
  if (!e.target.closest("main")) return;
  render();
});

// Initial fetch on page load.
load();
|
||
</script>
|
||
</body>
|
||
</html>
|