Files
gadfly-reports/ui.html
T
steve c15f860853
Build & push image / build-and-push (push) Successful in 20s
CI / test (push) Successful in 10m20s
feat(ui): solo-find bonus — reward a model for catching what others missed
Adds an editable 'solo-find bonus ×' (default 1.5). A confirmed finding reported by exactly one model (derived from the global reporter count per content-addressed finding — no grader flagging needed) scores severity × bonus. New 'solo' column counts uniquely-caught confirmed findings. Solo-ness is computed over ALL data so the model filter can't fake it. Client-side only; store stays point-free.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 12:24:29 -04:00

345 lines
18 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>gadfly-reports · model performance</title>
<style>
/* Theme palette; every colour below is referenced through var(--…). */
:root { --bg:#0f1115; --panel:#171a21; --line:#262b36; --fg:#e6e9ef; --mut:#9aa4b2; --acc:#7aa2f7; --good:#6ee7a0; --bad:#f7768e; --warn:#e0af68; }
* { box-sizing:border-box; }
/* Page chrome: body, top bar, title. */
body { margin:0; background:var(--bg); color:var(--fg); font:14px/1.45 system-ui,-apple-system,Segoe UI,Roboto,sans-serif; }
header { display:flex; align-items:center; gap:12px; padding:12px 16px; border-bottom:1px solid var(--line); flex-wrap:wrap; }
h1 { font-size:16px; margin:0; font-weight:600; }
h1 .fly { font-size:18px; }
.mut { color:var(--mut); }
/* Flexible gap that pushes the items after it to the right edge of the header. */
.spacer { flex:1; }
main { padding:16px; }
/* Layout primitives: .panel is a bordered card, .row a wrapping flex line,
   .f one field with its label stacked above the control. */
.panel { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:12px; margin-bottom:14px; }
.row { display:flex; flex-wrap:wrap; gap:10px 14px; align-items:flex-end; }
.f { display:flex; flex-direction:column; gap:3px; }
.f label { font-size:11px; text-transform:uppercase; letter-spacing:.04em; color:var(--mut); }
/* Form controls share one dark style; widths are fixed per input type. */
input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
input[type=number] { width:64px; }
input[type=date] { width:140px; }
input.search { width:220px; }
button { cursor:pointer; }
button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
button.link { background:none; border:none; color:var(--acc); padding:0; text-decoration:underline; }
/* Tables: right-aligned numeric cells by default; first column and .l cells are left-aligned. */
table { width:100%; border-collapse:collapse; font-variant-numeric:tabular-nums; }
th, td { text-align:right; padding:6px 9px; border-bottom:1px solid var(--line); white-space:nowrap; }
th:first-child, td:first-child, th.l, td.l { text-align:left; }
th { color:var(--mut); font-weight:600; cursor:pointer; user-select:none; position:sticky; top:0; background:var(--panel); }
/* Sort indicator on the active header: ▾ by default, ▴ when .asc is also set. */
th.active::after { content:" ▾"; color:var(--acc); }
th.active.asc::after { content:" ▴"; }
tbody tr:hover { background:#1d212b; }
/* Selected row; !important keeps the highlight while the row is hovered. */
tr.sel { background:#23304d !important; }
/* Small inline badges and semantic text colours. */
.sev { display:inline-block; min-width:14px; padding:0 4px; border-radius:4px; font-size:11px; }
.pill { font-size:11px; padding:1px 6px; border:1px solid var(--line); border-radius:999px; color:var(--mut); cursor:pointer; }
.good { color:var(--good); } .bad { color:var(--bad); } .warn { color:var(--warn); }
.num { font-variant-numeric:tabular-nums; }
.tok { display:inline-flex; gap:6px; align-items:center; }
#err { color:var(--bad); }
details > summary { cursor:pointer; color:var(--mut); }
.small { font-size:12px; }
code { background:#0c0e12; padding:1px 5px; border-radius:4px; }
</style>
</head>
<body>
<!-- Top bar: title, load status, the token prompt (hidden until the store answers 401), refresh. -->
<header>
  <h1><span class="fly">🪰📋</span> gadfly-reports <span class="mut">· model performance</span></h1>
  <span class="spacer"></span>
  <!-- role="status" lets assistive tech announce "loading…" and the loaded counts. -->
  <span id="status" class="mut small" role="status"></span>
  <div class="tok" id="tokbox" style="display:none">
    <!-- A placeholder is not an accessible name, so the input is labelled explicitly. -->
    <input id="token" type="password" placeholder="store bearer token" size="22" aria-label="store bearer token">
    <button type="button" class="primary" onclick="saveToken()">connect</button>
  </div>
  <button type="button" onclick="load()">↻ refresh</button>
</header>
<main>
<div id="err"></div>
<!-- Filter and scoring controls. Every control inside <main> triggers a re-render through the
     document-level "input" listener; each label is tied to its control with for=/id. -->
<div class="panel">
  <div class="row">
    <div class="f"><label for="from">from</label><input type="date" id="from"></div>
    <div class="f"><label for="to">to</label><input type="date" id="to"></div>
    <div class="f"><label for="repo">repo</label><select id="repo"></select></div>
    <div class="f"><label for="provider">provider</label><select id="provider"></select></div>
    <div class="f"><label for="model">model</label><select id="model"></select></div>
    <div class="f"><label for="lens">lens</label><select id="lens"></select></div>
    <div class="f"><label for="grade">grade / severity</label><select id="grade"></select></div>
    <div class="f"><label for="q">search (title/file)</label><input class="search" id="q" placeholder="substring…"></div>
    <div class="f"><label>&nbsp;</label><button type="button" class="link" onclick="resetFilters()">reset</button></div>
  </div>
  <div class="row" style="margin-top:10px">
    <div class="f" style="flex-direction:row;align-items:center;gap:8px">
      <label style="text-transform:none">points curve (client-side):</label>
      <!-- The visible captions are plain spans (kept so the .f label styling does not apply),
           so each number input carries its own aria-label. -->
      <span class="small mut">trivial</span><input type="number" id="p_trivial" value="1" aria-label="trivial points">
      <span class="small mut">small</span><input type="number" id="p_small" value="3" aria-label="small points">
      <span class="small mut">medium</span><input type="number" id="p_medium" value="5" aria-label="medium points">
      <span class="small mut">high</span><input type="number" id="p_high" value="8" aria-label="high points">
      <span class="small mut">critical</span><input type="number" id="p_critical" value="20" aria-label="critical points">
      <span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" aria-label="false-positive penalty multiplier" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
      <span class="small mut" style="margin-left:18px">solo-find bonus ×</span><input type="number" id="solo_bonus" value="1.5" step="0.5" min="1" aria-label="solo-find bonus multiplier" title="A confirmed finding that NO other model reported scores this × its severity points — rewarding a model for catching what the swarm missed. 1 = no bonus.">
    </div>
  </div>
</div>
<!-- Per-model results. #summary, #mhead and #mbody start empty and are filled in by render(). -->
<div class="panel">
<div id="summary" class="small mut" style="margin-bottom:8px"></div>
<table id="models">
<thead><tr id="mhead"></tr></thead>
<tbody id="mbody"></tbody>
</table>
</div>
<!-- Collapsible drill-down of individual finding reports. render() writes the count into
     #detcount and the rows into #fbody; the column headers here are static. -->
<div class="panel">
<details id="detwrap">
<summary><span id="detcount">findings</span> — drill down (click a model row above to scope)</summary>
<table style="margin-top:10px">
<thead><tr>
<th class="l">reported</th><th class="l">repo</th><th>pr</th><th class="l">lens</th>
<th class="l">file:line</th><th class="l">title</th><th class="l">model</th>
<th class="l">grade</th><th class="l">by</th>
</tr></thead>
<tbody id="fbody"></tbody>
</table>
</details>
</div>
</main>
<script>
// Severity grades; each one has a matching #p_<name> points input in the page.
const SEVS = [
  "trivial",
  "small",
  "medium",
  "high",
  "critical",
];
// Badge background colour per severity.
const SEVCOLOR = {
  trivial: "#3b4252",
  small: "#2e4d3a",
  medium: "#4d4a2e",
  high: "#5a3b2e",
  critical: "#5a2e3a",
};
// Payloads from the last successful load(): /runs and /export respectively.
let RUNS = [];
let ROWS = [];
// Model-table sort state, and the model row currently selected for drill-down.
let sortKey = "ptsPerMin";
let sortAsc = false;
let selModel = null;
// Resolve the store bearer token. A ?token=… query parameter takes priority and
// is saved to localStorage under "grt"; otherwise the saved value (or "") is returned.
function token(){
  const fromUrl = new URL(location.href).searchParams.get("token");
  if (!fromUrl) {
    return localStorage.getItem("grt") || "";
  }
  localStorage.setItem("grt", fromUrl);
  return fromUrl;
}
// "connect" button handler: store the trimmed #token input under "grt", then reload the data.
function saveToken(){
  const entered = document.getElementById("token").value;
  localStorage.setItem("grt", entered.trim());
  load();
}
// Reveal the token prompt in the header.
function needToken(){
  const box = document.getElementById("tokbox");
  box.style.display = "flex";
}
// GET `path` and resolve to its parsed JSON body, sending the bearer token when one is set.
// A 401 reveals the token prompt before rejecting; any other non-OK status rejects
// with "<path> → <status>".
async function api(path){
  const bearer = token();
  const headers = {};
  if (bearer) headers["Authorization"] = "Bearer " + bearer;
  const resp = await fetch(path, { headers });
  if (resp.status === 401) {
    needToken();
    throw new Error("401 — set a valid token");
  }
  if (resp.ok) return resp.json();
  throw new Error(path + " → " + resp.status);
}
// Fetch /runs and /export in parallel, replace RUNS/ROWS, rebuild the filter
// dropdowns and redraw. Any failure is written to #err and the status line is cleared.
async function load(){
  const errBox = document.getElementById("err");
  errBox.textContent = "";
  const statusEl = document.getElementById("status");
  statusEl.textContent = "loading…";
  try {
    const [runs, rows] = await Promise.all([api("/runs"), api("/export")]);
    RUNS = runs || [];
    ROWS = rows || [];
    // Both requests succeeded, so the token prompt is no longer needed.
    document.getElementById("tokbox").style.display = "none";
    buildFacets();
    render();
    statusEl.textContent = `${RUNS.length} runs · ${ROWS.length} reports`;
  } catch (e) {
    errBox.textContent = String(e.message || e);
    statusEl.textContent = "";
  }
}
// Distinct truthy values of `vals`, sorted with the default Array.prototype.sort order.
function uniq(vals){
  const seen = new Set();
  for (const v of vals) {
    if (v) seen.add(v);
  }
  return Array.from(seen).sort();
}
// Rebuild a <select>: one empty-valued "all" option captioned `label`, then one
// option per entry of `vals`. The previous selection is kept when it is still offered.
function opt(sel, vals, label){
  const previous = sel.value;
  sel.innerHTML = "";
  const entries = [["", label], ...vals.map(v => [v, v])];
  entries.forEach(([value, text]) => {
    const node = document.createElement("option");
    node.value = value;
    node.textContent = text;
    sel.appendChild(node);
  });
  if (vals.includes(previous)) sel.value = previous;
}
// Refill the five filter dropdowns from the loaded data. repo/provider/model draw
// on both RUNS and ROWS, lens on ROWS only, and grade is a fixed list plus the severities.
function buildFacets(){
  const el = id => document.getElementById(id);
  const fromBoth = key => uniq([...RUNS.map(r => r[key]), ...ROWS.map(r => r[key])]);
  opt(el("repo"), fromBoth("repo"), "all repos");
  opt(el("provider"), fromBoth("provider"), "all providers");
  opt(el("model"), fromBoth("model"), "all models");
  opt(el("lens"), uniq(ROWS.map(r => r.lens)), "all lenses");
  opt(el("grade"), ["ungraded", "false-positive", "confirmed", ...SEVS], "any grade");
}
// Read the points curve from the #p_<severity> inputs as { severity: points }.
// Blank, unparsable or zero input counts as 0.
function curve(){
  return Object.fromEntries(SEVS.map(sev => {
    const raw = document.getElementById("p_" + sev).value;
    return [sev, parseFloat(raw) || 0];
  }));
}
// False-positive multiplier from #fp_mult; unparsable input yields 0 (no penalty).
function fpMult(){
  const parsed = parseFloat(document.getElementById("fp_mult").value);
  return Number.isNaN(parsed) ? 0 : parsed;
}
// Solo-find multiplier from #solo_bonus. The input declares min="1", but browsers
// do not enforce that on typed values, so it is enforced here: anything unparsable,
// non-finite or below 1 falls back to 1 (no bonus) and a solo find is never penalised.
function soloBonus(){
  const v = parseFloat(document.getElementById("solo_bonus").value);
  return Number.isFinite(v) ? Math.max(1, v) : 1;
}
// False positives carry no graded severity, so the penalty is sized by what the
// MODEL claimed: its lens verdict (raw_severity) is mapped onto a curve key.
// The louder the wrong cry, the bigger the penalty.
function rawToSevKey(raw){
  const verdict = (raw||"").toLowerCase();
  // Checked in order; the first substring found wins.
  const table = [
    ["blocking", "high"],
    ["minor", "small"],
    ["no material", "trivial"],
  ];
  const hit = table.find(([needle]) => verdict.includes(needle));
  return hit ? hit[1] : "medium"; // unknown / "Reviewed"
}
// Snapshot of the filter controls. The search text is trimmed and lower-cased;
// every other field is the control's raw value ("" when unset).
function filters(){
  const read = id => document.getElementById(id).value;
  return {
    from: read("from"),
    to: read("to"),
    repo: read("repo"),
    provider: read("provider"),
    model: read("model"),
    lens: read("lens"),
    grade: read("grade"),
    q: read("q").trim().toLowerCase(),
  };
}
// True when the first ten characters of `ts` fall inside [f.from, f.to], compared
// as strings. An unset bound is open; a missing timestamp only passes when no bound
// on its side is set.
function dateOK(ts, f){
  const day = (ts||"").slice(0,10);
  if (f.from && day < f.from) return false;
  if (f.to && day > f.to) return false;
  return true;
}
// Runs are filtered on date/repo/provider/model only; lens, grade and search
// describe findings and are applied in rowMatch instead.
function runMatch(r, f){
  if (!dateOK(r.created_at, f)) return false;
  if (f.repo && r.repo !== f.repo) return false;
  if (f.provider && r.provider !== f.provider) return false;
  if (f.model && r.model !== f.model) return false;
  return true;
}
// Does `row` satisfy the grade filter `g`? An empty filter accepts everything;
// any value other than the three named states is treated as a specific severity
// that a confirmed row must carry.
function gradeMatch(row, g){
  if (!g) return true;
  switch (g) {
    case "ungraded":
      return !row.graded;
    case "false-positive":
      return row.graded && row.is_real === false;
    case "confirmed":
      return row.graded && row.is_real === true;
    default: // a specific severity
      return row.graded && row.is_real === true && row.severity === g;
  }
}
// Finding-level filter: date window, exact repo/provider/model/lens, grade, and a
// case-insensitive substring search across title, file and repo.
function rowMatch(row, f){
  if (!dateOK(row.reported_at, f)) return false;
  const exactOK = ["repo", "provider", "model", "lens"].every(key => !f[key] || row[key] === f[key]);
  if (!exactOK) return false;
  if (!gradeMatch(row, f.grade)) return false;
  if (!f.q) return true;
  const haystack = [row.title, row.file, row.repo].map(part => part || "").join(" ").toLowerCase();
  return haystack.includes(f.q);
}
// Build the per-model scoreboard for filter state `f`.
// Returns { models, rows }: `models` holds one summary object per model that has
// at least one matching run or finding; `rows` is the list of ROWS entries that
// pass rowMatch (used for the drill-down table).
function aggregate(f){
const c = curve();
// GLOBAL reporter set per finding (ignores filters) — a finding is "solo" when
// exactly one model ever reported it, so the model filter can't fake solo-ness.
const reporters = new Map();
for (const r of ROWS){ if(!reporters.has(r.finding_id)) reporters.set(r.finding_id, new Set()); reporters.get(r.finding_id).add(r.model); }
// Accumulator per model name, created on first sight by get(). Findings are held
// in Sets/Maps keyed by finding_id, so repeated reports of one finding by the
// same model count once (for the Maps, the last matching row's value wins).
const M = new Map();
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()}); return M.get(m); };
// Run totals: count, minutes (duration_secs / 60) and token sums. The provider
// of the last matching run that has one overwrites earlier values.
for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
const rows = ROWS.filter(r => rowMatch(r, f));
// Bucket each matching report. A row's provider is only used when the runs did
// not already supply one. Graded rows whose is_real is neither true nor false
// fall through to the ungraded bucket.
for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
m.findings.add(r.finding_id);
if (r.graded && r.is_real === true){ m.confirmed.set(r.finding_id, r.severity || ""); }
else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
else { m.ungraded.add(r.finding_id); }
}
const fpm = fpMult(), sb = soloBonus();
const out = [...M.values()].map(m => {
// Per-severity tally of confirmed findings; severities outside SEVS are not tallied.
const sevCounts = Object.fromEntries(SEVS.map(s=>[s,0]));
let confirmedPoints = 0, solo = 0;
for (const [fid, sevv] of m.confirmed){
if (sevCounts[sevv] !== undefined) sevCounts[sevv]++;
const isSolo = (reporters.get(fid)?.size || 1) === 1; // only this model ever reported it
if (isSolo) solo++;
// A severity with no entry on the curve scores 0 points.
confirmedPoints += (c[sevv] || 0) * (isSolo ? sb : 1);
}
let fpPen = 0; for (const k of m.fp.values()) fpPen += (c[k]||0) * fpm; // negative when fpm<0
const points = confirmedPoints + fpPen; // NET: solo-boosted confirmed + FP penalty
const findings = m.findings.size, confirmed = m.confirmed.size;
// Ratios are null (not 0) when their denominator is zero.
return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
inTok:m.inTok, outTok:m.outTok, findings, confirmed, solo, fp:m.fp.size, ungraded:m.ungraded.size,
sev:sevCounts, confirmedPoints, fpPen, points,
ptsPerMin: m.minutes>0 ? points/m.minutes : null,
ptsPerRun: m.runs>0 ? points/m.runs : null,
confirmedPct: findings>0 ? confirmed/findings*100 : null };
}).filter(m => m.runs>0 || m.findings>0);
return { models: out, rows };
}
// Column definitions for the model table, in display order.
//   k   — key on the per-model summary object (also used as the sort key)
//   t   — header caption
//   l   — left-align the cell
//   fmt — optional formatter; its return value is inserted as HTML
const COLS = [
  { k: "model", t: "model", l: true },
  { k: "provider", t: "provider", l: true },
  { k: "runs", t: "runs" },
  { k: "minutes", t: "min", fmt: v => v.toFixed(1) },
  { k: "findings", t: "findings" },
  { k: "confirmed", t: "real" },
  { k: "solo", t: "solo" },
  { k: "fp", t: "FP" },
  { k: "ungraded", t: "ungr" },
  { k: "confirmedPct", t: "real%", fmt: v => (v == null ? "—" : v.toFixed(0) + "%") },
  { k: "fpPen", t: "fp pen", fmt: v => (v ? v.toFixed(1) : "0") },
  { k: "points", t: "points (net)", fmt: v => v.toFixed(0) },
  { k: "ptsPerMin", t: "pts/min", fmt: v => (v == null ? "—" : v.toFixed(2)) },
  { k: "ptsPerRun", t: "pts/run", fmt: v => (v == null ? "—" : v.toFixed(1)) },
  {
    k: "sev", t: "by severity", l: true,
    // One coloured badge per severity with a non-zero count, e.g. "H2"; "—" when there are none.
    fmt: sev => SEVS
      .filter(s => sev[s])
      .map(s => `<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${sev[s]}</span>`)
      .join(" ") || "—",
  },
];
// Redraw the page from current state: the sortable per-model table, the totals
// line, and the per-report drill-down (narrowed to selModel when a row is selected).
// Strings that come from the API (model, provider, repo, severity, pr, line, …)
// are written with textContent or passed through esc() before reaching innerHTML;
// only the output of a COLS formatter is inserted as markup.
function render(){
  const f = filters();
  const { models, rows } = aggregate(f);
  // Missing (null) or non-numeric values rank below every real number, so a model
  // with a negative net score is never outranked by one that has no score at all.
  const rank = v => (typeof v === "number" && !Number.isNaN(v)) ? v : -Infinity;
  models.sort((a,b)=>{
    let x=a[sortKey], y=b[sortKey];
    if (sortKey==="model"||sortKey==="provider"){ x=x||""; y=y||""; return sortAsc ? x.localeCompare(y) : y.localeCompare(x); }
    x = rank(x); y = rank(y);
    if (x === y) return 0; // also avoids -Infinity - -Infinity = NaN
    return (x < y) === sortAsc ? -1 : 1;
  });
  // header
  const hh = document.getElementById("mhead"); hh.innerHTML = "";
  for (const col of COLS){
    const th = document.createElement("th"); th.textContent = col.t; if (col.l) th.className="l";
    if (col.k===sortKey){ th.classList.add("active"); if(sortAsc) th.classList.add("asc"); }
    // Clicking the active column flips direction; a new column starts descending.
    th.onclick = ()=>{ if(sortKey===col.k) sortAsc=!sortAsc; else { sortKey=col.k; sortAsc=false; } render(); };
    hh.appendChild(th);
  }
  // body
  const mb = document.getElementById("mbody"); mb.innerHTML = "";
  for (const m of models){
    const tr = document.createElement("tr"); if (m.model===selModel) tr.className="sel";
    // Clicking a row toggles the drill-down scope to that model.
    tr.onclick = ()=>{ selModel = (selModel===m.model? null : m.model); render(); };
    for (const col of COLS){
      const td = document.createElement("td"); if (col.l) td.className="l";
      const v = m[col.k];
      // Formatter output is markup; raw values (e.g. model/provider names) are plain text.
      if (col.fmt) td.innerHTML = col.fmt(v);
      else td.textContent = (v==null ? "—" : v);
      if ((col.k==="ptsPerMin" || col.k==="ptsPerRun" || col.k==="points") && v!=null) td.classList.add(v<0 ? "bad" : "good");
      if (col.k==="fpPen" && v<0) td.classList.add("bad");
      if (col.k==="solo" && v>0) td.classList.add("good");
      if (col.k==="fp" && v>0) td.classList.add("warn");
      tr.appendChild(td);
    }
    mb.appendChild(tr);
  }
  const tot = models.reduce((a,m)=>({runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points}), {runs:0,min:0,find:0,conf:0,pts:0});
  document.getElementById("summary").innerHTML =
    `${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts` +
    (selModel ? ` · <b>scoped to ${esc(selModel)}</b> <span class="pill" onclick="event.stopPropagation();selModel=null;render()">clear</span>` : "");
  // detail
  const det = selModel ? rows.filter(r=>r.model===selModel) : rows;
  const fb = document.getElementById("fbody"); fb.innerHTML = "";
  const cap = 1000; // at most this many detail rows are put into the DOM
  for (const r of det.slice(0, cap)){
    const tr = document.createElement("tr");
    // Only known severities index SEVCOLOR, so an unexpected value can neither reach
    // an inherited Object property nor inject into the style attribute.
    const sevColor = SEVS.includes(r.severity) ? SEVCOLOR[r.severity] : '#333';
    const grade = !r.graded ? '<span class="mut">ungraded</span>'
      : (r.is_real ? `<span class="sev" style="background:${sevColor}">${esc(r.severity||'real')}</span>` : '<span class="bad">false-pos</span>');
    tr.innerHTML =
      `<td class="l mut">${esc((r.reported_at||"").slice(0,10))}</td><td class="l">${esc(r.repo)}</td><td>${esc(r.pr||"")}</td>`+
      `<td class="l">${esc(r.lens)}</td><td class="l">${esc(r.file)}${r.line?":"+esc(r.line):""}</td>`+
      `<td class="l">${esc(r.title)}</td><td class="l">${esc(r.model)}</td><td class="l">${grade}</td>`+
      `<td class="l mut">${esc(r.grader||"")}</td>`;
    fb.appendChild(tr);
  }
  document.getElementById("detcount").textContent =
    `${det.length} finding-report${det.length===1?"":"s"}` + (det.length>cap?` (showing ${cap})`:"");
}
// HTML-escape a value for interpolation into markup. null/undefined become "";
// anything else is stringified first. Quotes are escaped as well as & < >, so the
// result is safe inside a quoted attribute value as well as in element text.
function esc(s){
  const entities = { "&":"&amp;", "<":"&lt;", ">":"&gt;", '"':"&quot;", "'":"&#39;" };
  return (s==null?"":String(s)).replace(/[&<>"']/g, ch=>entities[ch]);
}
// "reset" link handler: blank every filter control, drop the model scope, redraw.
// The points-curve and multiplier inputs are left as they are.
function resetFilters(){
  const ids = ["from", "to", "q", "repo", "provider", "model", "lens", "grade"];
  ids.forEach(id => { document.getElementById(id).value = ""; });
  selModel = null;
  render();
}
// Any "input" event from a control inside <main> (filters, points curve,
// multipliers) redraws immediately; the token box in the header is outside <main>.
document.addEventListener("input", function onAnyInput(event){
  const insideMain = event.target.closest("main");
  if (insideMain) render();
});
// Initial fetch when the script runs.
load();
</script>
</body>
</html>