Files
gadfly-reports/ui.html
T
steve 14cbee8e25
Build & push image / build-and-push (push) Successful in 20s
CI / test (push) Successful in 10m22s
feat: solo-error penalty + fast healthcheck (instant Traefik restart)
Dashboard: add an editable 'solo-error penalty ×' (default 1.5) — a false positive only one model made (a unique wrong claim, derived from reporter count) multiplies its FP penalty, mirroring the solo-find bonus. Client-side; store stays point-free.

Deploy: speed up the healthcheck (image HEALTHCHECK + compose example: interval 30s->5s, start_period 10s, start_interval 1s). Traefik gates routing on the Docker health status, so the old 30s-to-first-probe meant ~30s of 502s after a restart; the daemon binds the port in ms, so it now goes healthy in ~1s. Data is on the volume; only fire-and-forget emits in the ~1s window are at risk.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 12:45:07 -04:00

347 lines
18 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>gadfly-reports · model performance</title>
<style>
:root { --bg:#0f1115; --panel:#171a21; --line:#262b36; --fg:#e6e9ef; --mut:#9aa4b2; --acc:#7aa2f7; --good:#6ee7a0; --bad:#f7768e; --warn:#e0af68; }
* { box-sizing:border-box; }
body { margin:0; background:var(--bg); color:var(--fg); font:14px/1.45 system-ui,-apple-system,Segoe UI,Roboto,sans-serif; }
header { display:flex; align-items:center; gap:12px; padding:12px 16px; border-bottom:1px solid var(--line); flex-wrap:wrap; }
h1 { font-size:16px; margin:0; font-weight:600; }
h1 .fly { font-size:18px; }
.mut { color:var(--mut); }
.spacer { flex:1; }
main { padding:16px; }
.panel { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:12px; margin-bottom:14px; }
.row { display:flex; flex-wrap:wrap; gap:10px 14px; align-items:flex-end; }
.f { display:flex; flex-direction:column; gap:3px; }
.f label { font-size:11px; text-transform:uppercase; letter-spacing:.04em; color:var(--mut); }
input, select, button { background:#0c0e12; color:var(--fg); border:1px solid var(--line); border-radius:6px; padding:6px 8px; font:inherit; }
input[type=number] { width:64px; }
input[type=date] { width:140px; }
input.search { width:220px; }
button { cursor:pointer; }
button.primary { background:var(--acc); color:#0c0e12; border-color:var(--acc); font-weight:600; }
button.link { background:none; border:none; color:var(--acc); padding:0; text-decoration:underline; }
table { width:100%; border-collapse:collapse; font-variant-numeric:tabular-nums; }
th, td { text-align:right; padding:6px 9px; border-bottom:1px solid var(--line); white-space:nowrap; }
th:first-child, td:first-child, th.l, td.l { text-align:left; }
th { color:var(--mut); font-weight:600; cursor:pointer; user-select:none; position:sticky; top:0; background:var(--panel); }
th.active::after { content:" ▾"; color:var(--acc); }
th.active.asc::after { content:" ▴"; }
tbody tr:hover { background:#1d212b; }
tr.sel { background:#23304d !important; }
.sev { display:inline-block; min-width:14px; padding:0 4px; border-radius:4px; font-size:11px; }
.pill { font-size:11px; padding:1px 6px; border:1px solid var(--line); border-radius:999px; color:var(--mut); cursor:pointer; }
.good { color:var(--good); } .bad { color:var(--bad); } .warn { color:var(--warn); }
.num { font-variant-numeric:tabular-nums; }
.tok { display:inline-flex; gap:6px; align-items:center; }
#err { color:var(--bad); }
details > summary { cursor:pointer; color:var(--mut); }
.small { font-size:12px; }
code { background:#0c0e12; padding:1px 5px; border-radius:4px; }
</style>
</head>
<body>
<header>
<h1><span class="fly">🪰📋</span> gadfly-reports <span class="mut">· model performance</span></h1>
<span class="spacer"></span>
<span id="status" class="mut small"></span>
<div class="tok" id="tokbox" style="display:none">
<input id="token" type="password" placeholder="store bearer token" size="22">
<button class="primary" onclick="saveToken()">connect</button>
</div>
<button onclick="load()">↻ refresh</button>
</header>
<main>
<div id="err"></div>
<div class="panel">
<div class="row">
<div class="f"><label>from</label><input type="date" id="from"></div>
<div class="f"><label>to</label><input type="date" id="to"></div>
<div class="f"><label>repo</label><select id="repo"></select></div>
<div class="f"><label>provider</label><select id="provider"></select></div>
<div class="f"><label>model</label><select id="model"></select></div>
<div class="f"><label>lens</label><select id="lens"></select></div>
<div class="f"><label>grade / severity</label><select id="grade"></select></div>
<div class="f"><label>search (title/file)</label><input class="search" id="q" placeholder="substring…"></div>
<div class="f"><label>&nbsp;</label><button class="link" onclick="resetFilters()">reset</button></div>
</div>
<div class="row" style="margin-top:10px">
<div class="f" style="flex-direction:row;align-items:center;gap:8px">
<label style="text-transform:none">points curve (client-side):</label>
<span class="small mut">trivial</span><input type="number" id="p_trivial" value="1">
<span class="small mut">small</span><input type="number" id="p_small" value="3">
<span class="small mut">medium</span><input type="number" id="p_medium" value="5">
<span class="small mut">high</span><input type="number" id="p_high" value="8">
<span class="small mut">critical</span><input type="number" id="p_critical" value="20">
<span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
<span class="small mut" style="margin-left:18px">solo-find bonus ×</span><input type="number" id="solo_bonus" value="1.5" step="0.5" min="1" title="A confirmed finding that NO other model reported scores this × its severity points — rewarding a model for catching what the swarm missed. 1 = no bonus.">
<span class="small mut" style="margin-left:18px">solo-error penalty ×</span><input type="number" id="solo_err" value="1.5" step="0.5" min="1" title="A false positive that NO other model made (a unique wrong claim) multiplies its FP penalty by this — noisier than a shared mistake. 1 = no extra penalty.">
</div>
</div>
</div>
<div class="panel">
<div id="summary" class="small mut" style="margin-bottom:8px"></div>
<table id="models">
<thead><tr id="mhead"></tr></thead>
<tbody id="mbody"></tbody>
</table>
</div>
<div class="panel">
<details id="detwrap">
<summary><span id="detcount">findings</span> — drill down (click a model row above to scope)</summary>
<table style="margin-top:10px">
<thead><tr>
<th class="l">reported</th><th class="l">repo</th><th>pr</th><th class="l">lens</th>
<th class="l">file:line</th><th class="l">title</th><th class="l">model</th>
<th class="l">grade</th><th class="l">by</th>
</tr></thead>
<tbody id="fbody"></tbody>
</table>
</details>
</div>
</main>
<script>
const SEVS = ["trivial","small","medium","high","critical"];
const SEVCOLOR = { trivial:"#3b4252", small:"#2e4d3a", medium:"#4d4a2e", high:"#5a3b2e", critical:"#5a2e3a" };
let RUNS = [], ROWS = [];
let sortKey = "ptsPerMin", sortAsc = false, selModel = null;
function token(){
const q = new URL(location.href).searchParams.get("token");
if (q) { localStorage.setItem("grt", q); return q; }
return localStorage.getItem("grt") || "";
}
function saveToken(){ localStorage.setItem("grt", document.getElementById("token").value.trim()); load(); }
function needToken(){ document.getElementById("tokbox").style.display = "flex"; }
async function api(path){
const t = token();
const r = await fetch(path, { headers: t ? { "Authorization":"Bearer "+t } : {} });
if (r.status === 401) { needToken(); throw new Error("401 — set a valid token"); }
if (!r.ok) throw new Error(path + " → " + r.status);
return r.json();
}
async function load(){
const err = document.getElementById("err"); err.textContent = "";
document.getElementById("status").textContent = "loading…";
try {
const [runs, rows] = await Promise.all([api("/runs"), api("/export")]);
RUNS = runs || []; ROWS = rows || [];
document.getElementById("tokbox").style.display = "none";
buildFacets(); render();
document.getElementById("status").textContent =
RUNS.length + " runs · " + ROWS.length + " reports";
} catch (e) {
err.textContent = String(e.message || e);
document.getElementById("status").textContent = "";
}
}
function uniq(vals){ return [...new Set(vals.filter(Boolean))].sort(); }
function opt(sel, vals, label){
const cur = sel.value;
sel.innerHTML = "";
const a = document.createElement("option"); a.value = ""; a.textContent = label; sel.appendChild(a);
for (const v of vals){ const o = document.createElement("option"); o.value = v; o.textContent = v; sel.appendChild(o); }
if (vals.includes(cur)) sel.value = cur;
}
function buildFacets(){
opt(document.getElementById("repo"), uniq([...RUNS.map(r=>r.repo), ...ROWS.map(r=>r.repo)]), "all repos");
opt(document.getElementById("provider"), uniq([...RUNS.map(r=>r.provider), ...ROWS.map(r=>r.provider)]), "all providers");
opt(document.getElementById("model"), uniq([...RUNS.map(r=>r.model), ...ROWS.map(r=>r.model)]), "all models");
opt(document.getElementById("lens"), uniq(ROWS.map(r=>r.lens)), "all lenses");
opt(document.getElementById("grade"), ["ungraded","false-positive","confirmed", ...SEVS], "any grade");
}
function curve(){
const c = {};
for (const s of SEVS) c[s] = parseFloat(document.getElementById("p_"+s).value) || 0;
return c;
}
function fpMult(){ const v = parseFloat(document.getElementById("fp_mult").value); return isNaN(v) ? 0 : v; }
function soloBonus(){ const v = parseFloat(document.getElementById("solo_bonus").value); return isNaN(v) ? 1 : v; }
function soloErr(){ const v = parseFloat(document.getElementById("solo_err").value); return isNaN(v) ? 1 : v; }
// A false positive has no graded severity, so penalize it by the severity the
// MODEL claimed — its lens verdict (raw_severity) — mapped onto the curve. The
// louder the wrong cry, the bigger the penalty.
function rawToSevKey(raw){
const s = (raw||"").toLowerCase();
if (s.includes("blocking")) return "high";
if (s.includes("minor")) return "small";
if (s.includes("no material")) return "trivial";
return "medium"; // unknown / "Reviewed"
}
function filters(){
return {
from: document.getElementById("from").value,
to: document.getElementById("to").value,
repo: document.getElementById("repo").value,
provider: document.getElementById("provider").value,
model: document.getElementById("model").value,
lens: document.getElementById("lens").value,
grade: document.getElementById("grade").value,
q: document.getElementById("q").value.trim().toLowerCase(),
};
}
function dateOK(ts, f){ const d = (ts||"").slice(0,10); return (!f.from || d >= f.from) && (!f.to || d <= f.to); }
// run-level filters only (date/repo/provider/model) — severity/lens/search are finding-level.
function runMatch(r, f){
return dateOK(r.created_at, f) && (!f.repo || r.repo===f.repo) &&
(!f.provider || r.provider===f.provider) && (!f.model || r.model===f.model);
}
function gradeMatch(row, g){
if (!g) return true;
if (g === "ungraded") return !row.graded;
if (g === "false-positive") return row.graded && row.is_real === false;
if (g === "confirmed") return row.graded && row.is_real === true;
return row.graded && row.is_real === true && row.severity === g; // a specific severity
}
function rowMatch(row, f){
if (!dateOK(row.reported_at, f)) return false;
if (f.repo && row.repo!==f.repo) return false;
if (f.provider && row.provider!==f.provider) return false;
if (f.model && row.model!==f.model) return false;
if (f.lens && row.lens!==f.lens) return false;
if (!gradeMatch(row, f.grade)) return false;
if (f.q && !((row.title||"")+" "+(row.file||"")+" "+(row.repo||"")).toLowerCase().includes(f.q)) return false;
return true;
}
function aggregate(f){
const c = curve();
// GLOBAL reporter set per finding (ignores filters) — a finding is "solo" when
// exactly one model ever reported it, so the model filter can't fake solo-ness.
const reporters = new Map();
for (const r of ROWS){ if(!reporters.has(r.finding_id)) reporters.set(r.finding_id, new Set()); reporters.get(r.finding_id).add(r.model); }
const M = new Map();
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()}); return M.get(m); };
for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
const rows = ROWS.filter(r => rowMatch(r, f));
for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
m.findings.add(r.finding_id);
if (r.graded && r.is_real === true){ m.confirmed.set(r.finding_id, r.severity || ""); }
else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
else { m.ungraded.add(r.finding_id); }
}
const fpm = fpMult(), sb = soloBonus(), se = soloErr();
const out = [...M.values()].map(m => {
const sevCounts = Object.fromEntries(SEVS.map(s=>[s,0]));
let confirmedPoints = 0, solo = 0;
for (const [fid, sevv] of m.confirmed){
if (sevCounts[sevv] !== undefined) sevCounts[sevv]++;
const isSolo = (reporters.get(fid)?.size || 1) === 1; // only this model ever reported it
if (isSolo) solo++;
confirmedPoints += (c[sevv] || 0) * (isSolo ? sb : 1);
}
let fpPen = 0; for (const [fid, k] of m.fp){ const soloE = (reporters.get(fid)?.size || 1) === 1; fpPen += (c[k]||0) * fpm * (soloE ? se : 1); } // solo (unique) errors penalized extra
const points = confirmedPoints + fpPen; // NET: solo-boosted confirmed + FP penalty
const findings = m.findings.size, confirmed = m.confirmed.size;
return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
inTok:m.inTok, outTok:m.outTok, findings, confirmed, solo, fp:m.fp.size, ungraded:m.ungraded.size,
sev:sevCounts, confirmedPoints, fpPen, points,
ptsPerMin: m.minutes>0 ? points/m.minutes : null,
ptsPerRun: m.runs>0 ? points/m.runs : null,
confirmedPct: findings>0 ? confirmed/findings*100 : null };
}).filter(m => m.runs>0 || m.findings>0);
return { models: out, rows };
}
const COLS = [
{k:"model", t:"model", l:true}, {k:"provider", t:"provider", l:true},
{k:"runs", t:"runs"}, {k:"minutes", t:"min", fmt:v=>v.toFixed(1)},
{k:"findings", t:"findings"}, {k:"confirmed", t:"real"}, {k:"solo", t:"solo"}, {k:"fp", t:"FP"}, {k:"ungraded", t:"ungr"},
{k:"confirmedPct", t:"real%", fmt:v=>v==null?"—":v.toFixed(0)+"%"},
{k:"fpPen", t:"fp pen", fmt:v=>v?v.toFixed(1):"0"},
{k:"points", t:"points (net)", fmt:v=>v.toFixed(0)},
{k:"ptsPerMin", t:"pts/min", fmt:v=>v==null?"—":v.toFixed(2)},
{k:"ptsPerRun", t:"pts/run", fmt:v=>v==null?"—":v.toFixed(1)},
{k:"sev", t:"by severity", l:true, fmt:sev=>SEVS.filter(s=>sev[s]).map(s=>`<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${sev[s]}</span>`).join(" ")||"—"},
];
function render(){
const f = filters();
const { models, rows } = aggregate(f);
models.sort((a,b)=>{
let x=a[sortKey], y=b[sortKey];
if (sortKey==="model"||sortKey==="provider"){ x=x||""; y=y||""; return sortAsc ? x.localeCompare(y) : y.localeCompare(x); }
x = x==null?-1:x; y = y==null?-1:y; return sortAsc ? x-y : y-x;
});
// header
const hh = document.getElementById("mhead"); hh.innerHTML = "";
for (const col of COLS){
const th = document.createElement("th"); th.textContent = col.t; if (col.l) th.className="l";
if (col.k===sortKey){ th.classList.add("active"); if(sortAsc) th.classList.add("asc"); }
th.onclick = ()=>{ if(sortKey===col.k) sortAsc=!sortAsc; else { sortKey=col.k; sortAsc=false; } render(); };
hh.appendChild(th);
}
// body
const mb = document.getElementById("mbody"); mb.innerHTML = "";
for (const m of models){
const tr = document.createElement("tr"); if (m.model===selModel) tr.className="sel";
tr.onclick = ()=>{ selModel = (selModel===m.model? null : m.model); render(); };
for (const col of COLS){
const td = document.createElement("td"); if (col.l) td.className="l";
const v = m[col.k];
td.innerHTML = col.fmt ? col.fmt(v) : (v==null?"—":v);
if ((col.k==="ptsPerMin" || col.k==="ptsPerRun" || col.k==="points") && v!=null) td.classList.add(v<0 ? "bad" : "good");
if (col.k==="fpPen" && v<0) td.classList.add("bad");
if (col.k==="solo" && v>0) td.classList.add("good");
if (col.k==="fp" && v>0) td.classList.add("warn");
tr.appendChild(td);
}
mb.appendChild(tr);
}
const tot = models.reduce((a,m)=>({runs:a.runs+m.runs, min:a.min+m.minutes, find:a.find+m.findings, conf:a.conf+m.confirmed, pts:a.pts+m.points}), {runs:0,min:0,find:0,conf:0,pts:0});
document.getElementById("summary").innerHTML =
`${models.length} models · ${tot.runs} runs · ${tot.min.toFixed(0)} min · ${tot.find} findings · ${tot.conf} confirmed · ${tot.pts.toFixed(0)} pts` +
(selModel ? ` · <b>scoped to ${selModel}</b> <span class="pill" onclick="event.stopPropagation();selModel=null;render()">clear</span>` : "");
// detail
const det = selModel ? rows.filter(r=>r.model===selModel) : rows;
const fb = document.getElementById("fbody"); fb.innerHTML = "";
const cap = 1000;
for (const r of det.slice(0, cap)){
const tr = document.createElement("tr");
const grade = !r.graded ? '<span class="mut">ungraded</span>'
: (r.is_real ? `<span class="sev" style="background:${SEVCOLOR[r.severity]||'#333'}">${r.severity||'real'}</span>` : '<span class="bad">false-pos</span>');
tr.innerHTML =
`<td class="l mut">${(r.reported_at||"").slice(0,10)}</td><td class="l">${esc(r.repo)}</td><td>${r.pr||""}</td>`+
`<td class="l">${esc(r.lens)}</td><td class="l">${esc(r.file)}${r.line?":"+r.line:""}</td>`+
`<td class="l">${esc(r.title)}</td><td class="l">${esc(r.model)}</td><td class="l">${grade}</td>`+
`<td class="l mut">${esc(r.grader||"")}</td>`;
fb.appendChild(tr);
}
document.getElementById("detcount").textContent =
`${det.length} finding-report${det.length===1?"":"s"}` + (det.length>cap?` (showing ${cap})`:"");
}
function esc(s){ return (s==null?"":String(s)).replace(/[&<>]/g, m=>({"&":"&amp;","<":"&lt;",">":"&gt;"}[m])); }
function resetFilters(){
for (const id of ["from","to","q"]) document.getElementById(id).value="";
for (const id of ["repo","provider","model","lens","grade"]) document.getElementById(id).value="";
selModel = null; render();
}
document.addEventListener("input", e=>{
if (e.target.closest("main")) render();
});
load();
</script>
</body>
</html>