feat(ui): false-positive penalty (severity-scaled, default -0.5)
Adds an editable 'false-positive penalty ×' to the dashboard. A false positive carries no graded severity, so it's penalized by the severity the model CLAIMED (its lens verdict / raw_severity, mapped onto the curve: Blocking->high, Minor->small). points(net) = confirmed points + Σ penalty×points[claimed], so a model with a few good finds but many false positives nets down — even negative — and sorts to the bottom. Adds an 'fp pen' column; net points/pts-min/pts-run shown red when negative. Client-side only; the store stays point-free. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -131,6 +131,12 @@ points curve (default `trivial=1, small=3, medium=5, high=8, critical=20`) and c
|
|||||||
`points = Σ weight[severity]·count` and `value/min = points / minutes` on the fly — retune it without
|
`points = Σ weight[severity]·count` and `value/min = points / minutes` on the fly — retune it without
|
||||||
touching stored data.
|
touching stored data.
|
||||||
|
|
||||||
|
There's also an editable **false-positive penalty ×** (default `-0.5`). A false positive has no
|
||||||
|
graded severity, so it's penalized by the severity the model **claimed** (its lens verdict —
|
||||||
|
Blocking→high, Minor→small, "no material"→trivial, anything else→medium): `penalty × points[claimed]`. So a Blocking-claimed FP at `-0.5` costs
|
||||||
|
`high(8) × -0.5 = -4`, and a model with the odd good find but many false positives nets *down* —
|
||||||
|
even negative — instead of coasting on its hits.
|
||||||
|
|
||||||
Auth: the `/ui` shell is public (it holds no data); paste the store token into its **connect** box,
|
Auth: the `/ui` shell is public (it holds no data); paste the store token into its **connect** box,
|
||||||
or open `/ui?token=<token>` once (remembered in `localStorage`). Prefer your own dashboard? Point
|
or open `/ui?token=<token>` once (remembered in `localStorage`). Prefer your own dashboard? Point
|
||||||
Grafana/Metabase/etc. at the SQLite file or the same `/export` + `/scoreboard` + `/runs` JSON.
|
Grafana/Metabase/etc. at the SQLite file or the same `/export` + `/scoreboard` + `/runs` JSON.
|
||||||
|
|||||||
@@ -79,6 +79,7 @@
|
|||||||
<span class="small mut">medium</span><input type="number" id="p_medium" value="5">
|
<span class="small mut">medium</span><input type="number" id="p_medium" value="5">
|
||||||
<span class="small mut">high</span><input type="number" id="p_high" value="8">
|
<span class="small mut">high</span><input type="number" id="p_high" value="8">
|
||||||
<span class="small mut">critical</span><input type="number" id="p_critical" value="20">
|
<span class="small mut">critical</span><input type="number" id="p_critical" value="20">
|
||||||
|
<span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -164,6 +165,17 @@ function curve(){
|
|||||||
for (const s of SEVS) c[s] = parseFloat(document.getElementById("p_"+s).value) || 0;
|
for (const s of SEVS) c[s] = parseFloat(document.getElementById("p_"+s).value) || 0;
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
/**
 * Read the false-positive penalty multiplier from the "fp_mult" input.
 *
 * @returns {number} The parsed field value, or 0 when the field's text does
 *   not parse as a number (so an empty/invalid field applies no penalty).
 */
function fpMult(){
  const field = document.getElementById("fp_mult");
  const parsed = Number.parseFloat(field.value);
  if (Number.isNaN(parsed)) {
    return 0;
  }
  return parsed;
}
|
||||||
|
// A false positive has no graded severity, so penalize it by the severity the
|
||||||
|
// MODEL claimed — its lens verdict (raw_severity) — mapped onto the curve. The
|
||||||
|
// louder the wrong cry, the bigger the penalty.
|
||||||
|
/**
 * Map a model's claimed severity text onto a key of the points curve.
 *
 * Matching is case-insensitive and by substring, checked in this order:
 * "blocking" -> "high", "minor" -> "small", "no material" -> "trivial".
 * Anything else falls back to "medium".
 *
 * @param {*} raw - The claimed severity text. Non-string values (null,
 *   undefined, numbers, objects) are treated as unknown rather than
 *   throwing, so one malformed row cannot abort the caller.
 * @returns {"high"|"small"|"trivial"|"medium"} The curve key to score with.
 */
function rawToSevKey(raw){
  // Only strings carry a verdict; calling .toLowerCase() on a truthy
  // non-string (e.g. a number) would raise a TypeError.
  const s = typeof raw === "string" ? raw.toLowerCase() : "";
  if (s.includes("blocking")) return "high";
  if (s.includes("minor")) return "small";
  if (s.includes("no material")) return "trivial";
  return "medium"; // unknown / "Reviewed"
}
|
||||||
function filters(){
|
function filters(){
|
||||||
return {
|
return {
|
||||||
from: document.getElementById("from").value,
|
from: document.getElementById("from").value,
|
||||||
@@ -204,7 +216,7 @@ function aggregate(f){
|
|||||||
const c = curve();
|
const c = curve();
|
||||||
const M = new Map();
|
const M = new Map();
|
||||||
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
|
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
|
||||||
findings:new Set(), confirmed:new Set(), fp:new Set(), ungraded:new Set(), sev:Object.fromEntries(SEVS.map(s=>[s,new Set()]))}); return M.get(m); };
|
findings:new Set(), confirmed:new Set(), fp:new Map(), ungraded:new Set(), sev:Object.fromEntries(SEVS.map(s=>[s,new Set()]))}); return M.get(m); };
|
||||||
|
|
||||||
for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
|
for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
|
||||||
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
|
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
|
||||||
@@ -213,17 +225,20 @@ function aggregate(f){
|
|||||||
for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
|
for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
|
||||||
m.findings.add(r.finding_id);
|
m.findings.add(r.finding_id);
|
||||||
if (r.graded && r.is_real === true){ m.confirmed.add(r.finding_id); if (r.severity) m.sev[r.severity].add(r.finding_id); }
|
if (r.graded && r.is_real === true){ m.confirmed.add(r.finding_id); if (r.severity) m.sev[r.severity].add(r.finding_id); }
|
||||||
else if (r.graded && r.is_real === false){ m.fp.add(r.finding_id); }
|
else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
|
||||||
else { m.ungraded.add(r.finding_id); }
|
else { m.ungraded.add(r.finding_id); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const fpm = fpMult();
|
||||||
const out = [...M.values()].map(m => {
|
const out = [...M.values()].map(m => {
|
||||||
const sevCounts = Object.fromEntries(SEVS.map(s=>[s, m.sev[s].size]));
|
const sevCounts = Object.fromEntries(SEVS.map(s=>[s, m.sev[s].size]));
|
||||||
const points = SEVS.reduce((a,s)=> a + c[s]*sevCounts[s], 0);
|
const confirmedPoints = SEVS.reduce((a,s)=> a + c[s]*sevCounts[s], 0);
|
||||||
|
let fpPen = 0; for (const k of m.fp.values()) fpPen += (c[k]||0) * fpm; // negative when fpm<0
|
||||||
|
const points = confirmedPoints + fpPen; // NET of the false-positive penalty
|
||||||
const findings = m.findings.size, confirmed = m.confirmed.size;
|
const findings = m.findings.size, confirmed = m.confirmed.size;
|
||||||
return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
|
return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
|
||||||
inTok:m.inTok, outTok:m.outTok, findings, confirmed, fp:m.fp.size, ungraded:m.ungraded.size,
|
inTok:m.inTok, outTok:m.outTok, findings, confirmed, fp:m.fp.size, ungraded:m.ungraded.size,
|
||||||
sev:sevCounts, points,
|
sev:sevCounts, confirmedPoints, fpPen, points,
|
||||||
ptsPerMin: m.minutes>0 ? points/m.minutes : null,
|
ptsPerMin: m.minutes>0 ? points/m.minutes : null,
|
||||||
ptsPerRun: m.runs>0 ? points/m.runs : null,
|
ptsPerRun: m.runs>0 ? points/m.runs : null,
|
||||||
confirmedPct: findings>0 ? confirmed/findings*100 : null };
|
confirmedPct: findings>0 ? confirmed/findings*100 : null };
|
||||||
@@ -236,7 +251,8 @@ const COLS = [
|
|||||||
{k:"runs", t:"runs"}, {k:"minutes", t:"min", fmt:v=>v.toFixed(1)},
|
{k:"runs", t:"runs"}, {k:"minutes", t:"min", fmt:v=>v.toFixed(1)},
|
||||||
{k:"findings", t:"findings"}, {k:"confirmed", t:"real"}, {k:"fp", t:"FP"}, {k:"ungraded", t:"ungr"},
|
{k:"findings", t:"findings"}, {k:"confirmed", t:"real"}, {k:"fp", t:"FP"}, {k:"ungraded", t:"ungr"},
|
||||||
{k:"confirmedPct", t:"real%", fmt:v=>v==null?"—":v.toFixed(0)+"%"},
|
{k:"confirmedPct", t:"real%", fmt:v=>v==null?"—":v.toFixed(0)+"%"},
|
||||||
{k:"points", t:"points", fmt:v=>v.toFixed(0)},
|
{k:"fpPen", t:"fp pen", fmt:v=>v?v.toFixed(1):"0"},
|
||||||
|
{k:"points", t:"points (net)", fmt:v=>v.toFixed(0)},
|
||||||
{k:"ptsPerMin", t:"pts/min", fmt:v=>v==null?"—":v.toFixed(2)},
|
{k:"ptsPerMin", t:"pts/min", fmt:v=>v==null?"—":v.toFixed(2)},
|
||||||
{k:"ptsPerRun", t:"pts/run", fmt:v=>v==null?"—":v.toFixed(1)},
|
{k:"ptsPerRun", t:"pts/run", fmt:v=>v==null?"—":v.toFixed(1)},
|
||||||
{k:"sev", t:"by severity", l:true, fmt:sev=>SEVS.filter(s=>sev[s]).map(s=>`<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${sev[s]}</span>`).join(" ")||"—"},
|
{k:"sev", t:"by severity", l:true, fmt:sev=>SEVS.filter(s=>sev[s]).map(s=>`<span class="sev" style="background:${SEVCOLOR[s]}">${s[0].toUpperCase()}${sev[s]}</span>`).join(" ")||"—"},
|
||||||
@@ -268,8 +284,9 @@ function render(){
|
|||||||
const td = document.createElement("td"); if (col.l) td.className="l";
|
const td = document.createElement("td"); if (col.l) td.className="l";
|
||||||
const v = m[col.k];
|
const v = m[col.k];
|
||||||
td.innerHTML = col.fmt ? col.fmt(v) : (v==null?"—":v);
|
td.innerHTML = col.fmt ? col.fmt(v) : (v==null?"—":v);
|
||||||
if (col.k==="ptsPerMin" && v!=null) td.classList.add("good");
|
if ((col.k==="ptsPerMin" || col.k==="ptsPerRun" || col.k==="points") && v!=null) td.classList.add(v<0 ? "bad" : "good");
|
||||||
if (col.k==="fp" && v>0) td.classList.add("bad");
|
if (col.k==="fpPen" && v<0) td.classList.add("bad");
|
||||||
|
if (col.k==="fp" && v>0) td.classList.add("warn");
|
||||||
tr.appendChild(td);
|
tr.appendChild(td);
|
||||||
}
|
}
|
||||||
mb.appendChild(tr);
|
mb.appendChild(tr);
|
||||||
|
|||||||
Reference in New Issue
Block a user