feat(ui): solo-find bonus — reward a model for catching what others missed
Build & push image / build-and-push (push) Successful in 20s
CI / test (push) Successful in 10m20s

Adds an editable 'solo-find bonus ×' (default 1.5). A confirmed finding reported by exactly one model (derived from the global reporter count per content-addressed finding — no grader flagging needed) scores severity × bonus. New 'solo' column counts uniquely-caught confirmed findings. Solo-ness is computed over ALL data so the model filter can't fake it. Client-side only; store stays point-free.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-27 12:24:29 -04:00
parent 0cb6b25f11
commit c15f860853
2 changed files with 27 additions and 8 deletions
+6
View File
@@ -137,6 +137,12 @@ Blocking→high, Minor→small): `penalty × points[claimed]`. So a Blocking-cla
`high(8) × -0.5 = -4`, and a model with the odd good find but many false positives nets *down*,
even negative — instead of coasting on its hits.
There is also an editable **solo-find bonus ×** (default `1.5`). Because findings are content-addressed, the
number of models that reported one is known, so a confirmed finding that **only that model** caught
(no other model reported it) scores `severity × bonus` — rewarding catching what the swarm missed.
The `solo` column counts those. This is derived from the data (reporter count); the grader never has
to flag it. Set the bonus to `1` to disable.
Auth: the `/ui` shell is public (it holds no data); paste the store token into its **connect** box,
or open `/ui?token=<token>` once (remembered in `localStorage`). Prefer your own dashboard? Point
Grafana/Metabase/etc. at the SQLite file or the same `/export` + `/scoreboard` + `/runs` JSON.
+21 -8
View File
@@ -80,6 +80,7 @@
<span class="small mut">high</span><input type="number" id="p_high" value="8"> <span class="small mut">high</span><input type="number" id="p_high" value="8">
<span class="small mut">critical</span><input type="number" id="p_critical" value="20"> <span class="small mut">critical</span><input type="number" id="p_critical" value="20">
<span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts."> <span class="small mut" style="margin-left:18px">false-positive penalty ×</span><input type="number" id="fp_mult" value="-0.5" step="0.5" title="A false positive scores this × the severity the model CLAIMED (its lens verdict). e.g. a Blocking-claimed FP at -0.5 = high(8) × -0.5 = -4 pts.">
<span class="small mut" style="margin-left:18px">solo-find bonus ×</span><input type="number" id="solo_bonus" value="1.5" step="0.5" min="1" title="A confirmed finding that NO other model reported scores this × its severity points — rewarding a model for catching what the swarm missed. 1 = no bonus.">
</div> </div>
</div> </div>
</div> </div>
@@ -166,6 +167,7 @@ function curve(){
return c; return c;
} }
// Reads the false-positive penalty multiplier from the #fp_mult input and
// returns it as a number; when the field does not parse as a number the
// multiplier falls back to 0.
function fpMult(){ const v = parseFloat(document.getElementById("fp_mult").value); return isNaN(v) ? 0 : v; } function fpMult(){ const v = parseFloat(document.getElementById("fp_mult").value); return isNaN(v) ? 0 : v; }
/**
 * Read the solo-find bonus multiplier from the #solo_bonus input.
 *
 * The multiplier is applied to the severity points of a confirmed finding
 * that only one model reported, so it must never drop below 1: a smaller
 * value would turn the "bonus" into a penalty (or, at 0 / negative, erase or
 * invert the points of a model's best catches).
 *
 * @returns {number} The multiplier, always a finite number >= 1. Returns 1
 *   (no bonus) when the input is missing, blank, or not a finite number.
 */
function soloBonus(){
  // Mirrors the input's min="1". The HTML attribute only drives validation
  // styling; it does not stop a smaller value from being typed and read here.
  const NO_BONUS = 1;
  const parsed = parseFloat(document.getElementById("solo_bonus")?.value);
  if (!Number.isFinite(parsed)) return NO_BONUS;
  return Math.max(NO_BONUS, parsed);
}
// A false positive has no graded severity, so penalize it by the severity the // A false positive has no graded severity, so penalize it by the severity the
// MODEL claimed — its lens verdict (raw_severity) — mapped onto the curve. The // MODEL claimed — its lens verdict (raw_severity) — mapped onto the curve. The
// louder the wrong cry, the bigger the penalty. // louder the wrong cry, the bigger the penalty.
@@ -214,9 +216,13 @@ function rowMatch(row, f){
function aggregate(f){ function aggregate(f){
const c = curve(); const c = curve();
// GLOBAL reporter set per finding (ignores filters) — a finding is "solo" when
// exactly one model ever reported it, so the model filter can't fake solo-ness.
const reporters = new Map();
for (const r of ROWS){ if(!reporters.has(r.finding_id)) reporters.set(r.finding_id, new Set()); reporters.get(r.finding_id).add(r.model); }
const M = new Map(); const M = new Map();
const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0, const get = m => { if(!M.has(m)) M.set(m, {model:m, provider:"", runs:0, minutes:0, inTok:0, outTok:0,
findings:new Set(), confirmed:new Set(), fp:new Map(), ungraded:new Set(), sev:Object.fromEntries(SEVS.map(s=>[s,new Set()]))}); return M.get(m); }; findings:new Set(), confirmed:new Map(), fp:new Map(), ungraded:new Set()}); return M.get(m); };
for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60; for (const r of RUNS){ if(!runMatch(r,f)) continue; const m=get(r.model); m.runs++; m.minutes += (r.duration_secs||0)/60;
m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; } m.inTok += r.input_tokens||0; m.outTok += r.output_tokens||0; if(r.provider) m.provider=r.provider; }
@@ -224,20 +230,26 @@ function aggregate(f){
const rows = ROWS.filter(r => rowMatch(r, f)); const rows = ROWS.filter(r => rowMatch(r, f));
for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider; for (const r of rows){ const m=get(r.model); if(r.provider) m.provider=m.provider||r.provider;
m.findings.add(r.finding_id); m.findings.add(r.finding_id);
if (r.graded && r.is_real === true){ m.confirmed.add(r.finding_id); if (r.severity) m.sev[r.severity].add(r.finding_id); } if (r.graded && r.is_real === true){ m.confirmed.set(r.finding_id, r.severity || ""); }
else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); } else if (r.graded && r.is_real === false){ m.fp.set(r.finding_id, rawToSevKey(r.raw_severity)); }
else { m.ungraded.add(r.finding_id); } else { m.ungraded.add(r.finding_id); }
} }
const fpm = fpMult(); const fpm = fpMult(), sb = soloBonus();
const out = [...M.values()].map(m => { const out = [...M.values()].map(m => {
const sevCounts = Object.fromEntries(SEVS.map(s=>[s, m.sev[s].size])); const sevCounts = Object.fromEntries(SEVS.map(s=>[s,0]));
const confirmedPoints = SEVS.reduce((a,s)=> a + c[s]*sevCounts[s], 0); let confirmedPoints = 0, solo = 0;
for (const [fid, sevv] of m.confirmed){
if (sevCounts[sevv] !== undefined) sevCounts[sevv]++;
const isSolo = (reporters.get(fid)?.size || 1) === 1; // only this model ever reported it
if (isSolo) solo++;
confirmedPoints += (c[sevv] || 0) * (isSolo ? sb : 1);
}
let fpPen = 0; for (const k of m.fp.values()) fpPen += (c[k]||0) * fpm; // negative when fpm<0 let fpPen = 0; for (const k of m.fp.values()) fpPen += (c[k]||0) * fpm; // negative when fpm<0
const points = confirmedPoints + fpPen; // NET of the false-positive penalty const points = confirmedPoints + fpPen; // NET: solo-boosted confirmed + FP penalty
const findings = m.findings.size, confirmed = m.confirmed.size; const findings = m.findings.size, confirmed = m.confirmed.size;
return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes, return { model:m.model, provider:m.provider, runs:m.runs, minutes:m.minutes,
inTok:m.inTok, outTok:m.outTok, findings, confirmed, fp:m.fp.size, ungraded:m.ungraded.size, inTok:m.inTok, outTok:m.outTok, findings, confirmed, solo, fp:m.fp.size, ungraded:m.ungraded.size,
sev:sevCounts, confirmedPoints, fpPen, points, sev:sevCounts, confirmedPoints, fpPen, points,
ptsPerMin: m.minutes>0 ? points/m.minutes : null, ptsPerMin: m.minutes>0 ? points/m.minutes : null,
ptsPerRun: m.runs>0 ? points/m.runs : null, ptsPerRun: m.runs>0 ? points/m.runs : null,
@@ -249,7 +261,7 @@ function aggregate(f){
const COLS = [ const COLS = [
{k:"model", t:"model", l:true}, {k:"provider", t:"provider", l:true}, {k:"model", t:"model", l:true}, {k:"provider", t:"provider", l:true},
{k:"runs", t:"runs"}, {k:"minutes", t:"min", fmt:v=>v.toFixed(1)}, {k:"runs", t:"runs"}, {k:"minutes", t:"min", fmt:v=>v.toFixed(1)},
{k:"findings", t:"findings"}, {k:"confirmed", t:"real"}, {k:"fp", t:"FP"}, {k:"ungraded", t:"ungr"}, {k:"findings", t:"findings"}, {k:"confirmed", t:"real"}, {k:"solo", t:"solo"}, {k:"fp", t:"FP"}, {k:"ungraded", t:"ungr"},
{k:"confirmedPct", t:"real%", fmt:v=>v==null?"—":v.toFixed(0)+"%"}, {k:"confirmedPct", t:"real%", fmt:v=>v==null?"—":v.toFixed(0)+"%"},
{k:"fpPen", t:"fp pen", fmt:v=>v?v.toFixed(1):"0"}, {k:"fpPen", t:"fp pen", fmt:v=>v?v.toFixed(1):"0"},
{k:"points", t:"points (net)", fmt:v=>v.toFixed(0)}, {k:"points", t:"points (net)", fmt:v=>v.toFixed(0)},
@@ -286,6 +298,7 @@ function render(){
td.innerHTML = col.fmt ? col.fmt(v) : (v==null?"—":v); td.innerHTML = col.fmt ? col.fmt(v) : (v==null?"—":v);
if ((col.k==="ptsPerMin" || col.k==="ptsPerRun" || col.k==="points") && v!=null) td.classList.add(v<0 ? "bad" : "good"); if ((col.k==="ptsPerMin" || col.k==="ptsPerRun" || col.k==="points") && v!=null) td.classList.add(v<0 ? "bad" : "good");
if (col.k==="fpPen" && v<0) td.classList.add("bad"); if (col.k==="fpPen" && v<0) td.classList.add("bad");
if (col.k==="solo" && v>0) td.classList.add("good");
if (col.k==="fp" && v>0) td.classList.add("warn"); if (col.k==="fp" && v>0) td.classList.add("warn");
tr.appendChild(td); tr.appendChild(td);
} }