Files
steve ddcf42a3ce
Build & push image / build-and-push (push) Successful in 1m13s
CI / test (push) Successful in 10m39s
feat: gadfly-reports — findings store + scoreboard daemon
SQLite-backed HTTP store for Gadfly review findings, per-review run timings, and human/Claude grades, with a points-free per-model scoreboard. Pure fact store: it computes no points or rankings (the dashboard maps severity->points client-side and retunes without re-scoring). Findings are content-addressed by location so cross-model reports collapse for consensus; one grade per finding, latest wins. Pure-Go SQLite (CGO-free) + Docker image CI + tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 23:55:24 -04:00

133 lines
4.9 KiB
Go

package main
import (
"path/filepath"
"testing"
)
// testStore opens a Store on a database file named "gadfly-reports.db" inside
// the test's own temporary directory and returns it. Failing to open aborts the
// calling test; the store is closed by a cleanup registered on t.
func testStore(t *testing.T) *Store {
	t.Helper()

	dbPath := filepath.Join(t.TempDir(), "gadfly-reports.db")
	store, openErr := Open(dbPath)
	if openErr != nil {
		t.Fatalf("open: %v", openErr)
	}

	// Close runs after the test (and its subtests) complete.
	t.Cleanup(func() {
		store.Close()
	})
	return store
}
// i64 returns a pointer to a fresh int64 holding v, so pointer-typed fields
// can be set inline in struct literals.
func i64(v int64) *int64 {
	p := new(int64)
	*p = v
	return p
}
// intp returns a pointer to a fresh int holding v, so pointer-typed fields
// can be set inline in struct literals.
func intp(v int) *int {
	p := new(int)
	*p = v
	return p
}
// TestConsensusAndGrade: two models reporting the SAME location collapse to one
// finding with two reports; a single grade applies to both models' scoreboards.
// It also checks that each model's run duration (as minutes) and input tokens
// show up on its scoreboard row.
func TestConsensusAndGrade(t *testing.T) {
	s := testStore(t)

	// One run per model: 300s is expected to surface as 5 minutes, 1740s as 29.
	if err := s.AddRun(Run{RunID: "r-cloud", Repo: "steve/x", PR: 2, Model: "minimax", Provider: "ollama-cloud", Lenses: 3, DurationSecs: 300, InputTokens: i64(1000), OutputTokens: i64(500)}); err != nil {
		t.Fatal(err)
	}
	if err := s.AddRun(Run{RunID: "r-m1", Repo: "steve/x", PR: 2, Model: "qwen3", Provider: "m1", Lenses: 3, DurationSecs: 1740}); err != nil {
		t.Fatal(err)
	}

	// Both models flag the same file:line under the same lens; only the titles differ.
	ids, err := s.AddReports([]ReportIn{
		{Repo: "steve/x", PR: 2, Lens: "correctness", File: "run/executor.go", Line: 166, Title: "SetIteration never called", Model: "minimax", Provider: "ollama-cloud", RunID: "r-cloud", RawSeverity: "Blocking"},
		{Repo: "steve/x", PR: 2, Lens: "correctness", File: "run/executor.go", Line: 166, Title: "iteration counter dead", Model: "qwen3", Provider: "m1", RunID: "r-m1", RawSeverity: "Blocking"},
	})
	if err != nil {
		t.Fatal(err)
	}
	// Guard the indexing below so a short result fails the test instead of panicking.
	if len(ids) < 2 {
		t.Fatalf("AddReports returned %d ids, want one per report (2)", len(ids))
	}
	if ids[0] != ids[1] {
		t.Fatalf("same location should collapse to one finding id, got %q and %q", ids[0], ids[1])
	}

	// A single grade on the shared finding.
	if err := s.AddGrade(Grade{FindingID: ids[0], IsReal: true, Severity: "high", Usefulness: intp(4), Grader: "claude"}); err != nil {
		t.Fatal(err)
	}

	board, err := s.Scoreboard()
	if err != nil {
		t.Fatal(err)
	}
	byModel := make(map[string]ModelStat, len(board))
	for _, m := range board {
		byModel[m.Model] = m
	}

	for _, name := range []string{"minimax", "qwen3"} {
		m, ok := byModel[name]
		if !ok {
			// Report absence explicitly rather than as zero-valued counters.
			t.Errorf("%s: missing from scoreboard", name)
			continue
		}
		if m.Findings != 1 || m.Confirmed != 1 || m.BySeverity["high"] != 1 {
			t.Errorf("%s: findings=%d confirmed=%d high=%d, want 1/1/1", name, m.Findings, m.Confirmed, m.BySeverity["high"])
		}
	}
	if got := byModel["minimax"].Minutes; got != 5 {
		t.Errorf("minimax minutes = %v, want 5", got)
	}
	if got := byModel["qwen3"].Minutes; got != 29 {
		t.Errorf("qwen3 minutes = %v, want 29", got)
	}
	if got := byModel["minimax"].InputTokens; got != 1000 {
		t.Errorf("minimax input_tokens = %d, want 1000", got)
	}
}
// TestLatestGradeWins: a re-grade supersedes the prior one everywhere.
// A finding is first graded real/critical, then re-graded as a false positive;
// the scoreboard row must then show no confirmed finding, one false positive,
// and nothing counted under "critical".
func TestLatestGradeWins(t *testing.T) {
	s := testStore(t)
	if err := s.AddRun(Run{RunID: "r1", Repo: "r", PR: 1, Model: "m", Provider: "p", DurationSecs: 60}); err != nil {
		t.Fatal(err)
	}

	ids, err := s.AddReports([]ReportIn{{Repo: "r", PR: 1, Lens: "security", File: "a.go", Line: 5, Title: "x", Model: "m", Provider: "p", RunID: "r1"}})
	if err != nil {
		t.Fatal(err)
	}
	// Guard the indexing so an empty result fails the test instead of panicking.
	if len(ids) == 0 {
		t.Fatal("AddReports returned no ids, want 1")
	}
	id := ids[0]

	if err := s.AddGrade(Grade{FindingID: id, IsReal: true, Severity: "critical"}); err != nil {
		t.Fatal(err)
	}
	if err := s.AddGrade(Grade{FindingID: id, IsReal: false}); err != nil { // re-graded as a false positive
		t.Fatal(err)
	}

	board, err := s.Scoreboard()
	if err != nil {
		t.Fatal(err)
	}
	if len(board) == 0 {
		t.Fatal("scoreboard is empty, want a row for the reporting model")
	}
	m := board[0]
	if m.Confirmed != 0 || m.FalsePositive != 1 || m.BySeverity["critical"] != 0 {
		t.Errorf("after re-grade: confirmed=%d fp=%d critical=%d, want 0/1/0", m.Confirmed, m.FalsePositive, m.BySeverity["critical"])
	}
}
// TestGradeValidation rejects bad severity / usefulness / unknown finding.
// It also confirms that a false-positive grade is accepted with no severity.
func TestGradeValidation(t *testing.T) {
	s := testStore(t)

	// Setup must succeed; otherwise the validation checks below are meaningless.
	ids, err := s.AddReports([]ReportIn{{Repo: "r", PR: 1, Lens: "perf", File: "a.go", Line: 1, Title: "t", Model: "m", Provider: "p", RunID: "r1"}})
	if err != nil {
		t.Fatal(err)
	}
	if len(ids) == 0 {
		t.Fatal("AddReports returned no ids, want 1")
	}
	id := ids[0]

	if err := s.AddGrade(Grade{FindingID: id, IsReal: true, Severity: "huge"}); err == nil {
		t.Error("expected error for invalid severity")
	}
	if err := s.AddGrade(Grade{FindingID: id, IsReal: true, Severity: "high", Usefulness: intp(9)}); err == nil {
		t.Error("expected error for out-of-range usefulness")
	}
	if err := s.AddGrade(Grade{FindingID: "nope", IsReal: true, Severity: "high"}); err == nil {
		t.Error("expected error for unknown finding")
	}
	// A false positive needs no severity.
	if err := s.AddGrade(Grade{FindingID: id, IsReal: false}); err != nil {
		t.Errorf("false positive without severity should be valid: %v", err)
	}
}
// TestFindingIDLocationKeyed: id depends on location, not wording; line matters.
// The lens is part of the key as well.
func TestFindingIDLocationKeyed(t *testing.T) {
	base := findingID("r", 1, "security", "a.go", 10)

	// findingID takes no title argument, so repeating the same location stands
	// in for "same place, different wording".
	if repeat := findingID("r", 1, "security", "a.go", 10); repeat != base {
		t.Error("same location must yield same id regardless of wording")
	}

	if otherLine := findingID("r", 1, "security", "a.go", 11); otherLine == base {
		t.Error("different line must yield different id")
	}

	if otherLens := findingID("r", 1, "correctness", "a.go", 10); otherLens == base {
		t.Error("different lens must yield different id")
	}
}