ddcf42a3ce
SQLite-backed HTTP store for Gadfly review findings, per-review run timings, and human/Claude grades, with a points-free per-model scoreboard. Pure fact store: it computes no points or rankings (the dashboard maps severity->points client-side and retunes without re-scoring). Findings are content-addressed by location so cross-model reports collapse for consensus; one grade per finding, latest wins. Pure-Go SQLite (CGO-free) + Docker image CI + tests. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
133 lines
4.9 KiB
Go
133 lines
4.9 KiB
Go
package main
|
|
|
|
import (
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
func testStore(t *testing.T) *Store {
|
|
t.Helper()
|
|
s, err := Open(filepath.Join(t.TempDir(), "gadfly-reports.db"))
|
|
if err != nil {
|
|
t.Fatalf("open: %v", err)
|
|
}
|
|
t.Cleanup(func() { s.Close() })
|
|
return s
|
|
}
|
|
|
|
func i64(v int64) *int64 { return &v }
|
|
func intp(v int) *int { return &v }
|
|
|
|
// TestConsensusAndGrade: two models reporting the SAME location collapse to one
|
|
// finding with two reports; a single grade applies to both models' scoreboards.
|
|
func TestConsensusAndGrade(t *testing.T) {
|
|
s := testStore(t)
|
|
|
|
if err := s.AddRun(Run{RunID: "r-cloud", Repo: "steve/x", PR: 2, Model: "minimax", Provider: "ollama-cloud", Lenses: 3, DurationSecs: 300, InputTokens: i64(1000), OutputTokens: i64(500)}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := s.AddRun(Run{RunID: "r-m1", Repo: "steve/x", PR: 2, Model: "qwen3", Provider: "m1", Lenses: 3, DurationSecs: 1740}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Both models flag the same file:line under the same lens.
|
|
ids, err := s.AddReports([]ReportIn{
|
|
{Repo: "steve/x", PR: 2, Lens: "correctness", File: "run/executor.go", Line: 166, Title: "SetIteration never called", Model: "minimax", Provider: "ollama-cloud", RunID: "r-cloud", RawSeverity: "Blocking"},
|
|
{Repo: "steve/x", PR: 2, Lens: "correctness", File: "run/executor.go", Line: 166, Title: "iteration counter dead", Model: "qwen3", Provider: "m1", RunID: "r-m1", RawSeverity: "Blocking"},
|
|
})
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if ids[0] != ids[1] {
|
|
t.Fatalf("same location should collapse to one finding id, got %q and %q", ids[0], ids[1])
|
|
}
|
|
|
|
if err := s.AddGrade(Grade{FindingID: ids[0], IsReal: true, Severity: "high", Usefulness: intp(4), Grader: "claude"}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
board, err := s.Scoreboard()
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
byModel := map[string]ModelStat{}
|
|
for _, m := range board {
|
|
byModel[m.Model] = m
|
|
}
|
|
for _, name := range []string{"minimax", "qwen3"} {
|
|
m := byModel[name]
|
|
if m.Findings != 1 || m.Confirmed != 1 || m.BySeverity["high"] != 1 {
|
|
t.Errorf("%s: findings=%d confirmed=%d high=%d, want 1/1/1", name, m.Findings, m.Confirmed, m.BySeverity["high"])
|
|
}
|
|
}
|
|
if got := byModel["minimax"].Minutes; got != 5 {
|
|
t.Errorf("minimax minutes = %v, want 5", got)
|
|
}
|
|
if got := byModel["qwen3"].Minutes; got != 29 {
|
|
t.Errorf("qwen3 minutes = %v, want 29", got)
|
|
}
|
|
if got := byModel["minimax"].InputTokens; got != 1000 {
|
|
t.Errorf("minimax input_tokens = %d, want 1000", got)
|
|
}
|
|
}
|
|
|
|
// TestLatestGradeWins: a re-grade supersedes the prior one everywhere.
|
|
func TestLatestGradeWins(t *testing.T) {
|
|
s := testStore(t)
|
|
if err := s.AddRun(Run{RunID: "r1", Repo: "r", PR: 1, Model: "m", Provider: "p", DurationSecs: 60}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
ids, err := s.AddReports([]ReportIn{{Repo: "r", PR: 1, Lens: "security", File: "a.go", Line: 5, Title: "x", Model: "m", Provider: "p", RunID: "r1"}})
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
id := ids[0]
|
|
if err := s.AddGrade(Grade{FindingID: id, IsReal: true, Severity: "critical"}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := s.AddGrade(Grade{FindingID: id, IsReal: false}); err != nil { // re-graded as a false positive
|
|
t.Fatal(err)
|
|
}
|
|
board, _ := s.Scoreboard()
|
|
m := board[0]
|
|
if m.Confirmed != 0 || m.FalsePositive != 1 || m.BySeverity["critical"] != 0 {
|
|
t.Errorf("after re-grade: confirmed=%d fp=%d critical=%d, want 0/1/0", m.Confirmed, m.FalsePositive, m.BySeverity["critical"])
|
|
}
|
|
}
|
|
|
|
// TestGradeValidation rejects bad severity / usefulness / unknown finding.
|
|
func TestGradeValidation(t *testing.T) {
|
|
s := testStore(t)
|
|
ids, _ := s.AddReports([]ReportIn{{Repo: "r", PR: 1, Lens: "perf", File: "a.go", Line: 1, Title: "t", Model: "m", Provider: "p", RunID: "r1"}})
|
|
id := ids[0]
|
|
|
|
if err := s.AddGrade(Grade{FindingID: id, IsReal: true, Severity: "huge"}); err == nil {
|
|
t.Error("expected error for invalid severity")
|
|
}
|
|
if err := s.AddGrade(Grade{FindingID: id, IsReal: true, Severity: "high", Usefulness: intp(9)}); err == nil {
|
|
t.Error("expected error for out-of-range usefulness")
|
|
}
|
|
if err := s.AddGrade(Grade{FindingID: "nope", IsReal: true, Severity: "high"}); err == nil {
|
|
t.Error("expected error for unknown finding")
|
|
}
|
|
// A false positive needs no severity.
|
|
if err := s.AddGrade(Grade{FindingID: id, IsReal: false}); err != nil {
|
|
t.Errorf("false positive without severity should be valid: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestFindingIDLocationKeyed: id depends on location, not wording; line matters.
|
|
func TestFindingIDLocationKeyed(t *testing.T) {
|
|
a := findingID("r", 1, "security", "a.go", 10)
|
|
sameWordingDiff := findingID("r", 1, "security", "a.go", 10) // any title — id ignores it
|
|
if a != sameWordingDiff {
|
|
t.Error("same location must yield same id regardless of wording")
|
|
}
|
|
if a == findingID("r", 1, "security", "a.go", 11) {
|
|
t.Error("different line must yield different id")
|
|
}
|
|
if a == findingID("r", 1, "correctness", "a.go", 10) {
|
|
t.Error("different lens must yield different id")
|
|
}
|
|
}
|