Files
executus/llmmeta/helper_test.go
T
steve b424261aca
executus CI / test (pull_request) Successful in 58s
Adversarial Review (Gadfly) / review (pull_request) Successful in 26m27s
executus CI / test (push) Successful in 1m2s
P1: model layer (convar->config inversion) + llmmeta
Lifts mort's pkg/logic/llms into executus/model, decoupled from mort:

- tiers.go: the tier resolver now reads a host-supplied config.Source under
  "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults,
  ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the
  resolution mechanism (cache, reasoning-suffix dialect, chain validation) is
  generic. No tier names hard-coded in the harness.
- sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into
  UsageSink / TraceSink seams + a model-owned Span, with nil-safe context
  attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser).
  Both sinks optional (nil = off) so a light host records nothing.
- lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf.
- call.go keeps GenerateWith[T] (instrumented structured output) — this is the
  structured-output primitive; no separate structured/ package.
- llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry
  + ledger). Its tests configure a minimal tier table via TestMain.

New tests cover the inversion: config overrides fallback, tier registration,
reasoning-suffix survival, nested-tier rejection, nil-sink no-ops.

Full module: go build/vet/test -race green; core go.sum still free of
gorm/redis/discordgo/sqlite.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 19:47:13 -04:00

283 lines
8.7 KiB
Go

package llmmeta
import (
"context"
"errors"
"strings"
"sync"
"testing"
llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// fakeStorage is the in-memory storage double handed to New in these
// tests. Every MetaCall passed to RecordMetaCall is kept, in arrival
// order, so a test can inspect what was written via snapshot.
type fakeStorage struct {
	mu    sync.Mutex // guards calls
	calls []MetaCall // everything recorded so far
	err   error      // returned as-is by RecordMetaCall
}

// RecordMetaCall stores call and returns the configured err (nil unless
// a test assigned one). The context argument is ignored.
func (fs *fakeStorage) RecordMetaCall(_ context.Context, call MetaCall) error {
	fs.mu.Lock()
	fs.calls = append(fs.calls, call)
	result := fs.err
	fs.mu.Unlock()
	return result
}

// snapshot returns a copy of the recorded calls. The returned slice has
// its own backing array, so callers may modify it freely.
func (fs *fakeStorage) snapshot() []MetaCall {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	return append(make([]MetaCall, 0, len(fs.calls)), fs.calls...)
}
// TestCall_TierNotAllowed asks for a tier that the allowlist reader does
// not return. The call must come back with a nil error, Success=false
// and ErrorKindTierNotAllowed, and the ledger must stay empty because
// no model call took place.
func TestCall_TierNotAllowed(t *testing.T) {
	ledger := &fakeStorage{}
	allowFastOnly := ConvarReaderFunc(func(context.Context) []string {
		return []string{"fast"}
	})
	helper := New(ledger, allowFastOnly)

	spec := CallSpec{
		Tier:       "thinking",
		UserPrompt: "hello",
		ToolName:   "summarize",
	}
	res, err := helper.Call(context.Background(), spec)
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}

	if res.Success {
		t.Errorf("expected Success=false")
	}
	if res.ErrorKind != ErrorKindTierNotAllowed {
		t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindTierNotAllowed)
	}
	if rows := ledger.snapshot(); len(rows) != 0 {
		t.Errorf("expected NO ledger row for tier_not_allowed, got %d", len(rows))
	}
}
// TestCall_TierAllowedHappyText drives the plain-text path on an
// allowlisted tier. The stubbed completion's text and token counts must
// appear on the result, and exactly one ledger row carrying the same
// tool name, run ID and token counts must be recorded.
func TestCall_TierAllowedHappyText(t *testing.T) {
	const wantText = "summary text here"

	ledger := &fakeStorage{}
	helper := New(ledger, ConvarReaderFunc(func(context.Context) []string {
		return []string{"fast"}
	}))

	// Swap in a canned completion and put the previous one back on exit.
	undo := SetCompleteForTest(func(context.Context, llm.Model, string, string, []llm.Option) (string, Tokens, error) {
		return wantText, Tokens{InputTokens: 50, OutputTokens: 12}, nil
	})
	defer undo()

	spec := CallSpec{
		Tier:           "fast",
		UserPrompt:     "summarise the following ...",
		ToolName:       "summarize",
		ResponseFormat: "text",
		RunID:          "run-1",
		SkillID:        "sk-1",
	}
	res, err := helper.Call(context.Background(), spec)
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}

	// Result-side expectations.
	if !res.Success {
		t.Errorf("expected Success=true; got ErrorKind=%q", res.ErrorKind)
	}
	if res.Text != wantText {
		t.Errorf("Text = %q, want %q", res.Text, wantText)
	}
	if res.InputTokens != 50 || res.OutputTokens != 12 {
		t.Errorf("token counts wrong: in=%d out=%d", res.InputTokens, res.OutputTokens)
	}

	// Ledger-side expectations.
	rows := ledger.snapshot()
	if len(rows) != 1 {
		t.Fatalf("expected 1 ledger row, got %d", len(rows))
	}
	row := rows[0]
	if !row.Success {
		t.Errorf("ledger Success = false, want true")
	}
	if row.ToolName != "summarize" {
		t.Errorf("ledger ToolName = %q", row.ToolName)
	}
	if row.RunID != "run-1" {
		t.Errorf("ledger RunID = %q", row.RunID)
	}
	if row.InputTokens != 50 || row.OutputTokens != 12 {
		t.Errorf("ledger token counts wrong: in=%d out=%d",
			row.InputTokens, row.OutputTokens)
	}
}
// TestCall_JSONFirstAttemptParses requests JSON output and has the
// stubbed completion return valid JSON on the first try. The call must
// succeed with no ErrorKind and expose the decoded object on
// result.Parsed as a map[string]any.
func TestCall_JSONFirstAttemptParses(t *testing.T) {
	store := &fakeStorage{}
	h := New(store, nil)

	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
		return `{"foo":"bar","n":42}`, Tokens{InputTokens: 10, OutputTokens: 5}, nil
	})
	defer restore()

	res, err := h.Call(context.Background(), CallSpec{
		UserPrompt:           "extract entities",
		ToolName:             "extract_entities",
		ResponseFormat:       "json",
		RetryOnMalformedJSON: true,
		SkillID:              "sk-2",
	})
	// Fail on an unexpected error before touching res, so the failure
	// message names the real cause.
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	if !res.Success || res.ErrorKind != "" {
		t.Fatalf("expected success, got %+v", res)
	}

	m, ok := res.Parsed.(map[string]any)
	if !ok {
		t.Fatalf("Parsed not a map: %T %v", res.Parsed, res.Parsed)
	}
	if m["foo"] != "bar" {
		t.Errorf("Parsed[foo] = %v", m["foo"])
	}
}
// TestCall_JSONRetryPath covers the malformed-JSON retry. The stub's
// first reply is not valid JSON and its second reply is. The test
// expects a successful result holding the second reply's parsed object,
// exactly two completion calls, a retry prompt containing
// "Return ONLY valid JSON", and token counts summed over both attempts.
func TestCall_JSONRetryPath(t *testing.T) {
	store := &fakeStorage{}
	h := New(store, nil)

	calls := 0
	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, prompt string, _ []llm.Option) (string, Tokens, error) {
		calls++
		if calls == 1 {
			return "Here is your JSON: {oh no I forgot to format it", Tokens{InputTokens: 8, OutputTokens: 12}, nil
		}
		// Any later call is the retry and must carry the stricter prompt.
		if !strings.Contains(prompt, "Return ONLY valid JSON") {
			t.Errorf("retry prompt missing stricter prefix: %q", prompt)
		}
		return `{"key":"value"}`, Tokens{InputTokens: 14, OutputTokens: 6}, nil
	})
	defer restore()

	res, err := h.Call(context.Background(), CallSpec{
		UserPrompt:           "extract",
		ToolName:             "extract_entities",
		ResponseFormat:       "json",
		RetryOnMalformedJSON: true,
	})
	// Fail on an unexpected error before touching res, so the failure
	// message names the real cause.
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	if !res.Success || res.ErrorKind != "" {
		t.Fatalf("expected success, got %+v", res)
	}
	if calls != 2 {
		t.Errorf("expected 2 LLM calls, got %d", calls)
	}

	// If the assertion fails m is nil; indexing a nil map yields nil, so
	// the comparison below still reports the mismatch.
	m, _ := res.Parsed.(map[string]any)
	if m["key"] != "value" {
		t.Errorf("Parsed = %v", res.Parsed)
	}

	// Token counts should reflect both attempts (8+14 in, 12+6 out).
	if res.InputTokens != 22 || res.OutputTokens != 18 {
		t.Errorf("combined tokens wrong: in=%d out=%d", res.InputTokens, res.OutputTokens)
	}
}
// TestCall_JSONRetryFailsTwice has the stub return non-JSON text on
// every attempt. The result must keep Success=true (so the caller can
// fall back to result.Text), set ErrorKindMalformedJSON, and leave
// Parsed nil. Exactly one ledger row with the same Success and
// ErrorKind must be recorded.
func TestCall_JSONRetryFailsTwice(t *testing.T) {
	store := &fakeStorage{}
	h := New(store, nil)

	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
		return "still not JSON", Tokens{InputTokens: 10, OutputTokens: 4}, nil
	})
	defer restore()

	res, err := h.Call(context.Background(), CallSpec{
		UserPrompt:           "extract",
		ToolName:             "extract_entities",
		ResponseFormat:       "json",
		RetryOnMalformedJSON: true,
	})
	// The test expects Success=true, so an error return here is a real
	// failure and is reported before res is read.
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	if !res.Success {
		t.Errorf("expected Success=true (fall-back-to-text), got Success=false")
	}
	if res.ErrorKind != ErrorKindMalformedJSON {
		t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindMalformedJSON)
	}
	if res.Parsed != nil {
		t.Errorf("Parsed = %v, want nil after failed retry", res.Parsed)
	}

	rows := store.snapshot()
	if len(rows) != 1 {
		t.Fatalf("expected 1 ledger row, got %d", len(rows))
	}
	if !rows[0].Success || rows[0].ErrorKind != ErrorKindMalformedJSON {
		t.Errorf("ledger row mismatch: %+v", rows[0])
	}
}
// TestCall_LLMUnavailable makes the stubbed completion fail with a
// plain error. The result must report Success=false with
// ErrorKindLLMUnavailable, and exactly one ledger row must be recorded
// for the attempt.
func TestCall_LLMUnavailable(t *testing.T) {
	ledger := &fakeStorage{}
	helper := New(ledger, nil)

	undo := SetCompleteForTest(func(context.Context, llm.Model, string, string, []llm.Option) (string, Tokens, error) {
		return "", Tokens{}, errors.New("network error")
	})
	defer undo()

	// The error return is discarded here; the assertions below look only
	// at the result and the ledger.
	spec := CallSpec{
		UserPrompt: "hi",
		ToolName:   "summarize",
	}
	res, _ := helper.Call(context.Background(), spec)

	if res.Success {
		t.Errorf("expected Success=false")
	}
	if res.ErrorKind != ErrorKindLLMUnavailable {
		t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindLLMUnavailable)
	}
	if rows := ledger.snapshot(); len(rows) != 1 {
		t.Fatalf("expected 1 ledger row, got %d", len(rows))
	}
}
// TestCall_EmptyUserPromptErrors verifies that Call returns a non-nil
// error when the spec carries no UserPrompt.
func TestCall_EmptyUserPromptErrors(t *testing.T) {
	helper := New(&fakeStorage{}, nil)
	spec := CallSpec{ToolName: "summarize"}
	if _, err := helper.Call(context.Background(), spec); err == nil {
		t.Fatal("expected error for empty user_prompt")
	}
}
// TestCall_JSONWithCodeFenceParses pins first-attempt tolerance for a
// response wrapped in a ```json ... ``` fence. The fenced payload must
// parse without setting an ErrorKind, so callers don't spend a retry
// round-trip on a benign formatting wrapper.
func TestCall_JSONWithCodeFenceParses(t *testing.T) {
	store := &fakeStorage{}
	h := New(store, nil)

	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
		return "```json\n{\"x\":1}\n```", Tokens{InputTokens: 5, OutputTokens: 4}, nil
	})
	defer restore()

	res, err := h.Call(context.Background(), CallSpec{
		UserPrompt:           "extract",
		ToolName:             "extract_entities",
		ResponseFormat:       "json",
		RetryOnMalformedJSON: true,
	})
	// Fail on an unexpected error before touching res, so the failure
	// message names the real cause.
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	if res.ErrorKind != "" {
		t.Errorf("unexpected ErrorKind %q (fenced JSON should parse on first attempt)", res.ErrorKind)
	}

	// The comparison is against float64(1): the decoded number is
	// expected to arrive as a float64 inside the map[string]any.
	m, _ := res.Parsed.(map[string]any)
	if m["x"] != float64(1) {
		t.Errorf("Parsed[x] = %v, want 1", m["x"])
	}
}