Files
executus/model/llms.go
T
steve b424261aca
executus CI / test (pull_request) Successful in 58s
Adversarial Review (Gadfly) / review (pull_request) Successful in 26m27s
executus CI / test (push) Successful in 1m2s
P1: model layer (convar->config inversion) + llmmeta
Lifts mort's pkg/logic/llms into executus/model, decoupled from mort:

- tiers.go: the tier resolver now reads a host-supplied config.Source under
  "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults,
  ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the
  resolution mechanism (cache, reasoning-suffix dialect, chain validation) is
  generic. No tier names hard-coded in the harness.
- sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into
  UsageSink / TraceSink seams + a model-owned Span, with nil-safe context
  attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser).
  Both sinks optional (nil = off) so a light host records nothing.
- lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf.
- call.go keeps GenerateWith[T] (instrumented structured output) — this is the
  structured-output primitive; no separate structured/ package.
- llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry
  + ledger). Its tests configure a minimal tier table via TestMain.

New tests cover the inversion: config overrides fallback, tier registration,
reasoning-suffix survival, nested-tier rejection, nil-sink no-ops.

Full module: go build/vet/test -race green; core go.sum still free of
gorm/redis/discordgo/sqlite.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 19:47:13 -04:00

478 lines
17 KiB
Go

// Package model is executus's config-driven model-access layer over
// majordomo. It owns the package-level *majordomo.Registry (providers with
// mort's env keys, OpenAI-compat presets, lane-aware decoration, the
// config-backed tier resolver, legacy shortcut aliases, the foreman timeout
// decorator, and failover/health wiring), plus the mort-facing call helpers
// (ParseModelRequest / ParseModelForContext / GenerateWith /
// CallAndExecute / SimpleCall) and usage/trace recording.
//
// The ":low/:medium/:high" reasoning-suffix dialect is an executus convenience:
// majordomo treats model ids as verbatim, so this package strips the
// suffix from specs and tier values and re-applies it per request via
// llm.WithReasoningEffort on a wrapping Model.
package model
import (
"context"
"fmt"
"os"
"strings"
"sync"
"time"
majordomo "gitea.stevedudenhoeffer.com/steve/majordomo"
"gitea.stevedudenhoeffer.com/steve/majordomo/health"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/anthropic"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/google"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai"
)
// Usage and trace recording live in sink.go: SetUsageSink / SetTraceSink
// install the host seams, and ParseModelForContext stamps the model name on
// the context (via WithModel) for attribution.
// ---------------------------------------------------------------------------
// Package registry
// ---------------------------------------------------------------------------
// buildConfig carries the knobs Wire feeds into buildRegistry. The zero
// value yields a lane-less registry with majordomo's default failover
// behavior — the bootstrap state tests and pre-Wire code paths run on
// (the package-level registry is initialised from buildConfig{}).
type buildConfig struct {
// lanes is the lane registry buildRegistry hands to wrapProviderForLane
// for every provider and scheme factory it registers.
lanes LaneRegistry
// maxRetries maps the llms.failover.max_retries convar onto
// ChainConfig.TransientRetries. <= 0 keeps majordomo's default (1).
// buildRegistry passes the value through unmodified.
maxRetries int
// cooldown maps the llms.failover.cooldown_seconds convar onto
// health.Config.BaseCooldown. <= 0 keeps the mort default (300s,
// defaultFailoverCooldown).
// Note majordomo grows the cooldown exponentially from this base;
// MaxCooldown is set to max(cooldown, 5m) so the operator dial
// dominates (a 10m base never gets capped below itself).
cooldown time.Duration
// observer receives one event per failover decision (failed attempt,
// bench, benched-skip). Typically failoverlog.NewObserver(...).
// It is installed as ChainConfig.Observer; buildRegistry does not
// nil-check it.
observer func(majordomo.FailoverEvent)
}
// defaultFailoverCooldown is the base bench cooldown buildRegistry falls back
// to when buildConfig.cooldown is unset (<= 0). Five minutes is the same 300s
// the historical llms.failover.cooldown_seconds convar defaulted to.
const defaultFailoverCooldown = 5 * time.Minute
// registryMu guards every read and write of registry.
var registryMu sync.RWMutex

// registry is the package-level majordomo registry. It starts out built from
// the zero buildConfig and is replaced through setRegistry.
var registry = buildRegistry(buildConfig{})
// Registry returns the package-level majordomo registry that is current at
// the time of the call, read under the registry lock. Prefer
// ParseModelRequest / ParseModelForContext for ordinary model access; the
// raw registry is exported for admin surfaces (health/bench) and for tests
// that need to substitute providers.
func Registry() *majordomo.Registry {
	registryMu.RLock()
	current := registry
	registryMu.RUnlock()
	return current
}
// Health returns the health tracker belonging to the current registry — the
// live source of truth for benched models. The `.failover` commands and the
// failover web UI read it (ListBenched/BenchModel/UnbenchModel are the
// mort-flavored facade over it).
func Health() *health.Tracker {
	current := Registry()
	return current.Health()
}
// setRegistry installs r as the package registry under the write lock. The
// previous registry is simply dropped, so its bench/backoff state is lost —
// acceptable because Wire is a boot-time operation.
func setRegistry(r *majordomo.Registry) {
	registryMu.Lock()
	registry = r
	registryMu.Unlock()
}
// buildRegistry constructs a fully-wired majordomo registry:
//
// - health/chain config from the failover convars (via cfg),
// - mort's providers under their nonstandard env keys (OPENAI_KEY,
// GOOGLE_GEMINI_API_KEY, ...), every one lane-decorated,
// - OpenAI-compat presets (deepseek, moonshot+kimi, xai+grok, groq),
// - scheme factories for LLM_* env DSNs re-registered so DSN-defined
// providers (m1, arbitrary foreman targets) are lane-decorated too,
// with foreman additionally getting the 30-minute model timeout,
// - the legacy shortcut aliases, and
// - the delegating tier resolver (reads defaultResolver at Resolve
// time, so the resolver can be swapped after the registry is built).
//
// Statement order matters: the scheme factories must be registered before
// the LoadEnv call so env-defined providers are built by the decorated
// factories rather than the stock ones.
func buildRegistry(cfg buildConfig) *majordomo.Registry {
// A non-positive cooldown means "not configured": fall back to the default.
cooldown := cfg.cooldown
if cooldown <= 0 {
cooldown = defaultFailoverCooldown
}
// The cap is the base cooldown, floored at 5 minutes, so it is never
// smaller than the base itself.
maxCooldown := cooldown
if maxCooldown < 5*time.Minute {
maxCooldown = 5 * time.Minute
}
r := majordomo.New(
// Env DSNs are loaded manually below, AFTER the scheme factories
// are overridden — New()'s eager scan would otherwise build
// LLM_*-defined providers with the stock (un-decorated) factories.
majordomo.WithoutEnvProviders(),
majordomo.WithHealthConfig(health.Config{
BaseCooldown: cooldown,
MaxCooldown: maxCooldown,
}),
majordomo.WithChainConfig(majordomo.ChainConfig{
TransientRetries: cfg.maxRetries,
// legacy gollm failed over on request-specific errors (400/413/422)
// without benching; majordomo fails fast on permanent errors by
// default. AdvanceOnPermanent preserves the availability-first
// behavior mort's executors rely on.
AdvanceOnPermanent: true,
Observer: cfg.observer,
}),
)
// wrap applies the standard lane decoration (cfg.lanes with the default
// lane execution timeout) to a provider.
wrap := func(p llm.Provider) llm.Provider {
return wrapProviderForLane(p, cfg.lanes, defaultLaneExecTimeout)
}
// Core providers with mort's env keys.
r.RegisterProvider(wrap(openai.New(
openai.WithAPIKey(os.Getenv("OPENAI_KEY")),
)))
r.RegisterProvider(wrap(anthropic.New(
anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY")),
)))
r.RegisterProvider(wrap(google.New(
google.WithAPIKey(os.Getenv("GOOGLE_GEMINI_API_KEY")),
)))
r.RegisterProvider(wrap(localOllamaProvider()))
// ollama.Cloud reads OLLAMA_API_KEY itself; with the key unset the
// provider still registers and errors clearly at call time (parity
// with the previous behavior).
r.RegisterProvider(wrap(ollama.Cloud()))
// OpenAI-compatible presets. Base URLs mirror legacy gollm's defaults.
// Each entry becomes an openai provider registered under its own name;
// alias entries reuse another entry's base URL and env key.
for _, preset := range []struct {
name, baseURL, envKey string
}{
{"deepseek", "https://api.deepseek.com/v1", "DEEPSEEK_API_KEY"},
{"moonshot", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"},
{"kimi", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, // alias provider for moonshot
{"xai", "https://api.x.ai/v1", "XAI_API_KEY"},
{"grok", "https://api.x.ai/v1", "XAI_API_KEY"}, // alias provider for xai
{"groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"},
} {
r.RegisterProvider(wrap(openai.New(
openai.WithName(preset.name),
openai.WithBaseURL(preset.baseURL),
openai.WithAPIKey(os.Getenv(preset.envKey)),
)))
}
// Scheme factories for LLM_* env DSNs. Re-registered so DSN-defined
// providers go through the lane decorator like the built-ins.
//
// foreman targets are slow local LLMs (large model loads, queued
// behind other requests), so their models additionally get a hard
// 30-minute timeout and a matching lane execution backstop — the
// default 5-minute lane backstop would strangle them.
r.RegisterScheme("foreman", func(name string, dsn majordomo.DSN) (llm.Provider, error) {
p := ollama.Foreman(dsn.BaseURL(), dsn.Token, ollama.WithName(name))
// foreman bypasses wrap: it needs its own lane timeout, and the model
// timeout decorator sits inside the lane decoration.
return wrapProviderForLane(
withModelTimeout(p, foremanModelTimeout),
cfg.lanes,
foremanLaneExecTimeout,
), nil
})
// laneScheme adapts a plain scheme factory so that whatever provider it
// builds is lane-wrapped; a factory error is returned unchanged.
laneScheme := func(factory majordomo.SchemeFactory) majordomo.SchemeFactory {
return func(name string, dsn majordomo.DSN) (llm.Provider, error) {
p, err := factory(name, dsn)
if err != nil {
return nil, err
}
return wrap(p), nil
}
}
// The "ollama" and "ollama-cloud" schemes share one factory.
ollamaScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
return ollama.New(
ollama.WithName(name),
ollama.WithBaseURL(dsn.BaseURL()),
ollama.WithToken(dsn.Token),
), nil
})
r.RegisterScheme("ollama", ollamaScheme)
r.RegisterScheme("ollama-cloud", ollamaScheme)
r.RegisterScheme("openai", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
return openai.New(
openai.WithName(name),
openai.WithBaseURL(dsn.BaseURL()),
openai.WithAPIKey(dsn.Token),
), nil
}))
r.RegisterScheme("anthropic", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
return anthropic.New(
anthropic.WithName(name),
anthropic.WithBaseURL(dsn.BaseURL()),
anthropic.WithAPIKey(dsn.Token),
), nil
}))
// The "google" and "gemini" schemes share one factory.
googleScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
return google.New(
google.WithName(name),
google.WithBaseURL(dsn.BaseURL()),
google.WithAPIKey(dsn.Token),
), nil
})
r.RegisterScheme("google", googleScheme)
r.RegisterScheme("gemini", googleScheme)
// Eager LLM_* env scan, now with the decorated scheme factories in
// place. Malformed entries are recorded per-name and surface on use.
// The process environment is snapshotted into a map first; entries
// without an "=" are skipped.
env := make(map[string]string)
for _, kv := range os.Environ() {
if k, v, ok := strings.Cut(kv, "="); ok {
env[k] = v
}
}
// LoadEnv's return value is discarded here (see the note above on how
// malformed entries surface).
_ = r.LoadEnv(env)
// Legacy shortcut aliases (sonnet, haiku, ...). Same strings as the
// historical table; kept in sync with legacyAliasSpecs below.
for name, spec := range legacyAliasSpecs {
r.RegisterAlias(name, spec)
}
// Tier resolver: a delegating closure so Init() and test helpers can
// swap defaultResolver without rebuilding the registry. The resolver
// returns specs with the legacy reasoning suffixes already stripped
// (per chain element); the tier's default reasoning level is applied
// by ParseModelRequest, not here.
r.RegisterResolver(majordomo.ResolverFunc(func(name string) (string, bool) {
// Read the package-level resolver once so the nil check and the call
// below see the same value.
res := defaultResolver
if res == nil {
return "", false
}
// The middle return value (the tier's reasoning level) is not needed
// by the registry and is dropped.
spec, _, ok := res.Resolve(name)
return spec, ok
}))
return r
}
// localOllamaProvider returns the local Ollama provider. A non-empty
// OLLAMA_BASE_URL (mort's historical env var) is passed through as the base
// URL; otherwise ollama.Local is used with its own defaults (ollama.Local
// itself honors OLLAMA_HOST).
func localOllamaProvider() llm.Provider {
	base := os.Getenv("OLLAMA_BASE_URL")
	if base == "" {
		return ollama.Local()
	}
	return ollama.Local(ollama.WithBaseURL(base))
}
// ---------------------------------------------------------------------------
// Spec parsing
// ---------------------------------------------------------------------------
// ParseModelRequest resolves a model request string to a ready-to-use Model.
// It handles, in order:
//
//   - empty spec → tier "fast" (this includes a spec that consists solely of
//     a reasoning suffix, e.g. ":high", which means "fast" at that level —
//     the same reading ResolveModelName applies for attribution)
//   - the legacy ":low/:medium/:high" reasoning suffix, stripped per chain
//     element (ollama tags like ":30b" or ":cloud" are preserved); the
//     level is applied to every call via the reasoningModel wrapper
//   - tier aliases (host-configured; a tier value's own suffix becomes the
//     default level when the caller didn't supply one)
//   - legacy shortcut aliases (sonnet, haiku, opus, ...)
//   - provider/model lookup and LLM_* env-DSN fallback (majordomo)
//   - comma-separated failover chains with health-tracked bench/backoff
//
// On failure the registry's error is wrapped with the (defaulted) request
// string and a nil Model is returned.
//
// The returned Model is wrapped in instrumentedModel, which takes care of
// usage recording (see sink.go); callers should not record usage for its
// responses a second time.
func ParseModelRequest(spec string) (majordomo.Model, error) {
	spec = strings.TrimSpace(spec)
	if spec == "" {
		spec = "fast"
	}
	clean, level := splitReasoningSpec(spec)
	if clean == "" {
		// Only a reasoning suffix was supplied; fall back to the default
		// tier instead of handing the registry an empty spec.
		clean = "fast"
	}

	// Tier default reasoning: when the (suffix-free) spec is exactly a
	// tier name and the caller didn't ask for a level, the tier value's
	// own suffix (e.g. "anthropic/claude-opus-4-6:high") applies.
	//
	// defaultResolver can be swapped at runtime, so it is read exactly once
	// here — the nil check and the Resolve call must see the same value,
	// matching what the registry's resolver closure does.
	if res := defaultResolver; level == "" && res != nil {
		if _, tierLevel, ok := res.Resolve(clean); ok {
			level = tierLevel
		}
	}

	m, err := Registry().Parse(clean)
	if err != nil {
		return nil, fmt.Errorf("model %q: %w", spec, err)
	}
	if level != "" {
		m = &reasoningModel{inner: m, level: level}
	}
	return &instrumentedModel{inner: m}, nil
}
// ParseModelForContext parses req with ParseModelRequest and, on success,
// returns a context stamped (via WithModel) with the name ResolveModelName
// derives from req, so usage can be attributed to that model. On failure the
// incoming context is returned untouched together with a nil Model and the
// parse error. Prefer this over bare ParseModelRequest in all new code.
func ParseModelForContext(ctx context.Context, req string) (context.Context, majordomo.Model, error) {
	parsed, err := ParseModelRequest(req)
	if err == nil {
		return WithModel(ctx, ResolveModelName(req)), parsed, nil
	}
	return ctx, nil, err
}
// reasoningModel decorates a Model with a default reasoning effort: any
// request that reaches it without one is sent on with level filled in, while
// requests that already carry an effort are forwarded as they are. Mort's
// legacy ":low/:medium/:high" suffix dialect resolves to this wrapper
// because majordomo treats model ids as verbatim (no suffix stripping).
type reasoningModel struct {
	inner llm.Model
	level string
}

// withDefaultEffort folds opts into req and, when the result still has no
// reasoning effort, sets it to the wrapper's level.
func (m *reasoningModel) withDefaultEffort(req llm.Request, opts []llm.Option) llm.Request {
	req = req.Apply(opts...)
	if req.ReasoningEffort == "" {
		req.ReasoningEffort = m.level
	}
	return req
}

// Generate forwards the prepared request to the wrapped model. The options
// are already applied to the request, so none are passed along.
func (m *reasoningModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
	prepared := m.withDefaultEffort(req, opts)
	return m.inner.Generate(ctx, prepared)
}

// Stream is the streaming counterpart of Generate, with the same request
// preparation.
func (m *reasoningModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
	prepared := m.withDefaultEffort(req, opts)
	return m.inner.Stream(ctx, prepared)
}

// Capabilities reports the wrapped model's capabilities unchanged.
func (m *reasoningModel) Capabilities() llm.Capabilities {
	return m.inner.Capabilities()
}
// ---------------------------------------------------------------------------
// Reasoning-suffix dialect
// ---------------------------------------------------------------------------
// reasoningLevels holds the suffix values the legacy dialect recognizes.
// Matching is exact (case-sensitive).
var reasoningLevels = map[string]bool{
	"low":    true,
	"medium": true,
	"high":   true,
}

// splitReasoning separates a trailing ":low" / ":medium" / ":high" from a
// single model request string, returning the remainder and the level. Only
// the text after the LAST colon is examined; when it is not a recognized
// level the input comes back unchanged with an empty level, so other suffixes
// (ollama tags like ":30b" or ":q4_K_M", date stamps) flow through untouched.
func splitReasoning(s string) (string, string) {
	colon := strings.LastIndexByte(s, ':')
	if colon >= 0 {
		base, suffix := s[:colon], s[colon+1:]
		if reasoningLevels[suffix] {
			return base, suffix
		}
	}
	return s, ""
}

// splitReasoningSpec applies splitReasoning to each element of a (possibly
// comma-separated) spec. Elements are whitespace-trimmed and re-joined with
// a bare ","; elements without a suffix are otherwise unchanged. The level
// returned is the first non-empty one found scanning left to right —
// majordomo chains carry a single request-level reasoning effort, not one
// per target.
func splitReasoningSpec(spec string) (string, string) {
	elems := strings.Split(spec, ",")
	if len(elems) == 1 {
		// No comma: a plain single-model request.
		return splitReasoning(strings.TrimSpace(spec))
	}
	var chosen string
	cleaned := make([]string, 0, len(elems))
	for _, raw := range elems {
		base, lvl := splitReasoning(strings.TrimSpace(raw))
		cleaned = append(cleaned, base)
		if chosen == "" {
			chosen = lvl
		}
	}
	return strings.Join(cleaned, ","), chosen
}
// ---------------------------------------------------------------------------
// Usage-attribution name resolution
// ---------------------------------------------------------------------------
// ResolveModelName returns the bare model name a request should be
// attributed to for usage tracking (usage is keyed on model name, not on
// provider or reasoning level).
//
// The request is normalized the same way ParseModelRequest normalizes it:
// surrounding whitespace is ignored, and for a comma-separated failover
// chain only the first (preferred) element is considered. Any reasoning
// suffix is then stripped, and the result is resolved in this order:
//
//  1. tier alias (the empty request counts as "fast") → model portion of the
//     first entry of the tier's current value;
//  2. "provider/model" → everything after the first "/";
//  3. legacy shortcut alias (sonnet, haiku, ...) → model portion of its spec;
//  4. anything else → returned as is.
func ResolveModelName(req string) string {
	// A caller-supplied chain is attributed to its head element, exactly as
	// a tier-defined chain is below; otherwise the usage key would be a
	// fragment of the raw chain string.
	req = strings.TrimSpace(req)
	if head, _, isChain := strings.Cut(req, ","); isChain {
		req = strings.TrimSpace(head)
	}
	// Strip any reasoning-level suffix before resolving — the level is a
	// per-request setting, not part of the model identity.
	req, _ = splitReasoning(req)

	// modelPart drops the "provider/" prefix when there is one.
	modelPart := func(spec string) string {
		if _, name, ok := strings.Cut(spec, "/"); ok {
			return name
		}
		return spec
	}

	// Tier expansion. The resolver is read once so the nil check and the
	// call see the same value even if it is swapped concurrently. The empty
	// string is treated as "fast" for compatibility with callers that
	// pre-resolution defaulted to fast.
	if res := defaultResolver; res != nil {
		key := req
		if key == "" {
			key = "fast"
		}
		if spec, _, ok := res.Resolve(key); ok && spec != "" {
			// A tier may resolve to a comma-separated failover chain.
			// Attribute usage to the first (preferred) entry's model name
			// rather than the whole chain string.
			head, _, _ := strings.Cut(spec, ",")
			return modelPart(strings.TrimSpace(head))
		}
	}

	// For non-tier requests, return the model portion after the slash.
	// Static aliases are NOT expanded here beyond the legacy table below:
	// callers that went through ParseModelRequest already carry the
	// concrete spec.
	if strings.Contains(req, "/") {
		return modelPart(req)
	}

	// Legacy shortcut fallback: callers that pass bare names like "sonnet"
	// (without going through ParseModelRequest) still need the concrete
	// model name for usage keys.
	if spec, ok := legacyAliasSpecs[req]; ok {
		return modelPart(spec)
	}
	return req
}
// legacyAliasSpecs maps legacy shortcut names to their full provider/model
// spec. Registered with the registry as static aliases (buildRegistry calls
// RegisterAlias for every entry) AND consulted by ResolveModelName for
// bare-name usage attribution, where only the part after the "/" is used.
//
// Several shortcuts intentionally share a target: "openai" and "gpt-4o-mini"
// both map to openai/gpt-4o-mini, and "gemini" and "gemini-flash" both map to
// google/gemini-2.0-flash.
var legacyAliasSpecs = map[string]string{
// OpenAI shortcuts.
"openai": "openai/gpt-4o-mini",
"gpt-4": "openai/gpt-4",
"gpt-4o": "openai/gpt-4o",
"gpt-4o-mini": "openai/gpt-4o-mini",
// Anthropic shortcuts.
"sonnet": "anthropic/claude-sonnet-4-6",
"sonnet-4.5": "anthropic/claude-sonnet-4-5-20250929",
"haiku": "anthropic/claude-haiku-4-5-20251001",
"opus": "anthropic/claude-opus-4-6",
// Google shortcuts.
"gemini": "google/gemini-2.0-flash",
"gemini-flash": "google/gemini-2.0-flash",
"gemini-pro": "google/gemini-2.0-pro",
}