b424261aca
Lifts mort's pkg/logic/llms into executus/model, decoupled from mort: - tiers.go: the tier resolver now reads a host-supplied config.Source under "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults, ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the resolution mechanism (cache, reasoning-suffix dialect, chain validation) is generic. No tier names hard-coded in the harness. - sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into UsageSink / TraceSink seams + a model-owned Span, with nil-safe context attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser). Both sinks optional (nil = off) so a light host records nothing. - lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf. - call.go keeps GenerateWith[T] (instrumented structured output) — this is the structured-output primitive; no separate structured/ package. - llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry + ledger). Its tests configure a minimal tier table via TestMain. New tests cover the inversion: config overrides fallback, tier registration, reasoning-suffix survival, nested-tier rejection, nil-sink no-ops. Full module: go build/vet/test -race green; core go.sum still free of gorm/redis/discordgo/sqlite. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
478 lines
17 KiB
Go
478 lines
17 KiB
Go
// Package model is executus's config-driven model-access layer over majordomo: it owns the
// package-level *majordomo.Registry (providers with mort's env keys,
// OpenAI-compat presets, lane-aware decoration, the config-backed tier
// resolver, legacy shortcut aliases, the foreman timeout decorator, and
// failover/health wiring), plus the host-facing call helpers
// (ParseModelRequest / ParseModelForContext / GenerateWith /
// CallAndExecute / SimpleCall) and usage/trace recording.
//
// The ":low/:medium/:high" reasoning-suffix dialect is an executus convenience:
// majordomo treats model ids as verbatim, so this package strips the
// suffix from specs and tier values and re-applies it per request by
// setting the request's ReasoningEffort on a wrapping Model.
|
|
package model
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
majordomo "gitea.stevedudenhoeffer.com/steve/majordomo"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/health"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/anthropic"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/google"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai"
|
|
)
|
|
|
|
// Usage and trace recording live in sink.go: SetUsageSink / SetTraceSink
|
|
// install the host seams, and ParseModelForContext stamps the model name on
|
|
// the context (via WithModel) for attribution.
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Package registry
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// buildConfig carries the knobs Wire feeds into buildRegistry. The zero
|
|
// value yields a lane-less registry with majordomo's default failover
|
|
// behavior — the bootstrap state tests and pre-Wire code paths run on.
|
|
type buildConfig struct {
|
|
lanes LaneRegistry
|
|
|
|
// maxRetries maps the llms.failover.max_retries convar onto
|
|
// ChainConfig.TransientRetries. <= 0 keeps majordomo's default (1).
|
|
maxRetries int
|
|
|
|
// cooldown maps the llms.failover.cooldown_seconds convar onto
|
|
// health.Config.BaseCooldown. <= 0 keeps the mort default (300s).
|
|
// Note majordomo grows the cooldown exponentially from this base;
|
|
// MaxCooldown is set to max(cooldown, 5m) so the operator dial
|
|
// dominates (a 10m base never gets capped below itself).
|
|
cooldown time.Duration
|
|
|
|
// observer receives one event per failover decision (failed attempt,
|
|
// bench, benched-skip). Typically failoverlog.NewObserver(...).
|
|
observer func(majordomo.FailoverEvent)
|
|
}
|
|
|
|
// defaultFailoverCooldown is the base bench cooldown used when the build
// config does not supply a positive one. It equals the historical
// llms.failover.cooldown_seconds default of 300 seconds.
const defaultFailoverCooldown = 5 * time.Minute
|
|
|
|
var (
|
|
registryMu sync.RWMutex
|
|
registry = buildRegistry(buildConfig{})
|
|
)
|
|
|
|
// Registry returns the current package-level majordomo registry. Most
|
|
// callers should use ParseModelRequest / ParseModelForContext instead;
|
|
// the registry itself is exposed for admin surfaces (health/bench) and
|
|
// for tests that need to substitute providers.
|
|
func Registry() *majordomo.Registry {
|
|
registryMu.RLock()
|
|
defer registryMu.RUnlock()
|
|
return registry
|
|
}
|
|
|
|
// Health returns the health tracker of the current registry — the live
|
|
// source of truth for benched models. Used by the `.failover` commands
|
|
// and the failover web UI (see ListBenched/BenchModel/UnbenchModel for
|
|
// the mort-flavored facade).
|
|
func Health() *health.Tracker {
|
|
return Registry().Health()
|
|
}
|
|
|
|
// setRegistry swaps the package registry. Bench/backoff state of the old
|
|
// registry is discarded — Wire is a boot-time operation.
|
|
func setRegistry(r *majordomo.Registry) {
|
|
registryMu.Lock()
|
|
defer registryMu.Unlock()
|
|
registry = r
|
|
}
|
|
|
|
// buildRegistry constructs a fully-wired majordomo registry:
|
|
//
|
|
// - health/chain config from the failover convars (via cfg),
|
|
// - mort's providers under their nonstandard env keys (OPENAI_KEY,
|
|
// GOOGLE_GEMINI_API_KEY, ...), every one lane-decorated,
|
|
// - OpenAI-compat presets (deepseek, moonshot+kimi, xai+grok, groq),
|
|
// - scheme factories for LLM_* env DSNs re-registered so DSN-defined
|
|
// providers (m1, arbitrary foreman targets) are lane-decorated too,
|
|
// with foreman additionally getting the 30-minute model timeout,
|
|
// - the legacy shortcut aliases, and
|
|
// - the delegating tier resolver (reads defaultResolver at Resolve
|
|
// time, so Init() can swap in the DB-backed resolver later).
|
|
func buildRegistry(cfg buildConfig) *majordomo.Registry {
|
|
cooldown := cfg.cooldown
|
|
if cooldown <= 0 {
|
|
cooldown = defaultFailoverCooldown
|
|
}
|
|
maxCooldown := cooldown
|
|
if maxCooldown < 5*time.Minute {
|
|
maxCooldown = 5 * time.Minute
|
|
}
|
|
|
|
r := majordomo.New(
|
|
// Env DSNs are loaded manually below, AFTER the scheme factories
|
|
// are overridden — New()'s eager scan would otherwise build
|
|
// LLM_*-defined providers with the stock (un-decorated) factories.
|
|
majordomo.WithoutEnvProviders(),
|
|
majordomo.WithHealthConfig(health.Config{
|
|
BaseCooldown: cooldown,
|
|
MaxCooldown: maxCooldown,
|
|
}),
|
|
majordomo.WithChainConfig(majordomo.ChainConfig{
|
|
TransientRetries: cfg.maxRetries,
|
|
// legacy gollm failed over on request-specific errors (400/413/422)
|
|
// without benching; majordomo fails fast on permanent errors by
|
|
// default. AdvanceOnPermanent preserves the availability-first
|
|
// behavior mort's executors rely on.
|
|
AdvanceOnPermanent: true,
|
|
Observer: cfg.observer,
|
|
}),
|
|
)
|
|
|
|
wrap := func(p llm.Provider) llm.Provider {
|
|
return wrapProviderForLane(p, cfg.lanes, defaultLaneExecTimeout)
|
|
}
|
|
|
|
// Core providers with mort's env keys.
|
|
r.RegisterProvider(wrap(openai.New(
|
|
openai.WithAPIKey(os.Getenv("OPENAI_KEY")),
|
|
)))
|
|
r.RegisterProvider(wrap(anthropic.New(
|
|
anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY")),
|
|
)))
|
|
r.RegisterProvider(wrap(google.New(
|
|
google.WithAPIKey(os.Getenv("GOOGLE_GEMINI_API_KEY")),
|
|
)))
|
|
r.RegisterProvider(wrap(localOllamaProvider()))
|
|
// ollama.Cloud reads OLLAMA_API_KEY itself; with the key unset the
|
|
// provider still registers and errors clearly at call time (parity
|
|
// with the previous behavior).
|
|
r.RegisterProvider(wrap(ollama.Cloud()))
|
|
|
|
// OpenAI-compatible presets. Base URLs mirror legacy gollm's defaults.
|
|
for _, preset := range []struct {
|
|
name, baseURL, envKey string
|
|
}{
|
|
{"deepseek", "https://api.deepseek.com/v1", "DEEPSEEK_API_KEY"},
|
|
{"moonshot", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"},
|
|
{"kimi", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, // alias provider for moonshot
|
|
{"xai", "https://api.x.ai/v1", "XAI_API_KEY"},
|
|
{"grok", "https://api.x.ai/v1", "XAI_API_KEY"}, // alias provider for xai
|
|
{"groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"},
|
|
} {
|
|
r.RegisterProvider(wrap(openai.New(
|
|
openai.WithName(preset.name),
|
|
openai.WithBaseURL(preset.baseURL),
|
|
openai.WithAPIKey(os.Getenv(preset.envKey)),
|
|
)))
|
|
}
|
|
|
|
// Scheme factories for LLM_* env DSNs. Re-registered so DSN-defined
|
|
// providers go through the lane decorator like the built-ins.
|
|
//
|
|
// foreman targets are slow local LLMs (large model loads, queued
|
|
// behind other requests), so their models additionally get a hard
|
|
// 30-minute timeout and a matching lane execution backstop — the
|
|
// default 5-minute lane backstop would strangle them.
|
|
r.RegisterScheme("foreman", func(name string, dsn majordomo.DSN) (llm.Provider, error) {
|
|
p := ollama.Foreman(dsn.BaseURL(), dsn.Token, ollama.WithName(name))
|
|
return wrapProviderForLane(
|
|
withModelTimeout(p, foremanModelTimeout),
|
|
cfg.lanes,
|
|
foremanLaneExecTimeout,
|
|
), nil
|
|
})
|
|
laneScheme := func(factory majordomo.SchemeFactory) majordomo.SchemeFactory {
|
|
return func(name string, dsn majordomo.DSN) (llm.Provider, error) {
|
|
p, err := factory(name, dsn)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return wrap(p), nil
|
|
}
|
|
}
|
|
ollamaScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
|
|
return ollama.New(
|
|
ollama.WithName(name),
|
|
ollama.WithBaseURL(dsn.BaseURL()),
|
|
ollama.WithToken(dsn.Token),
|
|
), nil
|
|
})
|
|
r.RegisterScheme("ollama", ollamaScheme)
|
|
r.RegisterScheme("ollama-cloud", ollamaScheme)
|
|
r.RegisterScheme("openai", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
|
|
return openai.New(
|
|
openai.WithName(name),
|
|
openai.WithBaseURL(dsn.BaseURL()),
|
|
openai.WithAPIKey(dsn.Token),
|
|
), nil
|
|
}))
|
|
r.RegisterScheme("anthropic", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
|
|
return anthropic.New(
|
|
anthropic.WithName(name),
|
|
anthropic.WithBaseURL(dsn.BaseURL()),
|
|
anthropic.WithAPIKey(dsn.Token),
|
|
), nil
|
|
}))
|
|
googleScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
|
|
return google.New(
|
|
google.WithName(name),
|
|
google.WithBaseURL(dsn.BaseURL()),
|
|
google.WithAPIKey(dsn.Token),
|
|
), nil
|
|
})
|
|
r.RegisterScheme("google", googleScheme)
|
|
r.RegisterScheme("gemini", googleScheme)
|
|
|
|
// Eager LLM_* env scan, now with the decorated scheme factories in
|
|
// place. Malformed entries are recorded per-name and surface on use.
|
|
env := make(map[string]string)
|
|
for _, kv := range os.Environ() {
|
|
if k, v, ok := strings.Cut(kv, "="); ok {
|
|
env[k] = v
|
|
}
|
|
}
|
|
_ = r.LoadEnv(env)
|
|
|
|
// Legacy shortcut aliases (sonnet, haiku, ...). Same strings as the
|
|
// historical table; kept in sync with legacyAliasSpecs below.
|
|
for name, spec := range legacyAliasSpecs {
|
|
r.RegisterAlias(name, spec)
|
|
}
|
|
|
|
// Tier resolver: a delegating closure so Init() and test helpers can
|
|
// swap defaultResolver without rebuilding the registry. The resolver
|
|
// returns specs with the legacy reasoning suffixes already stripped
|
|
// (per chain element); the tier's default reasoning level is applied
|
|
// by ParseModelRequest, not here.
|
|
r.RegisterResolver(majordomo.ResolverFunc(func(name string) (string, bool) {
|
|
res := defaultResolver
|
|
if res == nil {
|
|
return "", false
|
|
}
|
|
spec, _, ok := res.Resolve(name)
|
|
return spec, ok
|
|
}))
|
|
|
|
return r
|
|
}
|
|
|
|
// localOllamaProvider builds the local Ollama provider, honoring
|
|
// OLLAMA_BASE_URL when set (mort's historical env var; ollama.Local
|
|
// itself honors OLLAMA_HOST).
|
|
func localOllamaProvider() llm.Provider {
|
|
if url := os.Getenv("OLLAMA_BASE_URL"); url != "" {
|
|
return ollama.Local(ollama.WithBaseURL(url))
|
|
}
|
|
return ollama.Local()
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Spec parsing
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// ParseModelRequest resolves a model request string to a ready-to-use Model.
|
|
// It handles, in order:
|
|
//
|
|
// - empty spec → tier "fast"
|
|
// - the legacy ":low/:medium/:high" reasoning suffix, stripped per chain
|
|
// element (ollama tags like ":30b" or ":cloud" are preserved); the
|
|
// level is applied to every call via llm.WithReasoningEffort
|
|
// - tier aliases (DB-backed convars; a tier value's own suffix becomes
|
|
// the default level when the caller didn't supply one)
|
|
// - legacy shortcut aliases (sonnet, haiku, opus, ...)
|
|
// - provider/model lookup and LLM_* env-DSN fallback (majordomo)
|
|
// - comma-separated failover chains with health-tracked bench/backoff
|
|
//
|
|
// The returned Model is instrumented: token usage from every successful
|
|
// Generate is recorded to the package usage recorder automatically. Do
|
|
// NOT additionally call RecordUsage on responses from a parsed model.
|
|
func ParseModelRequest(spec string) (majordomo.Model, error) {
|
|
spec = strings.TrimSpace(spec)
|
|
if spec == "" {
|
|
spec = "fast"
|
|
}
|
|
|
|
clean, level := splitReasoningSpec(spec)
|
|
|
|
// Tier default reasoning: when the (suffix-free) spec is exactly a
|
|
// tier name and the caller didn't ask for a level, the tier value's
|
|
// own suffix (e.g. "anthropic/claude-opus-4-6:high") applies.
|
|
if level == "" && defaultResolver != nil {
|
|
if _, tierLevel, ok := defaultResolver.Resolve(clean); ok {
|
|
level = tierLevel
|
|
}
|
|
}
|
|
|
|
m, err := Registry().Parse(clean)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("model %q: %w", spec, err)
|
|
}
|
|
if level != "" {
|
|
m = &reasoningModel{inner: m, level: level}
|
|
}
|
|
return &instrumentedModel{inner: m}, nil
|
|
}
|
|
|
|
// ParseModelForContext combines ParseModelRequest with llmusage.WithModel so
|
|
// that the resolved model name is recorded in the context for usage tracking.
|
|
// Prefer this over bare ParseModelRequest in all new code.
|
|
func ParseModelForContext(ctx context.Context, req string) (context.Context, majordomo.Model, error) {
|
|
model, err := ParseModelRequest(req)
|
|
if err != nil {
|
|
return ctx, nil, err
|
|
}
|
|
ctx = WithModel(ctx, ResolveModelName(req))
|
|
return ctx, model, nil
|
|
}
|
|
|
|
// reasoningModel applies a default reasoning effort to every request that
|
|
// doesn't carry one already. Mort's legacy ":low/:medium/:high" suffix
|
|
// dialect resolves to this wrapper because majordomo treats model ids as
|
|
// verbatim (no suffix stripping).
|
|
type reasoningModel struct {
|
|
inner llm.Model
|
|
level string
|
|
}
|
|
|
|
func (m *reasoningModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
|
|
req = req.Apply(opts...)
|
|
if req.ReasoningEffort == "" {
|
|
req.ReasoningEffort = m.level
|
|
}
|
|
return m.inner.Generate(ctx, req)
|
|
}
|
|
|
|
func (m *reasoningModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
|
|
req = req.Apply(opts...)
|
|
if req.ReasoningEffort == "" {
|
|
req.ReasoningEffort = m.level
|
|
}
|
|
return m.inner.Stream(ctx, req)
|
|
}
|
|
|
|
// Capabilities reports the wrapped model's capabilities unchanged.
func (m *reasoningModel) Capabilities() llm.Capabilities {
	return m.inner.Capabilities()
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Reasoning-suffix dialect
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// reasoningLevels lists the legacy suffix values that count as a
// reasoning level. Matching is exact and case-sensitive.
var reasoningLevels = map[string]bool{
	"low":    true,
	"medium": true,
	"high":   true,
}

// splitReasoning removes a trailing ":low", ":medium" or ":high" from a
// single model request string and returns the remainder together with the
// level. Only the text after the last colon is examined; when it is not a
// recognized level (or there is no colon) the input comes back unchanged
// with an empty level, so other colon suffixes such as ollama tags
// (":30b", ":q4_K_M") or date stamps pass through untouched.
func splitReasoning(s string) (string, string) {
	cut := strings.LastIndexByte(s, ':')
	if cut >= 0 {
		if suffix := s[cut+1:]; reasoningLevels[suffix] {
			return s[:cut], suffix
		}
	}
	return s, ""
}
|
|
|
|
// splitReasoningSpec strips the legacy reasoning suffix from every element
|
|
// of a (possibly comma-separated) spec. The returned level is the first
|
|
// non-empty per-element level — majordomo chains carry one request-level
|
|
// reasoning effort, not one per target, so the head element's preference
|
|
// wins. Elements without a suffix are unchanged.
|
|
func splitReasoningSpec(spec string) (string, string) {
|
|
if !strings.Contains(spec, ",") {
|
|
return splitReasoning(strings.TrimSpace(spec))
|
|
}
|
|
parts := strings.Split(spec, ",")
|
|
level := ""
|
|
for i, p := range parts {
|
|
s, l := splitReasoning(strings.TrimSpace(p))
|
|
parts[i] = s
|
|
if level == "" {
|
|
level = l
|
|
}
|
|
}
|
|
return strings.Join(parts, ","), level
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Usage-attribution name resolution
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// ResolveModelName returns the model portion of a request string, stripping
|
|
// any reasoning suffix and resolving tier aliases. The result is used for
|
|
// usage attribution (keyed on model name, not provider or reasoning level).
|
|
func ResolveModelName(req string) string {
|
|
// Strip any reasoning-level suffix before resolving — the level is a
|
|
// per-request setting, not part of the model identity.
|
|
req, _ = splitReasoning(req)
|
|
|
|
// Tier expansion: when the request is a tier alias, fold it through the
|
|
// resolver and return the model portion of its current convar value. The
|
|
// empty string is treated as "fast" for compatibility with callers that
|
|
// pre-resolution defaulted to fast.
|
|
if defaultResolver != nil {
|
|
key := req
|
|
if key == "" {
|
|
key = "fast"
|
|
}
|
|
if spec, _, ok := defaultResolver.Resolve(key); ok && spec != "" {
|
|
// A tier may resolve to a comma-separated failover chain. Attribute
|
|
// usage to the first (preferred) entry's model name rather than the
|
|
// whole chain string.
|
|
if i := strings.IndexByte(spec, ','); i >= 0 {
|
|
spec = strings.TrimSpace(spec[:i])
|
|
}
|
|
if idx := strings.Index(spec, "/"); idx >= 0 {
|
|
return spec[idx+1:]
|
|
}
|
|
return spec
|
|
}
|
|
}
|
|
|
|
// For non-tier requests, return the model portion after the slash.
|
|
// Static aliases are NOT expanded here beyond the legacy table below:
|
|
// callers that went through ParseModelRequest already carry the
|
|
// concrete spec.
|
|
if idx := strings.Index(req, "/"); idx >= 0 {
|
|
return req[idx+1:]
|
|
}
|
|
|
|
// Legacy shortcut fallback: callers that pass bare names like "sonnet"
|
|
// to ResolveModelName (without going through ParseModelRequest) still
|
|
// need the concrete model name for usage keys.
|
|
if spec, ok := legacyAliasSpecs[req]; ok {
|
|
if idx := strings.Index(spec, "/"); idx >= 0 {
|
|
return spec[idx+1:]
|
|
}
|
|
return spec
|
|
}
|
|
|
|
return req
|
|
}
|
|
|
|
// legacyAliasSpecs is the table of legacy shortcut names and the full
// "provider/model" spec each one stands for. buildRegistry registers
// every entry as a static alias, and ResolveModelName consults the table
// to turn a bare shortcut into a concrete model name for usage keys.
var legacyAliasSpecs = map[string]string{
	// OpenAI
	"openai":      "openai/gpt-4o-mini",
	"gpt-4":       "openai/gpt-4",
	"gpt-4o":      "openai/gpt-4o",
	"gpt-4o-mini": "openai/gpt-4o-mini",

	// Anthropic
	"sonnet":     "anthropic/claude-sonnet-4-6",
	"sonnet-4.5": "anthropic/claude-sonnet-4-5-20250929",
	"haiku":      "anthropic/claude-haiku-4-5-20251001",
	"opus":       "anthropic/claude-opus-4-6",

	// Google
	"gemini":       "google/gemini-2.0-flash",
	"gemini-flash": "google/gemini-2.0-flash",
	"gemini-pro":   "google/gemini-2.0-pro",
}
|