// Package model is executus's config-driven model-access layer over majordomo: it owns the // package-level *majordomo.Registry (providers with mort's env keys, // OpenAI-compat presets, lane-aware decoration, the DB-backed tier // resolver, legacy shortcut aliases, the foreman timeout decorator, and // failover/health wiring), plus the mort-facing call helpers // (ParseModelRequest / ParseModelForContext / GenerateWith / // CallAndExecute / SimpleCall) and usage/trace recording. // // The ":low/:medium/:high" reasoning-suffix dialect is an executus convenience: // majordomo treats model ids as verbatim, so this package strips the // suffix from specs and tier values and re-applies it per request via // llm.WithReasoningEffort on a wrapping Model. package model import ( "context" "fmt" "os" "strings" "sync" "time" majordomo "gitea.stevedudenhoeffer.com/steve/majordomo" "gitea.stevedudenhoeffer.com/steve/majordomo/health" "gitea.stevedudenhoeffer.com/steve/majordomo/llm" "gitea.stevedudenhoeffer.com/steve/majordomo/provider/anthropic" "gitea.stevedudenhoeffer.com/steve/majordomo/provider/google" "gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama" "gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai" ) // Usage and trace recording live in sink.go: SetUsageSink / SetTraceSink // install the host seams, and ParseModelForContext stamps the model name on // the context (via WithModel) for attribution. // --------------------------------------------------------------------------- // Package registry // --------------------------------------------------------------------------- // buildConfig carries the knobs Wire feeds into buildRegistry. The zero // value yields a lane-less registry with majordomo's default failover // behavior — the bootstrap state tests and pre-Wire code paths run on. type buildConfig struct { lanes LaneRegistry // maxRetries maps the llms.failover.max_retries convar onto // ChainConfig.TransientRetries. <= 0 keeps majordomo's default (1). maxRetries int // cooldown maps the llms.failover.cooldown_seconds convar onto // health.Config.BaseCooldown. <= 0 keeps the mort default (300s). // Note majordomo grows the cooldown exponentially from this base; // MaxCooldown is set to max(cooldown, 5m) so the operator dial // dominates (a 10m base never gets capped below itself). cooldown time.Duration // observer receives one event per failover decision (failed attempt, // bench, benched-skip). Typically failoverlog.NewObserver(...). observer func(majordomo.FailoverEvent) } // defaultFailoverCooldown matches the historical llms.failover.cooldown_seconds // convar default (300s). const defaultFailoverCooldown = 300 * time.Second var ( registryMu sync.RWMutex registry = buildRegistry(buildConfig{}) ) // Registry returns the current package-level majordomo registry. Most // callers should use ParseModelRequest / ParseModelForContext instead; // the registry itself is exposed for admin surfaces (health/bench) and // for tests that need to substitute providers. func Registry() *majordomo.Registry { registryMu.RLock() defer registryMu.RUnlock() return registry } // Health returns the health tracker of the current registry — the live // source of truth for benched models. Used by the `.failover` commands // and the failover web UI (see ListBenched/BenchModel/UnbenchModel for // the mort-flavored facade). func Health() *health.Tracker { return Registry().Health() } // setRegistry swaps the package registry. Bench/backoff state of the old // registry is discarded — Wire is a boot-time operation. func setRegistry(r *majordomo.Registry) { registryMu.Lock() defer registryMu.Unlock() registry = r } // buildRegistry constructs a fully-wired majordomo registry: // // - health/chain config from the failover convars (via cfg), // - mort's providers under their nonstandard env keys (OPENAI_KEY, // GOOGLE_GEMINI_API_KEY, ...), every one lane-decorated, // - OpenAI-compat presets (deepseek, moonshot+kimi, xai+grok, groq), // - scheme factories for LLM_* env DSNs re-registered so DSN-defined // providers (m1, arbitrary foreman targets) are lane-decorated too, // with foreman additionally getting the 30-minute model timeout, // - the legacy shortcut aliases, and // - the delegating tier resolver (reads defaultResolver at Resolve // time, so Init() can swap in the DB-backed resolver later). func buildRegistry(cfg buildConfig) *majordomo.Registry { cooldown := cfg.cooldown if cooldown <= 0 { cooldown = defaultFailoverCooldown } maxCooldown := cooldown if maxCooldown < 5*time.Minute { maxCooldown = 5 * time.Minute } r := majordomo.New( // Env DSNs are loaded manually below, AFTER the scheme factories // are overridden — New()'s eager scan would otherwise build // LLM_*-defined providers with the stock (un-decorated) factories. majordomo.WithoutEnvProviders(), majordomo.WithHealthConfig(health.Config{ BaseCooldown: cooldown, MaxCooldown: maxCooldown, }), majordomo.WithChainConfig(majordomo.ChainConfig{ TransientRetries: cfg.maxRetries, // legacy gollm failed over on request-specific errors (400/413/422) // without benching; majordomo fails fast on permanent errors by // default. AdvanceOnPermanent preserves the availability-first // behavior mort's executors rely on. AdvanceOnPermanent: true, Observer: cfg.observer, }), ) wrap := func(p llm.Provider) llm.Provider { return wrapProviderForLane(p, cfg.lanes, defaultLaneExecTimeout) } // Core providers with mort's env keys. r.RegisterProvider(wrap(openai.New( openai.WithAPIKey(os.Getenv("OPENAI_KEY")), ))) r.RegisterProvider(wrap(anthropic.New( anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY")), ))) r.RegisterProvider(wrap(google.New( google.WithAPIKey(os.Getenv("GOOGLE_GEMINI_API_KEY")), ))) r.RegisterProvider(wrap(localOllamaProvider())) // ollama.Cloud reads OLLAMA_API_KEY itself; with the key unset the // provider still registers and errors clearly at call time (parity // with the previous behavior). r.RegisterProvider(wrap(ollama.Cloud())) // OpenAI-compatible presets. Base URLs mirror legacy gollm's defaults. for _, preset := range []struct { name, baseURL, envKey string }{ {"deepseek", "https://api.deepseek.com/v1", "DEEPSEEK_API_KEY"}, {"moonshot", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, {"kimi", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, // alias provider for moonshot {"xai", "https://api.x.ai/v1", "XAI_API_KEY"}, {"grok", "https://api.x.ai/v1", "XAI_API_KEY"}, // alias provider for xai {"groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"}, } { r.RegisterProvider(wrap(openai.New( openai.WithName(preset.name), openai.WithBaseURL(preset.baseURL), openai.WithAPIKey(os.Getenv(preset.envKey)), ))) } // Scheme factories for LLM_* env DSNs. Re-registered so DSN-defined // providers go through the lane decorator like the built-ins. // // foreman targets are slow local LLMs (large model loads, queued // behind other requests), so their models additionally get a hard // 30-minute timeout and a matching lane execution backstop — the // default 5-minute lane backstop would strangle them. r.RegisterScheme("foreman", func(name string, dsn majordomo.DSN) (llm.Provider, error) { p := ollama.Foreman(dsn.BaseURL(), dsn.Token, ollama.WithName(name)) return wrapProviderForLane( withModelTimeout(p, foremanModelTimeout), cfg.lanes, foremanLaneExecTimeout, ), nil }) laneScheme := func(factory majordomo.SchemeFactory) majordomo.SchemeFactory { return func(name string, dsn majordomo.DSN) (llm.Provider, error) { p, err := factory(name, dsn) if err != nil { return nil, err } return wrap(p), nil } } ollamaScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { return ollama.New( ollama.WithName(name), ollama.WithBaseURL(dsn.BaseURL()), ollama.WithToken(dsn.Token), ), nil }) r.RegisterScheme("ollama", ollamaScheme) r.RegisterScheme("ollama-cloud", ollamaScheme) r.RegisterScheme("openai", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { return openai.New( openai.WithName(name), openai.WithBaseURL(dsn.BaseURL()), openai.WithAPIKey(dsn.Token), ), nil })) r.RegisterScheme("anthropic", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { return anthropic.New( anthropic.WithName(name), anthropic.WithBaseURL(dsn.BaseURL()), anthropic.WithAPIKey(dsn.Token), ), nil })) googleScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { return google.New( google.WithName(name), google.WithBaseURL(dsn.BaseURL()), google.WithAPIKey(dsn.Token), ), nil }) r.RegisterScheme("google", googleScheme) r.RegisterScheme("gemini", googleScheme) // Eager LLM_* env scan, now with the decorated scheme factories in // place. Malformed entries are recorded per-name and surface on use. env := make(map[string]string) for _, kv := range os.Environ() { if k, v, ok := strings.Cut(kv, "="); ok { env[k] = v } } _ = r.LoadEnv(env) // Legacy shortcut aliases (sonnet, haiku, ...). Same strings as the // historical table; kept in sync with legacyAliasSpecs below. for name, spec := range legacyAliasSpecs { r.RegisterAlias(name, spec) } // Tier resolver: a delegating closure so Init() and test helpers can // swap defaultResolver without rebuilding the registry. The resolver // returns specs with the legacy reasoning suffixes already stripped // (per chain element); the tier's default reasoning level is applied // by ParseModelRequest, not here. r.RegisterResolver(majordomo.ResolverFunc(func(name string) (string, bool) { res := defaultResolver if res == nil { return "", false } spec, _, ok := res.Resolve(name) return spec, ok })) return r } // localOllamaProvider builds the local Ollama provider, honoring // OLLAMA_BASE_URL when set (mort's historical env var; ollama.Local // itself honors OLLAMA_HOST). func localOllamaProvider() llm.Provider { if url := os.Getenv("OLLAMA_BASE_URL"); url != "" { return ollama.Local(ollama.WithBaseURL(url)) } return ollama.Local() } // --------------------------------------------------------------------------- // Spec parsing // --------------------------------------------------------------------------- // ParseModelRequest resolves a model request string to a ready-to-use Model. // It handles, in order: // // - empty spec → tier "fast" // - the legacy ":low/:medium/:high" reasoning suffix, stripped per chain // element (ollama tags like ":30b" or ":cloud" are preserved); the // level is applied to every call via llm.WithReasoningEffort // - tier aliases (DB-backed convars; a tier value's own suffix becomes // the default level when the caller didn't supply one) // - legacy shortcut aliases (sonnet, haiku, opus, ...) // - provider/model lookup and LLM_* env-DSN fallback (majordomo) // - comma-separated failover chains with health-tracked bench/backoff // // The returned Model is instrumented: token usage from every successful // Generate is recorded to the package usage recorder automatically. Do // NOT additionally call RecordUsage on responses from a parsed model. func ParseModelRequest(spec string) (majordomo.Model, error) { spec = strings.TrimSpace(spec) if spec == "" { spec = "fast" } clean, level := splitReasoningSpec(spec) // Tier default reasoning: when the (suffix-free) spec is exactly a // tier name and the caller didn't ask for a level, the tier value's // own suffix (e.g. "anthropic/claude-opus-4-6:high") applies. if level == "" && defaultResolver != nil { if _, tierLevel, ok := defaultResolver.Resolve(clean); ok { level = tierLevel } } m, err := Registry().Parse(clean) if err != nil { return nil, fmt.Errorf("model %q: %w", spec, err) } if level != "" { m = &reasoningModel{inner: m, level: level} } return &instrumentedModel{inner: m}, nil } // ParseModelForContext combines ParseModelRequest with llmusage.WithModel so // that the resolved model name is recorded in the context for usage tracking. // Prefer this over bare ParseModelRequest in all new code. func ParseModelForContext(ctx context.Context, req string) (context.Context, majordomo.Model, error) { model, err := ParseModelRequest(req) if err != nil { return ctx, nil, err } ctx = WithModel(ctx, ResolveModelName(req)) return ctx, model, nil } // reasoningModel applies a default reasoning effort to every request that // doesn't carry one already. Mort's legacy ":low/:medium/:high" suffix // dialect resolves to this wrapper because majordomo treats model ids as // verbatim (no suffix stripping). type reasoningModel struct { inner llm.Model level string } func (m *reasoningModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { req = req.Apply(opts...) if req.ReasoningEffort == "" { req.ReasoningEffort = m.level } return m.inner.Generate(ctx, req) } func (m *reasoningModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) { req = req.Apply(opts...) if req.ReasoningEffort == "" { req.ReasoningEffort = m.level } return m.inner.Stream(ctx, req) } func (m *reasoningModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() } // --------------------------------------------------------------------------- // Reasoning-suffix dialect // --------------------------------------------------------------------------- // reasoningLevels is the set of recognized legacy suffix values. var reasoningLevels = map[string]bool{"low": true, "medium": true, "high": true} // splitReasoning peels an optional ":low" / ":medium" / ":high" suffix off // a single model request string. Returns the input unchanged and "" when no // recognized level is present, so non-reasoning suffixes (ollama tags like // ":30b" or ":q4_K_M", date stamps) flow through untouched. func splitReasoning(s string) (string, string) { idx := strings.LastIndex(s, ":") if idx < 0 { return s, "" } if lvl := s[idx+1:]; reasoningLevels[lvl] { return s[:idx], lvl } return s, "" } // splitReasoningSpec strips the legacy reasoning suffix from every element // of a (possibly comma-separated) spec. The returned level is the first // non-empty per-element level — majordomo chains carry one request-level // reasoning effort, not one per target, so the head element's preference // wins. Elements without a suffix are unchanged. func splitReasoningSpec(spec string) (string, string) { if !strings.Contains(spec, ",") { return splitReasoning(strings.TrimSpace(spec)) } parts := strings.Split(spec, ",") level := "" for i, p := range parts { s, l := splitReasoning(strings.TrimSpace(p)) parts[i] = s if level == "" { level = l } } return strings.Join(parts, ","), level } // --------------------------------------------------------------------------- // Usage-attribution name resolution // --------------------------------------------------------------------------- // ResolveModelName returns the model portion of a request string, stripping // any reasoning suffix and resolving tier aliases. The result is used for // usage attribution (keyed on model name, not provider or reasoning level). func ResolveModelName(req string) string { // Strip any reasoning-level suffix before resolving — the level is a // per-request setting, not part of the model identity. req, _ = splitReasoning(req) // Tier expansion: when the request is a tier alias, fold it through the // resolver and return the model portion of its current convar value. The // empty string is treated as "fast" for compatibility with callers that // pre-resolution defaulted to fast. if defaultResolver != nil { key := req if key == "" { key = "fast" } if spec, _, ok := defaultResolver.Resolve(key); ok && spec != "" { // A tier may resolve to a comma-separated failover chain. Attribute // usage to the first (preferred) entry's model name rather than the // whole chain string. if i := strings.IndexByte(spec, ','); i >= 0 { spec = strings.TrimSpace(spec[:i]) } if idx := strings.Index(spec, "/"); idx >= 0 { return spec[idx+1:] } return spec } } // For non-tier requests, return the model portion after the slash. // Static aliases are NOT expanded here beyond the legacy table below: // callers that went through ParseModelRequest already carry the // concrete spec. if idx := strings.Index(req, "/"); idx >= 0 { return req[idx+1:] } // Legacy shortcut fallback: callers that pass bare names like "sonnet" // to ResolveModelName (without going through ParseModelRequest) still // need the concrete model name for usage keys. if spec, ok := legacyAliasSpecs[req]; ok { if idx := strings.Index(spec, "/"); idx >= 0 { return spec[idx+1:] } return spec } return req } // legacyAliasSpecs maps legacy shortcut names to their full provider/model // spec. Registered with the registry as static aliases AND consulted by // ResolveModelName for bare-name usage attribution. var legacyAliasSpecs = map[string]string{ "openai": "openai/gpt-4o-mini", "gpt-4": "openai/gpt-4", "gpt-4o": "openai/gpt-4o", "gpt-4o-mini": "openai/gpt-4o-mini", "sonnet": "anthropic/claude-sonnet-4-6", "sonnet-4.5": "anthropic/claude-sonnet-4-5-20250929", "haiku": "anthropic/claude-haiku-4-5-20251001", "opus": "anthropic/claude-opus-4-6", "gemini": "google/gemini-2.0-flash", "gemini-flash": "google/gemini-2.0-flash", "gemini-pro": "google/gemini-2.0-pro", }