executus/model/llms.go

// Package model is executus's config-driven model-access layer over majordomo: it owns the
// package-level *majordomo.Registry (providers with mort's env keys,
// OpenAI-compat presets, lane-aware decoration, the DB-backed tier
// resolver, legacy shortcut aliases, the foreman timeout decorator, and
// failover/health wiring), plus the mort-facing call helpers
// (ParseModelRequest / ParseModelForContext / GenerateWith /
// CallAndExecute / SimpleCall) and usage/trace recording.
//
// The ":low/:medium/:high" reasoning-suffix dialect is an executus convenience:
// majordomo treats model ids as verbatim, so this package strips the
// suffix from specs and tier values and re-applies it per request via
// llm.WithReasoningEffort on a wrapping Model.
package model

import (
	"context"
	"fmt"
	"os"
	"strings"
	"sync"
	"time"

	majordomo "gitea.stevedudenhoeffer.com/steve/majordomo"
	"gitea.stevedudenhoeffer.com/steve/majordomo/health"
	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/anthropic"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/google"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai"
)

// Usage and trace recording live in sink.go: SetUsageSink / SetTraceSink
// install the host seams, and ParseModelForContext stamps the model name on
// the context (via WithModel) for attribution.

// ---------------------------------------------------------------------------
// Package registry
// ---------------------------------------------------------------------------

// buildConfig carries the knobs Wire feeds into buildRegistry. The zero
// value yields a lane-less registry with majordomo's default failover
// behavior — the bootstrap state tests and pre-Wire code paths run on.
type buildConfig struct {
	// lanes is the lane registry handed to wrapProviderForLane for every
	// provider buildRegistry registers (built-ins, presets and DSN-defined
	// providers alike). Left at its zero value the registry is lane-less.
	lanes LaneRegistry

	// maxRetries maps the llms.failover.max_retries convar onto
	// ChainConfig.TransientRetries. <= 0 keeps majordomo's default (1).
	maxRetries int

	// cooldown maps the llms.failover.cooldown_seconds convar onto
	// health.Config.BaseCooldown. <= 0 keeps the mort default (300s).
	// Note majordomo grows the cooldown exponentially from this base;
	// MaxCooldown is set to max(cooldown, 5m) so the operator dial
	// dominates (a 10m base never gets capped below itself).
	cooldown time.Duration

	// observer receives one event per failover decision (failed attempt,
	// bench, benched-skip). Typically failoverlog.NewObserver(...).
	// It is passed through unchanged as ChainConfig.Observer.
	observer func(majordomo.FailoverEvent)
}

// defaultFailoverCooldown matches the historical llms.failover.cooldown_seconds
// convar default (300s). buildRegistry substitutes it whenever
// buildConfig.cooldown is zero or negative.
const defaultFailoverCooldown = 300 * time.Second

// Package-level registry state. registryMu guards every read and write of
// registry (see Registry and setRegistry). registry is initialised at
// package load from the zero buildConfig, so the package is usable before
// setRegistry installs a configured registry.
var (
	registryMu sync.RWMutex
	registry   = buildRegistry(buildConfig{})
)

// Registry hands back the majordomo registry currently installed for the
// package. It exists mainly for admin surfaces (health/bench) and for tests
// that substitute providers; ordinary callers should go through
// ParseModelRequest / ParseModelForContext instead.
func Registry() *majordomo.Registry {
	// Snapshot the pointer under the read lock, then release before returning.
	registryMu.RLock()
	current := registry
	registryMu.RUnlock()

	return current
}

// Health exposes the health tracker that belongs to the currently installed
// registry — the live source of truth for benched models. The `.failover`
// commands and the failover web UI read it (see ListBenched/BenchModel/
// UnbenchModel for the mort-flavored facade).
func Health() *health.Tracker {
	current := Registry()
	return current.Health()
}

// setRegistry installs r as the package registry. Whatever bench/backoff
// state the previous registry held goes away with it — Wire is a boot-time
// operation, so nothing is carried over.
func setRegistry(r *majordomo.Registry) {
	registryMu.Lock()
	registry = r
	registryMu.Unlock()
}

// buildRegistry constructs a fully-wired majordomo registry:
//
//   - health/chain config from the failover convars (via cfg),
//   - mort's providers under their nonstandard env keys (OPENAI_KEY,
//     GOOGLE_GEMINI_API_KEY, ...), every one lane-decorated,
//   - OpenAI-compat presets (deepseek, moonshot+kimi, xai+grok, groq),
//   - scheme factories for LLM_* env DSNs re-registered so DSN-defined
//     providers (m1, arbitrary foreman targets) are lane-decorated too,
//     with foreman additionally getting the 30-minute model timeout,
//   - the legacy shortcut aliases, and
//   - the delegating tier resolver (reads defaultResolver at Resolve
//     time, so Init() can swap in the DB-backed resolver later).
//
// Ordering matters: the scheme factories are registered before the LLM_*
// environment is loaded, so env-defined providers are built by the
// decorated factories rather than the stock ones.
func buildRegistry(cfg buildConfig) *majordomo.Registry {
	// An unset (or non-positive) cooldown falls back to the historical
	// default.
	cooldown := cfg.cooldown
	if cooldown <= 0 {
		cooldown = defaultFailoverCooldown
	}
	// The ceiling is never below five minutes and never below the base
	// itself, i.e. max(cooldown, 5m).
	maxCooldown := cooldown
	if maxCooldown < 5*time.Minute {
		maxCooldown = 5 * time.Minute
	}

	r := majordomo.New(
		// Env DSNs are loaded manually below, AFTER the scheme factories
		// are overridden — New()'s eager scan would otherwise build
		// LLM_*-defined providers with the stock (un-decorated) factories.
		majordomo.WithoutEnvProviders(),
		majordomo.WithHealthConfig(health.Config{
			BaseCooldown: cooldown,
			MaxCooldown:  maxCooldown,
		}),
		majordomo.WithChainConfig(majordomo.ChainConfig{
			TransientRetries: cfg.maxRetries,
			// legacy gollm failed over on request-specific errors (400/413/422)
			// without benching; majordomo fails fast on permanent errors by
			// default. AdvanceOnPermanent preserves the availability-first
			// behavior mort's executors rely on.
			AdvanceOnPermanent: true,
			Observer:           cfg.observer,
		}),
	)

	// wrap applies the lane decorator, with the default lane execution
	// timeout, to a provider. Everything registered below except foreman
	// goes through it.
	wrap := func(p llm.Provider) llm.Provider {
		return wrapProviderForLane(p, cfg.lanes, defaultLaneExecTimeout)
	}

	// Core providers with mort's env keys.
	r.RegisterProvider(wrap(openai.New(
		openai.WithAPIKey(os.Getenv("OPENAI_KEY")),
	)))
	r.RegisterProvider(wrap(anthropic.New(
		anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY")),
	)))
	r.RegisterProvider(wrap(google.New(
		google.WithAPIKey(os.Getenv("GOOGLE_GEMINI_API_KEY")),
	)))
	r.RegisterProvider(wrap(localOllamaProvider()))
	// ollama.Cloud reads OLLAMA_API_KEY itself; with the key unset the
	// provider still registers and errors clearly at call time (parity
	// with the previous behavior).
	r.RegisterProvider(wrap(ollama.Cloud()))

	// OpenAI-compatible presets. Base URLs mirror legacy gollm's defaults.
	// Each preset is an openai provider registered under its own name with
	// its own base URL and API-key env var; "kimi" and "grok" reuse the
	// moonshot and xai endpoints/keys respectively.
	for _, preset := range []struct {
		name, baseURL, envKey string
	}{
		{"deepseek", "https://api.deepseek.com/v1", "DEEPSEEK_API_KEY"},
		{"moonshot", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"},
		{"kimi", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, // alias provider for moonshot
		{"xai", "https://api.x.ai/v1", "XAI_API_KEY"},
		{"grok", "https://api.x.ai/v1", "XAI_API_KEY"}, // alias provider for xai
		{"groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"},
	} {
		r.RegisterProvider(wrap(openai.New(
			openai.WithName(preset.name),
			openai.WithBaseURL(preset.baseURL),
			openai.WithAPIKey(os.Getenv(preset.envKey)),
		)))
	}

	// Scheme factories for LLM_* env DSNs. Re-registered so DSN-defined
	// providers go through the lane decorator like the built-ins.
	//
	// foreman targets are slow local LLMs (large model loads, queued
	// behind other requests), so their models additionally get a hard
	// 30-minute timeout and a matching lane execution backstop — the
	// default 5-minute lane backstop would strangle them.
	r.RegisterScheme("foreman", func(name string, dsn majordomo.DSN) (llm.Provider, error) {
		p := ollama.Foreman(dsn.BaseURL(), dsn.Token, ollama.WithName(name))
		// foreman bypasses wrap: it needs its own lane timeout and the
		// model-timeout decorator underneath the lane decorator.
		return wrapProviderForLane(
			withModelTimeout(p, foremanModelTimeout),
			cfg.lanes,
			foremanLaneExecTimeout,
		), nil
	})
	// laneScheme adapts a scheme factory so every provider it builds is
	// passed through wrap; a factory error is returned unchanged.
	laneScheme := func(factory majordomo.SchemeFactory) majordomo.SchemeFactory {
		return func(name string, dsn majordomo.DSN) (llm.Provider, error) {
			p, err := factory(name, dsn)
			if err != nil {
				return nil, err
			}
			return wrap(p), nil
		}
	}
	// One factory serves both the "ollama" and "ollama-cloud" schemes.
	ollamaScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
		return ollama.New(
			ollama.WithName(name),
			ollama.WithBaseURL(dsn.BaseURL()),
			ollama.WithToken(dsn.Token),
		), nil
	})
	r.RegisterScheme("ollama", ollamaScheme)
	r.RegisterScheme("ollama-cloud", ollamaScheme)
	r.RegisterScheme("openai", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
		return openai.New(
			openai.WithName(name),
			openai.WithBaseURL(dsn.BaseURL()),
			openai.WithAPIKey(dsn.Token),
		), nil
	}))
	r.RegisterScheme("anthropic", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
		return anthropic.New(
			anthropic.WithName(name),
			anthropic.WithBaseURL(dsn.BaseURL()),
			anthropic.WithAPIKey(dsn.Token),
		), nil
	}))
	// One factory serves both the "google" and "gemini" schemes.
	googleScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
		return google.New(
			google.WithName(name),
			google.WithBaseURL(dsn.BaseURL()),
			google.WithAPIKey(dsn.Token),
		), nil
	})
	r.RegisterScheme("google", googleScheme)
	r.RegisterScheme("gemini", googleScheme)

	// Eager LLM_* env scan, now with the decorated scheme factories in
	// place. Malformed entries are recorded per-name and surface on use,
	// which is why the aggregate error from LoadEnv is deliberately
	// discarded here. Environment entries without an "=" are skipped.
	env := make(map[string]string)
	for _, kv := range os.Environ() {
		if k, v, ok := strings.Cut(kv, "="); ok {
			env[k] = v
		}
	}
	_ = r.LoadEnv(env)

	// Legacy shortcut aliases (sonnet, haiku, ...). Same strings as the
	// historical table; registered straight from legacyAliasSpecs so the
	// registry and ResolveModelName share one source.
	for name, spec := range legacyAliasSpecs {
		r.RegisterAlias(name, spec)
	}

	// Tier resolver: a delegating closure so Init() and test helpers can
	// swap defaultResolver without rebuilding the registry. The resolver
	// returns specs with the legacy reasoning suffixes already stripped
	// (per chain element); the tier's default reasoning level is applied
	// by ParseModelRequest, not here.
	r.RegisterResolver(majordomo.ResolverFunc(func(name string) (string, bool) {
		res := defaultResolver
		if res == nil {
			return "", false
		}
		// The middle return value (the tier's reasoning level) is dropped
		// on purpose — see above.
		spec, _, ok := res.Resolve(name)
		return spec, ok
	}))

	return r
}

// localOllamaProvider returns the provider for the local Ollama instance.
// A non-empty OLLAMA_BASE_URL (mort's historical env var) overrides the
// base URL; otherwise ollama.Local is left to its own defaults (it honors
// OLLAMA_HOST itself).
func localOllamaProvider() llm.Provider {
	baseURL := os.Getenv("OLLAMA_BASE_URL")
	if baseURL == "" {
		return ollama.Local()
	}
	return ollama.Local(ollama.WithBaseURL(baseURL))
}

// ---------------------------------------------------------------------------
// Spec parsing
// ---------------------------------------------------------------------------

// ParseModelRequest turns a model request string into a ready-to-use Model.
//
// Resolution steps, in order:
//
//   - surrounding whitespace is trimmed; an empty spec means tier "fast"
//   - the legacy ":low/:medium/:high" reasoning suffix is stripped from each
//     chain element (ollama tags like ":30b" or ":cloud" are preserved) and
//     the resulting level is applied to every call via a wrapping model
//   - tier aliases (DB-backed convars): when the caller gave no level, the
//     tier value's own suffix supplies the default level
//   - legacy shortcut aliases (sonnet, haiku, opus, ...)
//   - provider/model lookup and LLM_* env-DSN fallback (majordomo)
//   - comma-separated failover chains with health-tracked bench/backoff
//
// The returned Model is instrumented: token usage from every successful
// Generate is recorded to the package usage recorder automatically. Do
// NOT additionally call RecordUsage on responses from a parsed model.
//
// On failure the error wraps the registry's error and names the (trimmed,
// defaulted) spec.
func ParseModelRequest(spec string) (majordomo.Model, error) {
	requested := strings.TrimSpace(spec)
	if requested == "" {
		requested = "fast"
	}

	target, effort := splitReasoningSpec(requested)

	// No explicit level from the caller: if the suffix-free spec is a tier
	// name, inherit the level carried by the tier's value
	// (e.g. "anthropic/claude-opus-4-6:high").
	if effort == "" && defaultResolver != nil {
		_, tierEffort, isTier := defaultResolver.Resolve(target)
		if isTier {
			effort = tierEffort
		}
	}

	resolved, err := Registry().Parse(target)
	if err != nil {
		return nil, fmt.Errorf("model %q: %w", requested, err)
	}

	// Only wrap when there is a level to apply.
	if effort != "" {
		resolved = &reasoningModel{inner: resolved, level: effort}
	}

	return &instrumentedModel{inner: resolved}, nil
}

// ParseModelForContext is ParseModelRequest plus attribution: on success the
// returned context carries the resolved model name (stamped via WithModel,
// using ResolveModelName on the original request) so usage tracking can key
// on it. On failure the input context is returned unchanged together with a
// nil model and the parse error. Prefer this over bare ParseModelRequest in
// all new code.
func ParseModelForContext(ctx context.Context, req string) (context.Context, majordomo.Model, error) {
	parsed, err := ParseModelRequest(req)
	if err != nil {
		return ctx, nil, err
	}

	stamped := WithModel(ctx, ResolveModelName(req))
	return stamped, parsed, nil
}

// reasoningModel decorates a model with a default reasoning effort: any
// request that arrives without an effort of its own gets level filled in.
// It is how mort's legacy ":low/:medium/:high" suffix dialect is honored,
// since majordomo treats model ids as verbatim (no suffix stripping).
type reasoningModel struct {
	inner llm.Model // the decorated model every call is forwarded to
	level string    // effort applied when the request carries none
}

// withDefaultEffort folds the call options into req and, when the result
// still has no reasoning effort, fills in the wrapper's level. An effort set
// by the request or by an option always wins.
func (m *reasoningModel) withDefaultEffort(req llm.Request, opts []llm.Option) llm.Request {
	merged := req.Apply(opts...)
	if merged.ReasoningEffort == "" {
		merged.ReasoningEffort = m.level
	}
	return merged
}

// Generate forwards to the inner model with the default effort applied. The
// options are folded into the request first, so none are passed along.
func (m *reasoningModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
	return m.inner.Generate(ctx, m.withDefaultEffort(req, opts))
}

// Stream is the streaming counterpart of Generate, with the same
// default-effort handling.
func (m *reasoningModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
	return m.inner.Stream(ctx, m.withDefaultEffort(req, opts))
}

// Capabilities reports the inner model's capabilities unchanged.
func (m *reasoningModel) Capabilities() llm.Capabilities {
	return m.inner.Capabilities()
}

// ---------------------------------------------------------------------------
// Reasoning-suffix dialect
// ---------------------------------------------------------------------------

// reasoningLevels holds the legacy suffix values that count as a reasoning
// level. Matching is exact (case-sensitive).
var reasoningLevels = map[string]bool{
	"low":    true,
	"medium": true,
	"high":   true,
}

// splitReasoning separates a trailing ":low" / ":medium" / ":high" from a
// single model request string, returning (base, level). Only the text after
// the LAST colon is examined; when it is not a recognized level — or there
// is no colon at all — the input comes back untouched with an empty level,
// so ollama tags (":30b", ":q4_K_M") and date stamps pass through intact.
func splitReasoning(s string) (string, string) {
	if colon := strings.LastIndexByte(s, ':'); colon >= 0 {
		candidate := s[colon+1:]
		if reasoningLevels[candidate] {
			return s[:colon], candidate
		}
	}
	return s, ""
}

// splitReasoningSpec removes the legacy reasoning suffix from each element
// of a spec that may be a comma-separated chain, returning the cleaned spec
// and a single level. Every element is whitespace-trimmed; elements without
// a suffix are otherwise unchanged. Because a majordomo chain carries one
// request-level reasoning effort rather than one per target, the level
// returned is the first non-empty one encountered, scanning from the head.
func splitReasoningSpec(spec string) (string, string) {
	elems := strings.Split(spec, ",")
	if len(elems) == 1 {
		// Not a chain: a single element, handled directly.
		return splitReasoning(strings.TrimSpace(spec))
	}

	var chosen string
	cleaned := make([]string, len(elems))
	for i := range elems {
		base, lvl := splitReasoning(strings.TrimSpace(elems[i]))
		cleaned[i] = base
		if chosen == "" {
			chosen = lvl
		}
	}
	return strings.Join(cleaned, ","), chosen
}

// ---------------------------------------------------------------------------
// Usage-attribution name resolution
// ---------------------------------------------------------------------------

// ResolveModelName returns the model portion of a request string for usage
// attribution (keyed on model name, not provider or reasoning level).
//
// The request is first normalized: surrounding whitespace is trimmed, a
// comma-separated failover chain is reduced to its head (preferred)
// element, and a ":low/:medium/:high" reasoning suffix is stripped. The
// normalized request is then resolved in this order:
//
//   - tier alias (an empty request counts as "fast"): the model portion of
//     the first entry of the tier's current value;
//   - "provider/model": everything after the first slash;
//   - legacy shortcut alias (sonnet, haiku, ...): the model portion of its
//     spec;
//   - anything else: the normalized request itself.
func ResolveModelName(req string) string {
	// modelPart drops the leading "provider/" from a spec, if present.
	modelPart := func(spec string) string {
		if _, model, ok := strings.Cut(spec, "/"); ok {
			return model
		}
		return spec
	}

	// Attribute a caller-supplied failover chain to its head element, the
	// same way a tier-supplied chain is attributed below. Without this the
	// whole chain string — including the head's reasoning suffix, which
	// splitReasoning cannot see past the comma — would end up in the usage
	// key. Trimming mirrors ParseModelRequest, which trims its spec too.
	req = strings.TrimSpace(req)
	if head, _, isChain := strings.Cut(req, ","); isChain {
		req = strings.TrimSpace(head)
	}

	// Strip any reasoning-level suffix before resolving — the level is a
	// per-request setting, not part of the model identity.
	req, _ = splitReasoning(req)

	// Tier expansion: when the request is a tier alias, fold it through the
	// resolver and return the model portion of its current convar value. The
	// empty string is treated as "fast" for compatibility with callers that
	// pre-resolution defaulted to fast.
	if defaultResolver != nil {
		key := req
		if key == "" {
			key = "fast"
		}
		if spec, _, ok := defaultResolver.Resolve(key); ok && spec != "" {
			// A tier may resolve to a comma-separated failover chain.
			// Attribute usage to the first (preferred) entry's model name
			// rather than the whole chain string.
			if head, _, isChain := strings.Cut(spec, ","); isChain {
				spec = strings.TrimSpace(head)
			}
			return modelPart(spec)
		}
	}

	// For non-tier requests, return the model portion after the slash.
	// Static aliases are NOT expanded here beyond the legacy table below:
	// callers that went through ParseModelRequest already carry the
	// concrete spec.
	if strings.Contains(req, "/") {
		return modelPart(req)
	}

	// Legacy shortcut fallback: callers that pass bare names like "sonnet"
	// to ResolveModelName (without going through ParseModelRequest) still
	// need the concrete model name for usage keys.
	if spec, ok := legacyAliasSpecs[req]; ok {
		return modelPart(spec)
	}

	return req
}

// legacyAliasSpecs maps legacy shortcut names to their full provider/model
// spec. Registered with the registry as static aliases (buildRegistry
// iterates this table) AND consulted by ResolveModelName for bare-name
// usage attribution, so both always see the same strings.
//
// Several shortcuts intentionally point at the same spec (e.g. "openai" and
// "gpt-4o-mini", "gemini" and "gemini-flash").
var legacyAliasSpecs = map[string]string{
	"openai":       "openai/gpt-4o-mini",
	"gpt-4":        "openai/gpt-4",
	"gpt-4o":       "openai/gpt-4o",
	"gpt-4o-mini":  "openai/gpt-4o-mini",
	"sonnet":       "anthropic/claude-sonnet-4-6",
	"sonnet-4.5":   "anthropic/claude-sonnet-4-5-20250929",
	"haiku":        "anthropic/claude-haiku-4-5-20251001",
	"opus":         "anthropic/claude-opus-4-6",
	"gemini":       "google/gemini-2.0-flash",
	"gemini-flash": "google/gemini-2.0-flash",
	"gemini-pro":   "google/gemini-2.0-pro",
}