Files
majordomo/registry.go
T
steve dcd004289f feat: foundations — canonical types, Parse grammar, env DSNs, health, chains
Phase 1 of the majordomo build:
- llm/ canonical contract (messages, parts, tools, capabilities, streaming,
  Model/Provider, error classification)
- health/ clock-injected tracker (threshold bench, exponential capped
  cooldown, reset-on-success)
- root Registry + Parse (verbatim model ids, inline recursive alias
  expansion with cycle detection, chain dedup), LLM_* env-DSN providers
  (go-llm parity: lazy fallback + eager LoadEnv), health-aware chain
  executor behind the Model interface
- provider/fake scriptable test provider; hermetic test suite incl. the
  trailing-thinking chain and foreman:// env loading
- ADRs 0001-0008, CLAUDE.md, README (honest matrix), CI workflow,
  docs/phase-1-design.md

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 12:35:34 +02:00

228 lines
6.8 KiB
Go

package majordomo
import (
"fmt"
"os"
"strings"
"sync"
"time"
"gitea.stevedudenhoeffer.com/steve/majordomo/health"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// Registry owns providers, aliases/tiers, env-DSN scheme factories, the
// model health tracker, and Parse. It is safe for concurrent use.
//
// Construct it with New, which allocates every map below; the methods
// visible in this file assume the maps are non-nil.
type Registry struct {
// mu guards providers, aliases, schemes, and envErrs. The methods visible
// in this file take it before reading or writing any of those maps.
mu sync.RWMutex
// providers maps a provider's registry name (its Name()) to the instance.
providers map[string]llm.Provider
// aliases maps a bare alias name to the spec it expands to.
aliases map[string]string
// schemes maps an env-DSN scheme to the factory that builds its provider.
schemes map[string]SchemeFactory
// envErrs records LLM_* entries that failed to load so the failure
// surfaces when (and only when) that provider name is actually used.
envErrs map[string]error
// tracker is the health tracker created in New and returned by Health.
tracker *health.Tracker
// chainCfg holds the failover-chain settings captured by New.
chainCfg ChainConfig
// envLookup resolves one env var by key for lazy LLM_* resolution; New
// defaults it to os.Getenv and WithEnvLookup overrides it.
envLookup func(string) string
}
// SchemeFactory builds a provider instance from an env DSN. name is the
// registry name the provider will be registered under (e.g. "m1" for
// LLM_M1); dsn carries the scheme, credential, and host. It returns the
// built provider, or an error when no provider could be built.
// Factories are installed per scheme with Registry.RegisterScheme.
type SchemeFactory func(name string, dsn DSN) (llm.Provider, error)
// ChainConfig tunes failover-chain execution. The zero value is usable:
// it selects DefaultTransientRetries same-target retries, fail-fast on
// permanent errors, and llm.Classify as the error classifier.
type ChainConfig struct {
// TransientRetries is the number of immediate same-target retries after
// a transient error. 0 selects the default (1); negative disables
// retries.
TransientRetries int
// AdvanceOnPermanent, when true, makes the chain advance to the next
// element on permanent errors other than model-not-found instead of
// returning immediately. Model-not-found always advances (without
// penalizing health); auth/malformed errors default to fail-fast
// because failing over cannot help a bad request.
AdvanceOnPermanent bool
// Classify overrides the default error classifier (llm.Classify).
// A nil value keeps the default.
Classify func(error) llm.ErrorClass
}
// DefaultTransientRetries is the default number of same-target retries
// after a single transient error. It is the value used when
// ChainConfig.TransientRetries is left at 0.
const DefaultTransientRetries = 1
// retries reports the effective number of same-target retries: a negative
// TransientRetries disables retrying (0), an unset value (0) selects
// DefaultTransientRetries, and any positive value is used as given.
func (c ChainConfig) retries() int {
	n := c.TransientRetries
	if n < 0 {
		return 0
	}
	if n == 0 {
		return DefaultTransientRetries
	}
	return n
}
// classify maps err to an error class, using llm.Classify unless the
// configuration supplies its own Classify function.
func (c ChainConfig) classify(err error) llm.ErrorClass {
	if c.Classify == nil {
		return llm.Classify(err)
	}
	return c.Classify(err)
}
// registryConfig accumulates the settings that RegistryOption values
// apply before New builds the Registry.
type registryConfig struct {
// health is passed to health.NewTracker by New.
health health.Config
// chain becomes the Registry's failover-chain configuration.
chain ChainConfig
// envLookup resolves a single env var by key; New defaults it to
// os.Getenv.
envLookup func(string) string
// environ returns the environment as "KEY=VALUE" entries for the eager
// LLM_* scan; New defaults it to os.Environ.
environ func() []string
// skipEnv disables the eager LLM_* scan in New when true.
skipEnv bool
}
// RegistryOption configures New. Each option mutates the pending
// registryConfig; New applies options in the order they are passed, so a
// later option overrides an earlier one that sets the same field.
type RegistryOption func(*registryConfig)
// WithHealthConfig replaces the whole health tracker configuration
// (thresholds, cooldowns, clock). Because the entire struct is replaced,
// applying it after WithClock discards the clock WithClock injected.
func WithHealthConfig(cfg health.Config) RegistryOption {
	apply := func(rc *registryConfig) {
		rc.health = cfg
	}
	return apply
}
// WithChainConfig replaces the failover-chain settings (retry count,
// permanent-error policy, classifier) the Registry will use.
func WithChainConfig(cfg ChainConfig) RegistryOption {
	apply := func(rc *registryConfig) {
		rc.chain = cfg
	}
	return apply
}
// WithClock sets the clock used by the health tracker. Tests pass a fake
// clock so they can step through backoff windows deterministically.
func WithClock(clock func() time.Time) RegistryOption {
	return func(rc *registryConfig) {
		rc.health.Clock = clock
	}
}
// WithEnvLookup sets the function used to read a single env var when an
// LLM_* provider is resolved lazily during Parse. The default is
// os.Getenv; tests supply their own lookup so the process environment is
// never touched.
func WithEnvLookup(lookup func(string) string) RegistryOption {
	return func(rc *registryConfig) {
		rc.envLookup = lookup
	}
}
// WithoutEnvProviders turns off the eager LLM_* scan New performs at
// construction time. Lazy per-name resolution during Parse is unaffected
// (combine with WithEnvLookup to control it in tests).
func WithoutEnvProviders() RegistryOption {
	disable := func(rc *registryConfig) {
		rc.skipEnv = true
	}
	return disable
}
// New creates a Registry with all built-in providers and scheme factories
// registered, then loads LLM_* env-DSN providers from the process
// environment (unless WithoutEnvProviders is given). Malformed LLM_* entries
// do not fail construction; the error surfaces if that provider name is
// referenced in Parse.
//
// Options are applied in the order given; nil options are ignored. If an
// option leaves the env lookup or the environment source nil (for example
// WithEnvLookup(nil)), New falls back to os.Getenv / os.Environ so that
// later lazy resolution cannot call a nil function.
func New(opts ...RegistryOption) *Registry {
	cfg := registryConfig{
		envLookup: os.Getenv,
		environ:   os.Environ,
	}
	for _, opt := range opts {
		if opt == nil {
			// Tolerate conditionally-built option lists that contain nil.
			continue
		}
		opt(&cfg)
	}
	// An option may have overwritten a hook with nil; restore the default
	// rather than deferring a nil-call panic to the first lazy lookup.
	if cfg.envLookup == nil {
		cfg.envLookup = os.Getenv
	}
	if cfg.environ == nil {
		cfg.environ = os.Environ
	}

	r := &Registry{
		providers: make(map[string]llm.Provider),
		aliases:   make(map[string]string),
		schemes:   make(map[string]SchemeFactory),
		envErrs:   make(map[string]error),
		tracker:   health.NewTracker(cfg.health),
		chainCfg:  cfg.chain,
		envLookup: cfg.envLookup,
	}
	registerBuiltins(r)

	if !cfg.skipEnv {
		entries := cfg.environ()
		env := make(map[string]string, len(entries))
		for _, kv := range entries {
			// Entries without "=" carry no value and are skipped.
			if k, v, ok := strings.Cut(kv, "="); ok {
				env[k] = v
			}
		}
		// Errors are recorded per-name and surfaced on use; see envErrs.
		_ = r.LoadEnv(env)
	}
	return r
}
// RegisterProvider stores p in the registry keyed by p.Name(), replacing
// any provider previously registered under that name.
func (r *Registry) RegisterProvider(p llm.Provider) {
	r.mu.Lock()
	defer r.mu.Unlock()

	key := p.Name()
	r.providers[key] = p
}
// RegisterAlias binds a bare name (no slash) to spec, replacing any
// earlier binding for that name. spec may be a single target, another
// alias, or a comma-separated chain; Parse expands aliases inline and
// recursively, with cycle detection.
func (r *Registry) RegisterAlias(name, spec string) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.aliases[name] = spec
}
// RegisterScheme installs factory as the env-DSN scheme factory for
// scheme, replacing any factory already registered for it. This is how
// consumers wire custom provider kinds into LLM_* definitions.
func (r *Registry) RegisterScheme(scheme string, factory SchemeFactory) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.schemes[scheme] = factory
}
// Provider looks up an already-registered provider by name. The boolean
// reports whether one was found. Only the registry map is consulted here.
func (r *Registry) Provider(name string) (llm.Provider, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	found, present := r.providers[name]
	return found, present
}
// Health returns the registry's health tracker (read-mostly; useful for
// diagnostics and tests).
func (r *Registry) Health() *health.Tracker {
	return r.tracker
}
// providerFor resolves a provider name in three steps:
//
//  1. a provider already in the registry wins;
//  2. otherwise a recorded env-load error for that name is returned;
//  3. otherwise the name is resolved lazily from the environment
//     (go-llm parity: "m5" → env LLM_M5, "my-prov" → LLM_MY_PROV).
//
// An empty env value yields an error wrapping ErrUnknownProvider; a DSN
// that cannot be turned into a provider yields an error naming the env
// var. A lazily built provider is cached in the registry.
func (r *Registry) providerFor(name string) (llm.Provider, error) {
	// Snapshot both maps under the read lock, then release it so the DSN
	// work below runs without holding the mutex.
	r.mu.RLock()
	registered, found := r.providers[name]
	loadErr := r.envErrs[name]
	r.mu.RUnlock()

	switch {
	case found:
		return registered, nil
	case loadErr != nil:
		return nil, loadErr
	}

	envKey := "LLM_" + strings.ToUpper(strings.ReplaceAll(name, "-", "_"))
	rawDSN := r.envLookup(envKey)
	if rawDSN == "" {
		return nil, fmt.Errorf("%w: %q (checked registry and %s env var)", ErrUnknownProvider, name, envKey)
	}

	built, err := r.providerFromDSN(name, rawDSN)
	if err != nil {
		return nil, fmt.Errorf("parse %s: %w", envKey, err)
	}

	r.mu.Lock()
	defer r.mu.Unlock()
	// A concurrent caller may have registered the name while the lock was
	// released; the first registration is kept and ours is dropped.
	if winner, raced := r.providers[name]; raced {
		return winner, nil
	}
	r.providers[name] = built
	return built, nil
}