P1: model layer (convar->config inversion) + llmmeta

Lifts mort's pkg/logic/llms into executus/model, decoupled from mort:

- tiers.go: the tier resolver now reads a host-supplied config.Source under "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults, ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the resolution mechanism (cache, reasoning-suffix dialect, chain validation) is generic. No tier names hard-coded in the harness.
- sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into UsageSink / TraceSink seams + a model-owned Span, with nil-safe context attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser). Both sinks optional (nil = off) so a light host records nothing.
- lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf.
- call.go keeps GenerateWith[T] (instrumented structured output) — this is the structured-output primitive; no separate structured/ package.
- llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry + ledger). Its tests configure a minimal tier table via TestMain.

New tests cover the inversion: config overrides fallback, tier registration, reasoning-suffix survival, nested-tier rejection, nil-sink no-ops.

Full module: go build/vet/test -race green; core go.sum still free of gorm/redis/discordgo/sqlite.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 19:47:13 -04:00
parent 741d7816ed
commit b424261aca
17 changed files with 3698 additions and 3 deletions
@@ -0,0 +1,453 @@
+// V15.4 — Ollama Cloud dynamic context-length sync.
+//
+// Why: the static map in context_limits.go has to be hand-maintained
+// for every new Ollama Cloud model. Cloud ships new models monthly,
+// and a missing entry silently disables compaction for runs on that
+// model (compactionThresholdForModel returns 0 on MaxContextTokens
+// miss). Dynamic sync removes the maintenance burden and means new
+// cloud models work out-of-the-box.
+//
+// How: at boot, mort kicks off a CloudOllamaLimitCache.RefreshAll in a
+// background goroutine. RefreshAll calls /api/tags to list every
+// available cloud model, then concurrently calls /api/show for each
+// to extract `<family>.context_length` from the response's model_info
+// map. The cache is consulted by the executor's
+// compactionThresholdForModel via the cache-aware
+// MaxContextTokensWithCache helper.
+//
+// Periodic refresh: StartPeriodicRefresh re-runs RefreshAll on a
+// ticker so newly released models surface without a mort restart. The
+// interval is an argument to StartPeriodicRefresh (non-positive values
+// fall back to 24h); daily is the intended cadence — cloud model
+// context lengths don't change for a given tag (only the tag pointer
+// can move, e.g. :cloud → larger model), so daily is conservative.
+
+package model
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+)
+
+// defaultCloudEndpoint is the public Ollama Cloud base URL, used when
+// NewCloudOllamaLimitCache is given a blank endpoint (tests pass their own).
+const defaultCloudEndpoint = "https://ollama.com"
+
+// CloudOllamaLimitCache holds context-length values for Ollama Cloud
+// models, populated dynamically via /api/tags + /api/show. Construct
+// with NewCloudOllamaLimitCache. Safe for concurrent use.
+//
+// Empty when no API key is configured — RefreshAll returns an error,
+// LookupOrFetch skips its live fetch, and lookups return (0, false) so
+// callers fall back to the static map / disabled compaction.
+type CloudOllamaLimitCache struct {
+	mu       sync.RWMutex
+	limit    map[string]int
+	negative map[string]time.Time // model → fetch-failure time (for TTL)
+
+	endpoint   string
+	apiKey     string
+	httpClient *http.Client
+
+	// refreshConcurrency caps the number of concurrent /api/show calls
+	// during RefreshAll. Default 8 — enough to finish a ~50-model
+	// catalog in well under a minute without hammering Cloud.
+	refreshConcurrency int
+
+	// negativeTTL is how long a failed LookupOrFetch fetch is remembered
+	// before the model is retried. Prevents hammering Cloud on a typo or
+	// recently-removed model. Default 10 minutes.
+	negativeTTL time.Duration
+}
+
+// NewCloudOllamaLimitCache constructs a fresh, empty cache.
+//
+// endpoint is the Ollama Cloud base URL. Surrounding whitespace and
+// trailing slashes are removed; if nothing is left the public default
+// (https://ollama.com) is used. apiKey may be empty — RefreshAll then
+// returns an error and the cache stays empty. httpClient defaults to a
+// client with a 15s timeout when nil.
+func NewCloudOllamaLimitCache(endpoint, apiKey string, httpClient *http.Client) *CloudOllamaLimitCache {
+	// Normalise once, up front, so a padded value such as
+	// " https://host/ " cannot leak whitespace into request URLs and a
+	// bare "/" does not collapse into an empty base URL.
+	endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
+	if endpoint == "" {
+		endpoint = defaultCloudEndpoint
+	}
+	if httpClient == nil {
+		httpClient = &http.Client{Timeout: 15 * time.Second}
+	}
+	return &CloudOllamaLimitCache{
+		limit:              make(map[string]int),
+		negative:           make(map[string]time.Time),
+		endpoint:           endpoint,
+		apiKey:             apiKey,
+		httpClient:         httpClient,
+		refreshConcurrency: 8,
+		negativeTTL:        10 * time.Minute,
+	}
+}
+
+// SetNegativeTTL replaces the negative-cache lifetime. A nil receiver
+// or a negative duration leaves the cache untouched. Tests use this to
+// control retry behaviour without sleeping.
+func (c *CloudOllamaLimitCache) SetNegativeTTL(d time.Duration) {
+	if c != nil && d >= 0 {
+		c.mu.Lock()
+		defer c.mu.Unlock()
+		c.negativeTTL = d
+	}
+}
+
+// Lookup reports the cached context length for an Ollama Cloud model
+// (e.g. "qwen3.5:cloud", "qwen3-coder:480b"), or (0, false) when the
+// model is not cached. It never performs HTTP calls — this is the hot
+// path consulted by the executor before every run.
+//
+// modelName may be the bare model:tag form or carry the
+// "ollama-cloud/" prefix; the prefix is stripped before the lookup.
+func (c *CloudOllamaLimitCache) Lookup(modelName string) (int, bool) {
+	if c == nil {
+		return 0, false
+	}
+	name := stripCloudPrefix(modelName)
+	if name == "" {
+		return 0, false
+	}
+	c.mu.RLock()
+	n, found := c.limit[name]
+	c.mu.RUnlock()
+	return n, found
+}
+
+// Size reports how many positive entries the cache currently holds
+// (0 for a nil receiver). Useful for logging / health checks.
+func (c *CloudOllamaLimitCache) Size() int {
+	if c == nil {
+		return 0
+	}
+	c.mu.RLock()
+	count := len(c.limit)
+	c.mu.RUnlock()
+	return count
+}
+
+// LookupOrFetch returns the cached context length OR, on miss, makes a
+// single /api/show call to populate the cache. Negative results
+// (model not found, /api/show returns no context_length) are cached
+// for negativeTTL to prevent hammering Cloud on a typo. Returns
+// (0, false) when the model cannot be resolved and (size, true) on
+// any successful resolve.
+//
+// A failure observed while the caller's ctx is already cancelled or
+// past its deadline is NOT negative-cached: it says nothing about the
+// model, and remembering it would switch the model off for the whole
+// TTL just because one run was aborted.
+//
+// Why this exists: Ollama Cloud's /api/tags lists canonical model
+// names only (e.g. "qwen3.5:397b") but accepts aliases on /api/show
+// (e.g. "qwen3.5:cloud" → same 397b model). The boot-time RefreshAll
+// only sees the canonical names, so common aliases miss the cache.
+// LookupOrFetch fills the gap.
+//
+// The cache is therefore self-healing: any unknown model gets one
+// live /api/show call, the result lands in the cache, and subsequent
+// runs hit immediately. RefreshAll never prunes, so lazily fetched
+// aliases linger as positive entries across refreshes.
+func (c *CloudOllamaLimitCache) LookupOrFetch(ctx context.Context, modelName string) (int, bool) {
+	if c == nil {
+		return 0, false
+	}
+	key := stripCloudPrefix(modelName)
+	if key == "" {
+		return 0, false
+	}
+
+	// Snapshot everything needed under one short read lock.
+	c.mu.RLock()
+	cached, hit := c.limit[key]
+	failedAt, failed := c.negative[key]
+	ttl := c.negativeTTL
+	c.mu.RUnlock()
+
+	// Fast path: positive hit.
+	if hit {
+		return cached, true
+	}
+	// Recent failure: don't retry until the negative entry expires.
+	if failed && time.Since(failedAt) < ttl {
+		return 0, false
+	}
+	// No API key configured → can't fetch. Don't write a negative
+	// entry (when the key gets configured later we want the next call
+	// to re-try immediately).
+	if strings.TrimSpace(c.apiKey) == "" {
+		return 0, false
+	}
+
+	// Slow path: live /api/show.
+	n, err := c.fetchContextLength(ctx, key)
+	if err != nil || n <= 0 {
+		slog.Debug("cloud limit cache: lazy fetch miss",
+			"model", key, "err", err)
+		// A cancelled or expired caller context is not evidence about
+		// the model; leave the negative cache alone so the next run
+		// retries straight away.
+		if ctx != nil && ctx.Err() != nil {
+			return 0, false
+		}
+		c.mu.Lock()
+		c.negative[key] = time.Now()
+		c.mu.Unlock()
+		return 0, false
+	}
+
+	// Store under exactly the key that was looked up (no second
+	// normalisation pass) and drop any stale failure marker.
+	c.mu.Lock()
+	c.limit[key] = n
+	delete(c.negative, key)
+	c.mu.Unlock()
+	slog.Info("cloud limit cache: lazy fetch hit", "model", key, "context_length", n)
+	return n, true
+}
+
+// set records n as the context length for modelName (prefix-stripped).
+// It does nothing for a nil receiver, n <= 0, or a blank model name.
+func (c *CloudOllamaLimitCache) set(modelName string, n int) {
+	if c == nil || n <= 0 {
+		return
+	}
+	if name := stripCloudPrefix(modelName); name != "" {
+		c.mu.Lock()
+		defer c.mu.Unlock()
+		c.limit[name] = n
+	}
+}
+
+// RefreshAll queries /api/tags then concurrently calls /api/show for
+// every listed model, populating the cache. It returns the number of
+// models successfully cached.
+//
+// Errors: a nil receiver, a missing API key, or a /api/tags failure
+// abort with (0, err). Individual /api/show failures are logged at
+// debug level and do not abort the refresh. If ctx is cancelled or
+// expires mid-refresh, dispatching stops, in-flight calls are awaited,
+// and the count cached so far is returned together with the wrapped
+// context error.
+//
+// Safe to call repeatedly. Cache entries are overwritten with the
+// fresh values; entries for models that have been removed from Cloud
+// are NOT pruned (cheap to keep; pruning risks dropping an entry just
+// before a run that needs it).
+func (c *CloudOllamaLimitCache) RefreshAll(ctx context.Context) (int, error) {
+	if c == nil {
+		return 0, fmt.Errorf("cloud limit cache: nil receiver")
+	}
+	if strings.TrimSpace(c.apiKey) == "" {
+		return 0, fmt.Errorf("cloud limit cache: OLLAMA_API_KEY unset")
+	}
+	tags, err := c.fetchTags(ctx)
+	if err != nil {
+		return 0, fmt.Errorf("cloud limit cache: /api/tags: %w", err)
+	}
+
+	concurrency := c.refreshConcurrency
+	if concurrency <= 0 {
+		concurrency = 8
+	}
+	// sem bounds the number of in-flight /api/show calls.
+	sem := make(chan struct{}, concurrency)
+	var (
+		wg      sync.WaitGroup
+		mu      sync.Mutex // guards success
+		success int
+	)
+dispatch:
+	for _, name := range tags {
+		// Stop handing out work once the caller has given up, rather
+		// than spawning one doomed request per remaining model.
+		if ctx.Err() != nil {
+			break
+		}
+		select {
+		case sem <- struct{}{}:
+		case <-ctx.Done():
+			break dispatch
+		}
+		wg.Add(1)
+		go func(name string) {
+			defer wg.Done()
+			defer func() { <-sem }()
+			ctxLen, ferr := c.fetchContextLength(ctx, name)
+			if ferr != nil {
+				slog.Debug("cloud limit cache: /api/show miss",
+					"model", name, "err", ferr)
+				return
+			}
+			c.set(name, ctxLen)
+			mu.Lock()
+			success++
+			mu.Unlock()
+		}(name)
+	}
+	wg.Wait()
+	// Report an interrupted refresh as such instead of logging it as a
+	// completed one with a misleadingly low hit count.
+	if cerr := ctx.Err(); cerr != nil {
+		return success, fmt.Errorf("cloud limit cache: refresh interrupted: %w", cerr)
+	}
+	slog.Info("cloud limit cache: refresh complete",
+		"models_total", len(tags), "cached", success)
+	return success, nil
+}
+
+// StartPeriodicRefresh performs one RefreshAll right away and then one
+// per interval tick until ctx is cancelled. It blocks for that whole
+// time, so callers normally run it in its own goroutine. A
+// non-positive interval falls back to 24h. Every outcome is logged;
+// nothing is returned (this is a background task — failures are
+// warnings, not show-stoppers).
+//
+// Typical usage: a goroutine spawned at mort boot.
+//
+//	go cache.StartPeriodicRefresh(ctx, 24*time.Hour)
+func (c *CloudOllamaLimitCache) StartPeriodicRefresh(ctx context.Context, interval time.Duration) {
+	if c == nil {
+		return
+	}
+	if interval <= 0 {
+		interval = 24 * time.Hour
+	}
+	// refresh runs a single pass and logs how it went.
+	refresh := func() {
+		cached, err := c.RefreshAll(ctx)
+		if err == nil {
+			slog.Info("cloud limit cache: refreshed",
+				"newly_cached_or_updated", cached, "cached_size", c.Size())
+			return
+		}
+		slog.Warn("cloud limit cache: refresh failed",
+			"err", err, "cached_size", c.Size())
+	}
+
+	refresh()
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ticker.C:
+			refresh()
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// fetchTags issues GET /api/tags and returns one name per listed
+// model, preferring the entry's "name" field and falling back to
+// "model"; entries with neither are skipped. Any non-2xx status is
+// returned as an error carrying a truncated copy of the body.
+func (c *CloudOllamaLimitCache) fetchTags(ctx context.Context) ([]string, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.endpoint+"/api/tags", nil)
+	if err != nil {
+		return nil, err
+	}
+	c.applyAuth(req)
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+	if code := resp.StatusCode; code < 200 || code > 299 {
+		return nil, fmt.Errorf("status %d: %s", code, truncate(raw, 400))
+	}
+
+	type tagEntry struct {
+		Name  string `json:"name"`
+		Model string `json:"model"`
+	}
+	var listing struct {
+		Models []tagEntry `json:"models"`
+	}
+	if err := json.Unmarshal(raw, &listing); err != nil {
+		return nil, fmt.Errorf("parse /api/tags: %w", err)
+	}
+
+	names := make([]string, 0, len(listing.Models))
+	for _, entry := range listing.Models {
+		switch {
+		case entry.Name != "":
+			names = append(names, entry.Name)
+		case entry.Model != "":
+			names = append(names, entry.Model)
+		}
+	}
+	return names, nil
+}
+
+// fetchContextLength calls POST /api/show for a model and extracts
+// the largest *.context_length value from model_info. Returns the
+// length and nil on success; (0, err) on any failure (request
+// encoding, transport, non-2xx status, unparsable body, or no
+// context_length present).
+//
+// Why "largest" rather than family-keyed: the family field in the
+// /api/show response is sometimes empty or doesn't match the
+// model_info key prefix exactly (Ollama Cloud returns the
+// architecture as the prefix, which usually but not always matches
+// `family`). Scanning for any `*.context_length` is robust.
+func (c *CloudOllamaLimitCache) fetchContextLength(ctx context.Context, modelName string) (int, error) {
+	// "model" is the field the Ollama API documents for /api/show;
+	// "name" is its older spelling and is still sent so servers that
+	// only understand the legacy field keep working.
+	payload, err := json.Marshal(map[string]string{
+		"model": modelName,
+		"name":  modelName,
+	})
+	if err != nil {
+		return 0, fmt.Errorf("encode /api/show request: %w", err)
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.endpoint+"/api/show", bytes.NewReader(payload))
+	if err != nil {
+		return 0, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	c.applyAuth(req)
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return 0, err
+	}
+	defer resp.Body.Close()
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return 0, err
+	}
+	if resp.StatusCode/100 != 2 {
+		return 0, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(respBody, 400))
+	}
+	n, err := parseContextLengthJSON(respBody)
+	if err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+// parseContextLengthJSON scans the model_info object of an /api/show
+// response body and returns the largest value stored under a key
+// ending in ".context_length". It fails when the body is not valid
+// JSON or when no such key holds a positive number. Kept as a
+// standalone package-level function so unit tests can exercise it
+// without spinning up an httptest server.
+func parseContextLengthJSON(body []byte) (int, error) {
+	var resp struct {
+		ModelInfo map[string]any `json:"model_info"`
+	}
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return 0, fmt.Errorf("parse: %w", err)
+	}
+
+	largest := 0
+	for key, raw := range resp.ModelInfo {
+		if strings.HasSuffix(key, ".context_length") {
+			if n := toInt(raw); n > largest {
+				largest = n
+			}
+		}
+	}
+	if largest > 0 {
+		return largest, nil
+	}
+	return 0, fmt.Errorf("no context_length in model_info")
+}
+
+// toInt coerces a JSON-decoded value to int. It handles float64 (the
+// encoding/json default for numbers), int, int64 and json.Number
+// (integer, decimal or exponent form); fractional parts are truncated.
+// It returns 0 for every other type and for any value that does not
+// fit in an int (including NaN and ±Inf), so the result never depends
+// on how the platform converts an out-of-range float.
+func toInt(v any) int {
+	// two63 is 2^63: the first float64 above the int64 range.
+	const two63 = 1 << 63
+
+	// narrow converts to int only when no bits are lost (matters where
+	// int is 32 bits wide).
+	narrow := func(n int64) int {
+		if i := int(n); int64(i) == n {
+			return i
+		}
+		return 0
+	}
+	// fromFloat range-checks first: converting an out-of-range float to
+	// an integer yields an implementation-dependent value in Go. NaN
+	// fails both comparisons and so maps to 0.
+	fromFloat := func(f float64) int {
+		if f >= -two63 && f < two63 {
+			return narrow(int64(f))
+		}
+		return 0
+	}
+
+	switch x := v.(type) {
+	case float64:
+		return fromFloat(x)
+	case int:
+		return x
+	case int64:
+		return narrow(x)
+	case json.Number:
+		if n, err := x.Int64(); err == nil {
+			return narrow(n)
+		}
+		// Not a plain integer literal (e.g. "131072.0" or "1.31072e5").
+		if f, err := x.Float64(); err == nil {
+			return fromFloat(f)
+		}
+	}
+	return 0
+}
+
+// applyAuth attaches "Authorization: Bearer <key>" to req, using the
+// whitespace-trimmed API key. A blank key leaves req untouched.
+func (c *CloudOllamaLimitCache) applyAuth(req *http.Request) {
+	if token := strings.TrimSpace(c.apiKey); token != "" {
+		req.Header.Set("Authorization", "Bearer "+token)
+	}
+}
+
+// stripCloudPrefix trims surrounding whitespace from s and then drops
+// one leading "ollama-cloud/" if present, yielding the bare model:tag
+// form used as the cache key.
+func stripCloudPrefix(s string) string {
+	return strings.TrimPrefix(strings.TrimSpace(s), "ollama-cloud/")
+}
+
+// truncate renders b as a string for error messages, keeping at most
+// n bytes and appending "...(truncated)" when anything was cut. The
+// cut point is moved back (by up to three bytes) so a multi-byte
+// UTF-8 character is never split in half. A negative n is treated as 0.
+func truncate(b []byte, n int) string {
+	if n < 0 {
+		n = 0
+	}
+	if len(b) <= n {
+		return string(b)
+	}
+	// b[cut] is the first byte left out. While it is a UTF-8
+	// continuation byte (10xxxxxx) the cut sits inside a character, so
+	// step back to that character's lead byte. A character has at most
+	// three continuation bytes, which bounds the walk on malformed input.
+	cut := n
+	for back := 0; back < 3 && cut > 0 && b[cut]&0xC0 == 0x80; back++ {
+		cut--
+	}
+	return string(b[:cut]) + "...(truncated)"
+}