// V15.4 — Ollama Cloud dynamic context-length sync.
//
// Why: the static map in context_limits.go has to be hand-maintained
// for every new Ollama Cloud model. Cloud ships new models monthly,
// and a missing entry silently disables compaction for runs on that
// model (compactionThresholdForModel returns 0 on MaxContextTokens
// miss). Dynamic sync removes the maintenance burden and means new
// cloud models work out-of-the-box.
//
// How: at boot, mort kicks off a CloudOllamaLimitCache.RefreshAll in a
// background goroutine. RefreshAll calls /api/tags to list every
// available cloud model, then concurrently calls /api/show for each
// to extract the largest `*.context_length` value from the response's
// model_info map. The cache is consulted by the executor's
// compactionThresholdForModel via the cache-aware
// MaxContextTokensWithCache helper.
//
// Periodic refresh: a daily ticker re-runs RefreshAll so newly
// released models surface without a mort restart. The interval is
// intentionally not configurable — cloud model context lengths don't
// change for a given tag (only the tag pointer can move, e.g. :cloud
// → larger model), so daily is conservative.

package model

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"
)

// defaultCloudEndpoint is the public Ollama Cloud base URL. It is used
// when NewCloudOllamaLimitCache is given an empty endpoint; tests
// override it by passing their own endpoint argument.
const defaultCloudEndpoint = "https://ollama.com"

// CloudOllamaLimitCache holds context-length values for Ollama Cloud
// models, populated dynamically via /api/tags + /api/show. Construct
// with NewCloudOllamaLimitCache. Safe for concurrent use.
//
// Empty when OLLAMA_API_KEY is unset — RefreshAll returns a clear error
// and the cache stays empty. Lookups return (0, false) and callers
// fall back to the static map / disabled compaction.
type CloudOllamaLimitCache struct { mu sync.RWMutex limit map[string]int negative map[string]time.Time // model → fetch-failure time (for TTL) endpoint string apiKey string httpClient *http.Client // refreshConcurrency caps the number of concurrent /api/show calls // during RefreshAll. Default 8 — enough to finish a ~50-model // catalog in well under a minute without hammering Cloud. refreshConcurrency int // negativeTTL is how long a /api/show miss is cached before we // retry. Prevents hammering Cloud on a typo or recently-removed // model. Default 10 minutes. negativeTTL time.Duration } // NewCloudOllamaLimitCache constructs a fresh cache. apiKey can be // empty — RefreshAll then returns an error and the cache stays empty. // endpoint defaults to https://ollama.com when empty. httpClient // defaults to a 15s-timeout client. func NewCloudOllamaLimitCache(endpoint, apiKey string, httpClient *http.Client) *CloudOllamaLimitCache { if strings.TrimSpace(endpoint) == "" { endpoint = defaultCloudEndpoint } endpoint = strings.TrimRight(endpoint, "/") if httpClient == nil { httpClient = &http.Client{Timeout: 15 * time.Second} } return &CloudOllamaLimitCache{ limit: make(map[string]int), negative: make(map[string]time.Time), endpoint: endpoint, apiKey: apiKey, httpClient: httpClient, refreshConcurrency: 8, negativeTTL: 10 * time.Minute, } } // SetNegativeTTL overrides the negative-cache lifetime. Tests use this // to control retry behaviour without sleeping. func (c *CloudOllamaLimitCache) SetNegativeTTL(d time.Duration) { if c == nil || d < 0 { return } c.mu.Lock() c.negativeTTL = d c.mu.Unlock() } // Lookup returns the cached context length for an Ollama Cloud model // name (e.g. "qwen3.5:cloud", "qwen3-coder:480b"). Returns (0, false) // on miss. Lookup never makes HTTP calls — it's the hot path consulted // by the executor before every run. // // modelName accepts either the bare model:tag form or the prefixed // "ollama-cloud/model:tag" form; the prefix is stripped. 
func (c *CloudOllamaLimitCache) Lookup(modelName string) (int, bool) { if c == nil { return 0, false } key := stripCloudPrefix(modelName) if key == "" { return 0, false } c.mu.RLock() defer c.mu.RUnlock() v, ok := c.limit[key] return v, ok } // Size returns the number of cached entries. Useful for logging / // health checks. func (c *CloudOllamaLimitCache) Size() int { if c == nil { return 0 } c.mu.RLock() defer c.mu.RUnlock() return len(c.limit) } // LookupOrFetch returns the cached context length OR, on miss, makes a // single /api/show call to populate the cache. Negative results // (model not found, /api/show returns no context_length) are cached // for negativeTTL to prevent hammering Cloud on a typo. Returns // (0, false) when the model is genuinely unknown and (size, true) on // any successful resolve. // // Why this exists: Ollama Cloud's /api/tags lists canonical model // names only (e.g. "qwen3.5:397b") but accepts aliases on /api/show // (e.g. "qwen3.5:cloud" → same 397b model). The boot-time RefreshAll // only sees the canonical names, so common aliases miss the cache. // LookupOrFetch fills the gap. // // The cache is therefore self-healing: any unknown model gets one // live /api/show call, the result lands in the cache, and subsequent // runs hit immediately. Periodic RefreshAll overwrites everything // with the canonical-name results but additionally-fetched aliases // linger as positive entries. func (c *CloudOllamaLimitCache) LookupOrFetch(ctx context.Context, modelName string) (int, bool) { if c == nil { return 0, false } key := stripCloudPrefix(modelName) if key == "" { return 0, false } // Fast path: positive hit. c.mu.RLock() if v, ok := c.limit[key]; ok { c.mu.RUnlock() return v, true } // Negative cache check. if t, ok := c.negative[key]; ok && time.Since(t) < c.negativeTTL { c.mu.RUnlock() return 0, false } c.mu.RUnlock() // No API key configured → can't fetch. 
Don't write a negative // entry (when the key gets configured later we want the next call // to re-try immediately). if strings.TrimSpace(c.apiKey) == "" { return 0, false } // Slow path: live /api/show. n, err := c.fetchContextLength(ctx, key) if err != nil || n <= 0 { slog.Debug("cloud limit cache: lazy fetch miss", "model", key, "err", err) c.mu.Lock() c.negative[key] = time.Now() c.mu.Unlock() return 0, false } c.set(key, n) slog.Info("cloud limit cache: lazy fetch hit", "model", key, "context_length", n) return n, true } // set stores a context length. n <= 0 is a no-op. func (c *CloudOllamaLimitCache) set(modelName string, n int) { if c == nil || n <= 0 { return } key := stripCloudPrefix(modelName) if key == "" { return } c.mu.Lock() c.limit[key] = n c.mu.Unlock() } // RefreshAll queries /api/tags then concurrently calls /api/show for // every listed model, populating the cache. Returns the number of // models successfully cached and the first error encountered (a // /api/tags failure aborts; individual /api/show failures are logged // but don't abort the whole refresh). // // Safe to call repeatedly. Cache entries are overwritten with the // fresh values; entries for models that have been removed from Cloud // are NOT pruned (cheap to keep; pruning risks dropping an entry just // before a run that needs it). 
func (c *CloudOllamaLimitCache) RefreshAll(ctx context.Context) (int, error) { if c == nil { return 0, fmt.Errorf("cloud limit cache: nil receiver") } if strings.TrimSpace(c.apiKey) == "" { return 0, fmt.Errorf("cloud limit cache: OLLAMA_API_KEY unset") } tags, err := c.fetchTags(ctx) if err != nil { return 0, fmt.Errorf("cloud limit cache: /api/tags: %w", err) } concurrency := c.refreshConcurrency if concurrency <= 0 { concurrency = 8 } sem := make(chan struct{}, concurrency) var wg sync.WaitGroup var ( mu sync.Mutex success int ) for _, name := range tags { name := name wg.Add(1) sem <- struct{}{} go func() { defer wg.Done() defer func() { <-sem }() ctxLen, ferr := c.fetchContextLength(ctx, name) if ferr != nil { slog.Debug("cloud limit cache: /api/show miss", "model", name, "err", ferr) return } c.set(name, ctxLen) mu.Lock() success++ mu.Unlock() }() } wg.Wait() slog.Info("cloud limit cache: refresh complete", "models_total", len(tags), "cached", success) return success, nil } // StartPeriodicRefresh runs RefreshAll once immediately, then on every // interval tick. Cancellation via ctx stops the loop. Logs each // outcome; never returns an error to the caller (this is a background // task — failures are warnings, not show-stoppers). // // Typical usage: a goroutine spawned at mort boot. // // go cache.StartPeriodicRefresh(ctx, 24*time.Hour) func (c *CloudOllamaLimitCache) StartPeriodicRefresh(ctx context.Context, interval time.Duration) { if c == nil { return } if interval <= 0 { interval = 24 * time.Hour } doOne := func() { n, err := c.RefreshAll(ctx) if err != nil { slog.Warn("cloud limit cache: refresh failed", "err", err, "cached_size", c.Size()) return } slog.Info("cloud limit cache: refreshed", "newly_cached_or_updated", n, "cached_size", c.Size()) } doOne() t := time.NewTicker(interval) defer t.Stop() for { select { case <-ctx.Done(): return case <-t.C: doOne() } } } // fetchTags calls GET /api/tags and returns the model names. 
func (c *CloudOllamaLimitCache) fetchTags(ctx context.Context) ([]string, error) { url := c.endpoint + "/api/tags" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return nil, err } c.applyAuth(req) resp, err := c.httpClient.Do(req) if err != nil { return nil, err } defer resp.Body.Close() body, err := io.ReadAll(io.LimitReader(resp.Body, maxLimitCacheResponseBytes)) if err != nil { return nil, err } if resp.StatusCode/100 != 2 { return nil, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(body, 400)) } var parsed struct { Models []struct { Name string `json:"name"` Model string `json:"model"` } `json:"models"` } if err := json.Unmarshal(body, &parsed); err != nil { return nil, fmt.Errorf("parse /api/tags: %w", err) } out := make([]string, 0, len(parsed.Models)) for _, m := range parsed.Models { name := m.Name if name == "" { name = m.Model } if name == "" { continue } out = append(out, name) } return out, nil } // fetchContextLength calls POST /api/show for a model and extracts // the largest *.context_length value from model_info. Returns the // length and nil on success; (0, err) on any failure. // // Why "largest" rather than family-keyed: the family field in the // /api/show response is sometimes empty or doesn't match the // model_info key prefix exactly (Ollama Cloud returns the // architecture as the prefix, which usually but not always matches // `family`). Scanning for any `*.context_length` is robust. 
func (c *CloudOllamaLimitCache) fetchContextLength(ctx context.Context, modelName string) (int, error) { url := c.endpoint + "/api/show" body, _ := json.Marshal(map[string]string{"name": modelName}) req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body)) if err != nil { return 0, err } req.Header.Set("Content-Type", "application/json") c.applyAuth(req) resp, err := c.httpClient.Do(req) if err != nil { return 0, err } defer resp.Body.Close() respBody, err := io.ReadAll(io.LimitReader(resp.Body, maxLimitCacheResponseBytes)) if err != nil { return 0, err } if resp.StatusCode/100 != 2 { return 0, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(respBody, 400)) } n, err := parseContextLengthJSON(respBody) if err != nil { return 0, err } return n, nil } // parseContextLengthJSON extracts the largest `*.context_length` int // from an /api/show response body. Exported-ish (lowercase but tested // in the same package) so the unit test can exercise it without // spinning up an httptest server. func parseContextLengthJSON(body []byte) (int, error) { var parsed struct { ModelInfo map[string]any `json:"model_info"` } if err := json.Unmarshal(body, &parsed); err != nil { return 0, fmt.Errorf("parse: %w", err) } best := 0 for k, v := range parsed.ModelInfo { if !strings.HasSuffix(k, ".context_length") { continue } n := toInt(v) if n > best { best = n } } if best <= 0 { return 0, fmt.Errorf("no context_length in model_info") } return best, nil } // toInt coerces a JSON-decoded value to int. Handles float64 (the // json default) and json.Number; returns 0 for anything else. func toInt(v any) int { switch x := v.(type) { case float64: return int(x) case int: return x case int64: return int(x) case json.Number: if n, err := x.Int64(); err == nil { return int(n) } } return 0 } // applyAuth sets the Bearer token when an API key is configured. 
func (c *CloudOllamaLimitCache) applyAuth(req *http.Request) { if strings.TrimSpace(c.apiKey) == "" { return } req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(c.apiKey)) } // stripCloudPrefix strips an "ollama-cloud/" prefix (and surrounding // whitespace). Returns the bare model:tag form. func stripCloudPrefix(s string) string { s = strings.TrimSpace(s) if strings.HasPrefix(s, "ollama-cloud/") { s = s[len("ollama-cloud/"):] } return s } // truncate caps a byte slice for error messages. func truncate(b []byte, n int) string { if len(b) <= n { return string(b) } return string(b[:n]) + "...(truncated)" } // maxLimitCacheResponseBytes bounds the ollama.com limit-cache HTTP responses // (/api/tags, /api/show) so a misbehaving endpoint can't stream an unbounded // body before the 15s timeout fires. 1 MiB is far above any real response. const maxLimitCacheResponseBytes = 1 << 20