// V15.2 — per-model context-window limits. // // Why: agents need to know when they're about to blow the model's // max-input cap so they can compact stale tool results out of the // message history. Pre-15.2 the agent loop had no awareness; a long // research run that accumulated dozens of HTTP tool results would // hit Ollama's HTTP 400 "prompt is too long" or Anthropic's similar // error mid-run with no graceful degradation. // // Coverage: // - Anthropic Claude 4.x (200K default; 1M when the model ID // includes the "[1m]" suffix per llms.tier reload conventions) // - OpenAI GPT-4.x / o-series (128K) // - Gemini 2.x (1M-2M, model-specific) // - Ollama Cloud (model-specific; hardcoded per known model) // - Local Ollama: queries `/api/show` once at first use, caches // // Returns (0, false) for unknown models — callers should treat // "unknown" as "don't budget" (the agent's existing iteration cap + // timeout are the fallback safety nets). package model import ( "context" "strings" "sync" ) // MaxContextTokens returns the model's max INPUT context-window size // in tokens. Output / response tokens are NOT included — most models // share input + output budget but cap them separately, and the practical // concern is "how big can my prompt get before the model rejects". // // modelID accepts both the bare model name (`claude-sonnet-4-6`) and // the prefixed form (`anthropic/claude-sonnet-4-6` or // `ollama-cloud/qwen3-coder:480b`). The prefix is stripped before lookup. // // Returns (limit, true) on a known model; (0, false) otherwise. // // This function is pure (no I/O). For Ollama Cloud models that aren't // in the static map, use MaxContextTokensWithCache which consults a // CloudOllamaLimitCache populated at boot from /api/tags + /api/show. func MaxContextTokens(modelID string) (int, bool) { id := normalizeModelID(modelID) if v, ok := staticContextLimits[id]; ok { return v, true } // Anthropic 1M-context variant marker. Mort's llms tier system // uses a `[1m]` suffix on the model ID (e.g. // `claude-opus-4-7[1m]`) to opt into Anthropic's 1M beta context. if strings.HasSuffix(id, "[1m]") { return 1_000_000, true } // Local-ollama dynamic lookup is wired separately so it can // query the daemon's /api/show endpoint. The static map covers // known cloud models. return 0, false } // MaxContextTokensWithCache is the cache-aware variant of // MaxContextTokens. It tries the static map first; on miss, if the // model is an Ollama Cloud spec (the `ollama-cloud/` prefix), it // consults the supplied CloudOllamaLimitCache. Pass nil cache for // static-only behaviour (equivalent to MaxContextTokens). // // This function never makes HTTP calls — the cache must be // pre-populated (typically via cache.RefreshAll at boot). Callers in // the hot path can rely on a single map lookup per call. Prefer // MaxContextTokensResolving when a context is available — it makes a // single /api/show call to fill the cache on miss, which is essential // for Cloud aliases that /api/tags doesn't enumerate (e.g. :cloud). func MaxContextTokensWithCache(modelID string, cloud *CloudOllamaLimitCache) (int, bool) { if v, ok := MaxContextTokens(modelID); ok { return v, true } if cloud == nil { return 0, false } // Only ollama-cloud/* models are eligible for the cache. id := strings.TrimSpace(modelID) if !strings.HasPrefix(id, "ollama-cloud/") { // Also allow bare model:tag form when the caller has already // stripped the prefix (some test paths). if strings.Contains(id, "/") { return 0, false } } return cloud.Lookup(id) } // MaxContextTokensResolving is the cache-aware variant that ALSO // performs a live /api/show fetch on cache miss (with negative caching // to prevent thrash). Use this in run-setup paths where one HTTP call // per unseen model is acceptable — typically the skill executor's // compaction threshold computation. The fetched result is cached for // future calls, so subsequent runs hit the in-memory map. // // Falls back to the static-only path when the model isn't an // ollama-cloud/* spec or cache is nil. ctx cancellation aborts the // fetch and returns (0, false) without writing a negative entry. func MaxContextTokensResolving(ctx context.Context, modelID string, cloud *CloudOllamaLimitCache) (int, bool) { if v, ok := MaxContextTokens(modelID); ok { return v, true } if cloud == nil { return 0, false } id := strings.TrimSpace(modelID) if !strings.HasPrefix(id, "ollama-cloud/") { if strings.Contains(id, "/") { return 0, false } } return cloud.LookupOrFetch(ctx, id) } // normalizeModelID strips provider prefix and reasoning suffix so a // lookup keyed on the base name works regardless of caller form. // // Examples: // - "anthropic/claude-sonnet-4-6" → "claude-sonnet-4-6" // - "ollama-cloud/qwen3-coder:480b" → "qwen3-coder:480b" // - "claude-opus-4-7:high" → "claude-opus-4-7" func normalizeModelID(id string) string { id = strings.TrimSpace(id) if idx := strings.Index(id, "/"); idx >= 0 { id = id[idx+1:] } // Strip :low/:medium/:high reasoning effort suffix used by some // OpenAI / Anthropic clients. for _, suffix := range []string{":low", ":medium", ":high"} { if strings.HasSuffix(id, suffix) { id = id[:len(id)-len(suffix)] break } } return id } // staticContextLimits is the source of truth for known cloud models. // Add new entries when adding a model to the llms tier system. // // CRITICAL: keep these in sync with the actual provider docs. A wrong // number here causes EITHER premature compaction (too low, degrades // agent quality unnecessarily) OR HTTP 400 mid-run (too high). The // 410K-token failure on `qwen3-coder:480b` is the kind of bug a // mistyped value would reintroduce. var staticContextLimits = map[string]int{ // Anthropic Claude 4.x — default 200K input. 1M variant via // `[1m]` suffix handled in MaxContextTokens above. "claude-opus-4-7": 200_000, "claude-opus-4-6": 200_000, "claude-opus-4-5": 200_000, "claude-sonnet-4-6": 200_000, "claude-sonnet-4-5": 200_000, "claude-haiku-4-5": 200_000, "claude-haiku-4-5-20251001": 200_000, // OpenAI GPT-4.x / o-series — 128K input. "gpt-4o": 128_000, "gpt-4o-mini": 128_000, "gpt-4-turbo": 128_000, "o1": 200_000, "o1-mini": 128_000, "o3-mini": 200_000, "gpt-5": 400_000, "gpt-5-mini": 400_000, // Gemini — varies dramatically by model. "gemini-2.5-pro": 2_000_000, "gemini-2.5-flash": 1_000_000, "gemini-2.5-flash-lite": 1_000_000, "gemini-1.5-pro": 2_000_000, "gemini-1.5-flash": 1_000_000, // Ollama Cloud (turbo). Limits per https://ollama.com/cloud/models // — verified against the Ollama API show output for each model. // Update when Ollama publishes new models or extends contexts. "qwen3-coder:480b": 262_144, // 262K — matches the v15.2 trace "qwen3:235b": 262_144, "qwen3:32b": 131_072, "qwen2.5:72b": 131_072, "gpt-oss:120b": 131_072, "gpt-oss:20b": 131_072, "deepseek-v3.1:671b": 131_072, "glm-4.6:355b": 131_072, "kimi-k2:1t": 262_144, "llama4:scout": 10_000_000, // Llama 4 Scout claims 10M "llama4:maverick": 1_000_000, } // LocalOllamaLimitCache holds the resolved /api/show context_length per // local-ollama model. Populated on first lookup; never invalidated // (changing num_ctx requires an ollama restart anyway). Process-wide, // no per-tenant scoping needed. type LocalOllamaLimitCache struct { mu sync.RWMutex limit map[string]int } // NewLocalOllamaLimitCache constructs a fresh cache. func NewLocalOllamaLimitCache() *LocalOllamaLimitCache { return &LocalOllamaLimitCache{limit: make(map[string]int)} } // Get returns the cached limit or (0, false) when unseen. The caller // is expected to follow up with a lookup against the live daemon. func (c *LocalOllamaLimitCache) Get(model string) (int, bool) { c.mu.RLock() defer c.mu.RUnlock() v, ok := c.limit[model] return v, ok } // Set records a resolved limit. Idempotent; no-op when value is <= 0. func (c *LocalOllamaLimitCache) Set(model string, n int) { if n <= 0 { return } c.mu.Lock() defer c.mu.Unlock() c.limit[model] = n }