// executus/model/context_limits.go

// V15.2 — per-model context-window limits.
//
// Why: agents need to know when they're about to blow the model's
// max-input cap so they can compact stale tool results out of the
// message history. Pre-15.2 the agent loop had no awareness; a long
// research run that accumulated dozens of HTTP tool results would
// hit Ollama's HTTP 400 "prompt is too long" or Anthropic's similar
// error mid-run with no graceful degradation.
//
// Coverage:
//   - Anthropic Claude 4.x (200K default; 1M when the model ID
//     ends with the "[1m]" suffix per llms.tier reload conventions)
//   - OpenAI GPT-4.x / GPT-5 / o-series (128K and up, model-specific)
//   - Gemini 1.5 / 2.5 (1M-2M, model-specific)
//   - Ollama Cloud (model-specific; hardcoded per known model)
//   - Local Ollama: the limit comes from the daemon's `/api/show`,
//     resolved once on first use and then cached
//
// Returns (0, false) for unknown models — callers should treat
// "unknown" as "don't budget" (the agent's existing iteration cap +
// timeout are the fallback safety nets).

package model

import (
	"context"
	"strings"
	"sync"
)

// MaxContextTokens reports the maximum INPUT context-window size, in
// tokens, for the given model. The figure is meant to answer "how
// large may the prompt grow before the provider rejects it"; output /
// response budgets are deliberately not part of it.
//
// modelID may be a bare name (`claude-sonnet-4-6`) or carry a provider
// prefix (`anthropic/claude-sonnet-4-6`,
// `ollama-cloud/qwen3-coder:480b`); it is passed through
// normalizeModelID before any lookup.
//
// Resolution order:
//  1. an exact hit in staticContextLimits;
//  2. otherwise, a normalized ID ending in `[1m]` (the tier system's
//     marker for Anthropic's 1M-context variant, e.g.
//     `claude-opus-4-7[1m]`) resolves to 1,000,000. Note the check is
//     on the suffix alone, so it applies to any ID carrying it;
//  3. otherwise the model is unknown and (0, false) is returned.
//
// The function does no I/O. Models that need a dynamic answer (local
// Ollama via /api/show, or Ollama Cloud models missing from the static
// table) are handled by the cache-aware variants
// MaxContextTokensWithCache and MaxContextTokensResolving.
func MaxContextTokens(modelID string) (int, bool) {
	key := normalizeModelID(modelID)
	limit, known := staticContextLimits[key]
	switch {
	case known:
		return limit, true
	case strings.HasSuffix(key, "[1m]"):
		// Opt-in 1M-context variant; never present in the static
		// table, so it is recognised by its suffix instead.
		return 1_000_000, true
	default:
		return 0, false
	}
}

// MaxContextTokensWithCache extends MaxContextTokens with a fallback
// to a CloudOllamaLimitCache.
//
// The static path (MaxContextTokens) is always tried first. Only when
// it misses, and cloud is non-nil, is the cache consulted — and only
// for IDs that are either an `ollama-cloud/...` spec or a bare
// `model:tag` with no provider prefix at all. Any other prefixed ID
// (for example `anthropic/...`) yields (0, false) without touching the
// cache. Passing a nil cloud therefore gives exactly the static-only
// behaviour of MaxContextTokens.
//
// The ID handed to cloud.Lookup is the whitespace-trimmed modelID with
// its prefix left intact.
//
// This function performs no I/O of its own; it relies on Lookup being
// an in-memory read of a cache that was filled earlier (typically at
// boot). When a context is available and one HTTP round-trip per
// unseen model is acceptable, MaxContextTokensResolving is the better
// choice because it can fill the cache on a miss.
func MaxContextTokensWithCache(modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
	limit, known := MaxContextTokens(modelID)
	if known {
		return limit, true
	}
	if cloud == nil {
		return 0, false
	}

	spec := strings.TrimSpace(modelID)
	isCloudSpec := strings.HasPrefix(spec, "ollama-cloud/")
	// A bare name is accepted too: some callers (notably tests) strip
	// the provider prefix before calling in.
	isBareName := !strings.Contains(spec, "/")
	if !isCloudSpec && !isBareName {
		return 0, false
	}
	return cloud.Lookup(spec)
}

// MaxContextTokensResolving behaves like MaxContextTokensWithCache but
// delegates cache misses to cloud.LookupOrFetch, which — per that
// cache's contract — may issue a live /api/show request, remember the
// result (including negative results, to avoid thrashing) and honour
// ctx cancellation.
//
// Intended for run-setup paths where a single HTTP call per unseen
// model is acceptable, such as computing a compaction threshold.
//
// Resolution order:
//  1. MaxContextTokens (static table / `[1m]` suffix);
//  2. (0, false) when cloud is nil;
//  3. (0, false) when the trimmed ID has a provider prefix other than
//     `ollama-cloud/`;
//  4. cloud.LookupOrFetch for `ollama-cloud/...` specs and for bare
//     names without any "/".
func MaxContextTokensResolving(ctx context.Context, modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
	if limit, known := MaxContextTokens(modelID); known {
		return limit, true
	}
	if cloud == nil {
		return 0, false
	}

	spec := strings.TrimSpace(modelID)
	switch {
	case strings.HasPrefix(spec, "ollama-cloud/"):
		// Explicit Ollama Cloud spec: eligible for the cache.
	case strings.Contains(spec, "/"):
		// Some other provider's prefixed ID: never cache-backed.
		return 0, false
	}
	// Either an ollama-cloud spec or a bare, prefix-less name.
	return cloud.LookupOrFetch(ctx, spec)
}

// normalizeModelID reduces a model ID to the base name used as the key
// in lookup tables, so callers may pass prefixed or suffixed forms
// interchangeably.
//
// Steps, in order:
//  1. surrounding whitespace is trimmed;
//  2. everything up to and including the FIRST "/" is dropped (the
//     provider prefix); later slashes are kept;
//  3. at most one trailing reasoning-effort suffix — ":low", ":medium"
//     or ":high" — is removed.
//
// Examples:
//   - "anthropic/claude-sonnet-4-6" → "claude-sonnet-4-6"
//   - "ollama-cloud/qwen3-coder:480b" → "qwen3-coder:480b"
//   - "claude-opus-4-7:high" → "claude-opus-4-7"
func normalizeModelID(id string) string {
	name := strings.TrimSpace(id)
	if slash := strings.IndexByte(name, '/'); slash != -1 {
		name = name[slash+1:]
	}
	for _, effort := range [...]string{":low", ":medium", ":high"} {
		// TrimSuffix returns its input unchanged when the suffix is
		// absent, so a shorter result means this suffix matched. Only
		// the first match is stripped.
		if stripped := strings.TrimSuffix(name, effort); len(stripped) != len(name) {
			return stripped
		}
	}
	return name
}

// staticContextLimits is the source of truth for known cloud models,
// keyed by the normalized model ID (no provider prefix, no reasoning
// suffix). Values are the maximum INPUT size in tokens.
// Add new entries when adding a model to the llms tier system.
//
// CRITICAL: keep these in sync with the actual provider docs. A wrong
// number here causes EITHER premature compaction (too low, degrades
// agent quality unnecessarily) OR HTTP 400 mid-run (too high). The
// 410K-token failure on `qwen3-coder:480b` is the kind of bug a
// mistyped value would reintroduce. When a provider publishes both a
// total context window and a smaller input cap, record the input cap:
// erring low only costs an early compaction, erring high fails the run.
var staticContextLimits = map[string]int{
	// Anthropic Claude 4.x — default 200K input. The 1M variant is
	// selected by a `[1m]` suffix on the model ID and is resolved by
	// the lookup function, not by an entry in this table.
	"claude-opus-4-7":           200_000,
	"claude-opus-4-6":           200_000,
	"claude-opus-4-5":           200_000,
	"claude-sonnet-4-6":         200_000,
	"claude-sonnet-4-5":         200_000,
	"claude-haiku-4-5":          200_000,
	"claude-haiku-4-5-20251001": 200_000,

	// OpenAI — 128K for the GPT-4 family and o1-mini, 200K for o1 and
	// o3-mini. GPT-5 models advertise a 400K total window, but the API
	// caps INPUT at 272K (the remaining 128K is reserved for reasoning
	// and output), so 272K is what is recorded here.
	"gpt-4o":      128_000,
	"gpt-4o-mini": 128_000,
	"gpt-4-turbo": 128_000,
	"o1":          200_000,
	"o1-mini":     128_000,
	"o3-mini":     200_000,
	"gpt-5":       272_000,
	"gpt-5-mini":  272_000,

	// Gemini — varies by model. The 2.5 family (including 2.5 Pro) is
	// documented with a ~1M (1,048,576) input limit; only 1.5 Pro has
	// the ~2M window. Values are rounded down to stay conservative.
	"gemini-2.5-pro":        1_000_000,
	"gemini-2.5-flash":      1_000_000,
	"gemini-2.5-flash-lite": 1_000_000,
	"gemini-1.5-pro":        2_000_000,
	"gemini-1.5-flash":      1_000_000,

	// Ollama Cloud (turbo). Limits per https://ollama.com/cloud/models
	// — verified against the Ollama API show output for each model.
	// Update when Ollama publishes new models or extends contexts.
	"qwen3-coder:480b":   262_144, // 262K — matches the v15.2 trace
	"qwen3:235b":         262_144,
	"qwen3:32b":          131_072,
	"qwen2.5:72b":        131_072,
	"gpt-oss:120b":       131_072,
	"gpt-oss:20b":        131_072,
	"deepseek-v3.1:671b": 131_072,
	"glm-4.6:355b":       131_072,
	"kimi-k2:1t":         262_144,
	"llama4:scout":       10_000_000, // Llama 4 Scout claims 10M
	"llama4:maverick":    1_000_000,
}

// LocalOllamaLimitCache holds the resolved /api/show context_length per
// local-ollama model, keyed by model name. It is a plain in-memory map
// guarded by a read/write mutex and is safe for concurrent use.
//
// Entries are added or overwritten through Set and are not evicted by
// any method defined here; the design intent is that a model's limit
// does not change for the life of the process (changing num_ctx
// requires an ollama restart anyway), so no per-tenant scoping or
// invalidation is needed.
//
// The zero value is ready to use; NewLocalOllamaLimitCache is kept as
// the conventional constructor. A cache must not be copied after first
// use because it contains a mutex.
type LocalOllamaLimitCache struct {
	mu    sync.RWMutex
	limit map[string]int // model name → max context tokens; nil until first Set on a zero value
}

// NewLocalOllamaLimitCache constructs a fresh, empty cache.
func NewLocalOllamaLimitCache() *LocalOllamaLimitCache {
	return &LocalOllamaLimitCache{limit: make(map[string]int)}
}

// Get returns the cached limit for model, or (0, false) when the model
// has not been recorded yet. On a miss the caller is expected to
// resolve the limit against the live daemon and store it with Set.
func (c *LocalOllamaLimitCache) Get(model string) (int, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	// Reading from a nil map is legal in Go and reports "not found",
	// so a zero-value cache needs no special handling here.
	v, ok := c.limit[model]
	return v, ok
}

// Set records a resolved limit for model, replacing any previous
// value. Idempotent; a no-op when n is <= 0 so that a failed or
// nonsensical resolution never shadows a later successful one.
func (c *LocalOllamaLimitCache) Set(model string, n int) {
	if n <= 0 {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	// Allocate lazily so a zero-value cache (declared without the
	// constructor) does not panic on its first write.
	if c.limit == nil {
		c.limit = make(map[string]int)
	}
	c.limit[model] = n
}