Files
executus/model/context_limits.go
steve b424261aca
executus CI / test (pull_request) Successful in 58s
Adversarial Review (Gadfly) / review (pull_request) Successful in 26m27s
executus CI / test (push) Successful in 1m2s
P1: model layer (convar->config inversion) + llmmeta
Lifts mort's pkg/logic/llms into executus/model, decoupled from mort:

- tiers.go: the tier resolver now reads a host-supplied config.Source under
  "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults,
  ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the
  resolution mechanism (cache, reasoning-suffix dialect, chain validation) is
  generic. No tier names hard-coded in the harness.
- sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into
  UsageSink / TraceSink seams + a model-owned Span, with nil-safe context
  attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser).
  Both sinks optional (nil = off) so a light host records nothing.
- lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf.
- call.go keeps GenerateWith[T] (instrumented structured output) — this is the
  structured-output primitive; no separate structured/ package.
- llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry
  + ledger). Its tests configure a minimal tier table via TestMain.

New tests cover the inversion: config overrides fallback, tier registration,
reasoning-suffix survival, nested-tier rejection, nil-sink no-ops.

Full module: go build/vet/test -race green; core go.sum still free of
gorm/redis/discordgo/sqlite.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 19:47:13 -04:00

225 lines
8.1 KiB
Go

// V15.2 — per-model context-window limits.
//
// Why: agents need to know when they're about to blow the model's
// max-input cap so they can compact stale tool results out of the
// message history. Pre-15.2 the agent loop had no awareness; a long
// research run that accumulated dozens of HTTP tool results would
// hit Ollama's HTTP 400 "prompt is too long" or Anthropic's similar
// error mid-run with no graceful degradation.
//
// Coverage:
//   - Anthropic Claude 4.x (200K default; 1M when the model ID
//     includes the "[1m]" suffix per llms.tier reload conventions)
//   - OpenAI GPT-4.x / GPT-5 / o-series (128K and up, model-specific)
//   - Gemini 1.5 / 2.x (1M-2M, model-specific)
//   - Ollama Cloud (model-specific; hardcoded per known model)
//   - Local Ollama: queries `/api/show` once at first use and caches
//     the result
//
// Returns (0, false) for unknown models — callers should treat
// "unknown" as "don't budget" (the agent's existing iteration cap +
// timeout are the fallback safety nets).
package model
import (
"context"
"strings"
"sync"
)
// MaxContextTokens reports the largest INPUT context window, in tokens,
// known for modelID. Output / response tokens are deliberately left out:
// most models share one budget between input and output but cap each
// side separately, and the question callers care about is "how large
// may the prompt grow before the provider rejects it".
//
// modelID may be a bare model name (`claude-sonnet-4-6`) or carry a
// provider prefix (`anthropic/claude-sonnet-4-6`,
// `ollama-cloud/qwen3-coder:480b`); it is normalized with
// normalizeModelID before the lookup.
//
// Resolution order:
//  1. an exact hit in staticContextLimits;
//  2. any normalized ID ending in "[1m]" — the marker Mort's llms tier
//     system appends (e.g. `claude-opus-4-7[1m]`) to opt into
//     Anthropic's 1M beta context — resolves to 1,000,000;
//  3. otherwise the model is unknown.
//
// The second result is true for a known model and false otherwise, in
// which case the limit is 0.
//
// The function is pure (no I/O). Local-ollama limits are resolved
// elsewhere because they require querying the daemon; Ollama Cloud
// models missing from the static table are handled by
// MaxContextTokensWithCache.
func MaxContextTokens(modelID string) (int, bool) {
	key := normalizeModelID(modelID)
	limit, known := staticContextLimits[key]
	switch {
	case known:
		return limit, true
	case strings.HasSuffix(key, "[1m]"):
		// Checked only after the static table so an explicit table
		// entry always wins over the generic 1M marker.
		return 1_000_000, true
	default:
		return 0, false
	}
}
// MaxContextTokensWithCache extends MaxContextTokens with a lookup in
// a CloudOllamaLimitCache. The static resolution runs first; only when
// it misses, and cloud is non-nil, is the cache consulted. A nil cloud
// therefore yields exactly the MaxContextTokens result.
//
// The cache is consulted for two shapes of modelID (after trimming
// surrounding whitespace):
//   - specs starting with `ollama-cloud/`, and
//   - bare `model:tag` names containing no "/" at all, for callers that
//     have already stripped the prefix (some test paths).
//
// Any other provider-prefixed ID is reported as unknown. The trimmed
// ID is handed to cloud.Lookup unchanged (prefix included).
//
// This function itself issues no HTTP requests; it relies on the cache
// having been filled beforehand (typically via cache.RefreshAll at
// boot). When a context is available prefer MaxContextTokensResolving,
// which can fill the cache on a miss — needed for Cloud aliases that
// /api/tags does not enumerate (e.g. :cloud).
func MaxContextTokensWithCache(modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
	if limit, known := MaxContextTokens(modelID); known {
		return limit, true
	}
	if cloud == nil {
		return 0, false
	}

	spec := strings.TrimSpace(modelID)
	isCloudSpec := strings.HasPrefix(spec, "ollama-cloud/")
	// A "/" without the ollama-cloud prefix means some other provider's
	// model; the cloud cache has nothing to say about it.
	if !isCloudSpec && strings.Contains(spec, "/") {
		return 0, false
	}
	return cloud.Lookup(spec)
}
// MaxContextTokensResolving behaves like MaxContextTokensWithCache but
// resolves cache misses through cloud.LookupOrFetch, which is expected
// to perform a live /api/show fetch (with negative caching to prevent
// thrash) and remember the answer for later calls. Use it in run-setup
// paths where one HTTP call per unseen model is acceptable — typically
// the skill executor's compaction-threshold computation.
//
// The static table is tried first. If it misses and cloud is nil, or
// modelID is a provider-prefixed spec other than `ollama-cloud/...`,
// the result is (0, false) and no fetch is attempted. Bare names
// without any "/" are still forwarded to the cache.
//
// ctx is passed straight to LookupOrFetch; per that method's contract,
// cancellation aborts the fetch and returns (0, false) without writing
// a negative entry.
func MaxContextTokensResolving(ctx context.Context, modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
	limit, known := MaxContextTokens(modelID)
	switch {
	case known:
		return limit, true
	case cloud == nil:
		return 0, false
	}

	spec := strings.TrimSpace(modelID)
	// Only ollama-cloud/* specs and prefix-less names are eligible.
	if strings.Contains(spec, "/") && !strings.HasPrefix(spec, "ollama-cloud/") {
		return 0, false
	}
	return cloud.LookupOrFetch(ctx, spec)
}
// normalizeModelID reduces a model spec to the base name used as the
// lookup key, so callers may pass either the bare or the decorated form.
//
// Steps, in order:
//  1. surrounding whitespace is trimmed;
//  2. everything up to and including the FIRST "/" (the provider
//     prefix) is dropped;
//  3. at most one trailing reasoning-effort suffix — ":low", ":medium"
//     or ":high", as used by some OpenAI / Anthropic clients — is
//     removed.
//
// Examples:
//   - "anthropic/claude-sonnet-4-6"   → "claude-sonnet-4-6"
//   - "ollama-cloud/qwen3-coder:480b" → "qwen3-coder:480b"
//   - "claude-opus-4-7:high"          → "claude-opus-4-7"
func normalizeModelID(id string) string {
	name := strings.TrimSpace(id)
	if _, afterPrefix, hasPrefix := strings.Cut(name, "/"); hasPrefix {
		name = afterPrefix
	}

	// The three suffixes are mutually exclusive, so at most one case
	// fires and exactly one suffix is removed.
	switch {
	case strings.HasSuffix(name, ":low"):
		return strings.TrimSuffix(name, ":low")
	case strings.HasSuffix(name, ":medium"):
		return strings.TrimSuffix(name, ":medium")
	case strings.HasSuffix(name, ":high"):
		return strings.TrimSuffix(name, ":high")
	}
	return name
}
// staticContextLimits is the source of truth for known cloud models,
// keyed by the normalized model name (no provider prefix, no reasoning
// suffix). Each value is the maximum number of INPUT tokens the model
// accepts. Add new entries when adding a model to the llms tier system.
//
// CRITICAL: keep these in sync with the actual provider docs. A wrong
// number here causes EITHER premature compaction (too low, degrades
// agent quality unnecessarily) OR HTTP 400 mid-run (too high). The
// 410K-token failure on `qwen3-coder:480b` is the kind of bug a
// mistyped value would reintroduce. When in doubt, err on the low side.
var staticContextLimits = map[string]int{
	// Anthropic Claude 4.x — default 200K input. 1M variant via
	// `[1m]` suffix handled in MaxContextTokens.
	"claude-opus-4-7":           200_000,
	"claude-opus-4-6":           200_000,
	"claude-opus-4-5":           200_000,
	"claude-sonnet-4-6":         200_000,
	"claude-sonnet-4-5":         200_000,
	"claude-haiku-4-5":          200_000,
	"claude-haiku-4-5-20251001": 200_000,
	// OpenAI GPT-4.x — 128K input.
	"gpt-4o":      128_000,
	"gpt-4o-mini": 128_000,
	"gpt-4-turbo": 128_000,
	// OpenAI o-series.
	"o1":      200_000,
	"o1-mini": 128_000,
	"o3-mini": 200_000,
	// OpenAI GPT-5 — the advertised 400K is the TOTAL context; the API
	// accepts at most 272K input tokens (the remaining 128K is reserved
	// for reasoning + output), so the input cap is what belongs here.
	"gpt-5":      272_000,
	"gpt-5-mini": 272_000,
	// Gemini — varies by model. The 2.5 family accepts 1,048,576 input
	// tokens (Flash entries are kept at a conservative round 1M); only
	// 1.5 Pro has the 2M window.
	"gemini-2.5-pro":        1_048_576,
	"gemini-2.5-flash":      1_000_000,
	"gemini-2.5-flash-lite": 1_000_000,
	"gemini-1.5-pro":        2_000_000,
	"gemini-1.5-flash":      1_000_000,
	// Ollama Cloud (turbo). Limits per https://ollama.com/cloud/models
	// — verified against the Ollama API show output for each model.
	// Update when Ollama publishes new models or extends contexts.
	"qwen3-coder:480b":   262_144, // 262K — matches the v15.2 trace
	"qwen3:235b":         262_144,
	"qwen3:32b":          131_072,
	"qwen2.5:72b":        131_072,
	"gpt-oss:120b":       131_072,
	"gpt-oss:20b":        131_072,
	"deepseek-v3.1:671b": 131_072,
	"glm-4.6:355b":       131_072,
	"kimi-k2:1t":         262_144,
	"llama4:scout":       10_000_000, // Llama 4 Scout claims 10M
	"llama4:maverick":    1_000_000,
}
// LocalOllamaLimitCache holds the resolved /api/show context_length per
// local-ollama model. The cache itself performs no I/O: callers record
// a limit with Set after resolving it and read it back with Get.
// Entries are never invalidated (changing num_ctx requires an ollama
// restart anyway). Intended to be process-wide, with no per-tenant
// scoping.
//
// All methods are guarded by an internal RWMutex. The zero value is an
// empty cache that is ready to use; because the struct embeds a mutex
// it must not be copied after first use — share it by pointer.
type LocalOllamaLimitCache struct {
	mu sync.RWMutex
	// limit maps model name → context length in tokens. It may be nil
	// on a zero-value cache until the first successful Set.
	limit map[string]int
}

// NewLocalOllamaLimitCache constructs a fresh, empty cache.
func NewLocalOllamaLimitCache() *LocalOllamaLimitCache {
	return &LocalOllamaLimitCache{limit: make(map[string]int)}
}

// Get returns the cached limit for model, or (0, false) when the model
// has not been recorded. The caller is expected to follow up a miss
// with a lookup against the live daemon.
func (c *LocalOllamaLimitCache) Get(model string) (int, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	// Reading from a nil map is legal in Go and simply reports a miss,
	// so a zero-value cache needs no special handling here.
	v, ok := c.limit[model]
	return v, ok
}

// Set records n as the resolved limit for model, overwriting any
// previous value. Idempotent; no-op when n is <= 0, so a failed or
// nonsensical resolution never masks a later good one.
func (c *LocalOllamaLimitCache) Set(model string, n int) {
	if n <= 0 {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	// Writing to a nil map panics; allocate lazily so a cache declared
	// as `var c LocalOllamaLimitCache` (or `&LocalOllamaLimitCache{}`)
	// works without going through the constructor.
	if c.limit == nil {
		c.limit = make(map[string]int)
	}
	c.limit[model] = n
}