b424261aca
Lifts mort's pkg/logic/llms into executus/model, decoupled from mort: - tiers.go: the tier resolver now reads a host-supplied config.Source under "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults, ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the resolution mechanism (cache, reasoning-suffix dialect, chain validation) is generic. No tier names hard-coded in the harness. - sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into UsageSink / TraceSink seams + a model-owned Span, with nil-safe context attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser). Both sinks optional (nil = off) so a light host records nothing. - lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf. - call.go keeps GenerateWith[T] (instrumented structured output) — this is the structured-output primitive; no separate structured/ package. - llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry + ledger). Its tests configure a minimal tier table via TestMain. New tests cover the inversion: config overrides fallback, tier registration, reasoning-suffix survival, nested-tier rejection, nil-sink no-ops. Full module: go build/vet/test -race green; core go.sum still free of gorm/redis/discordgo/sqlite. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
225 lines
8.1 KiB
Go
225 lines
8.1 KiB
Go
// V15.2 — per-model context-window limits.
|
|
//
|
|
// Why: agents need to know when they're about to blow the model's
|
|
// max-input cap so they can compact stale tool results out of the
|
|
// message history. Pre-15.2 the agent loop had no awareness; a long
|
|
// research run that accumulated dozens of HTTP tool results would
|
|
// hit Ollama's HTTP 400 "prompt is too long" or Anthropic's similar
|
|
// error mid-run with no graceful degradation.
|
|
//
|
|
// Coverage:
|
|
// - Anthropic Claude 4.x (200K default; 1M when the model ID
|
|
// includes the "[1m]" suffix per llms.tier reload conventions)
|
|
//   - OpenAI GPT-4.x / GPT-5 / o-series (128K-400K, model-specific)
|
|
// - Gemini 2.x (1M-2M, model-specific)
|
|
// - Ollama Cloud (model-specific; hardcoded per known model)
|
|
//   - Local Ollama: queries `/api/show` once at first use, caches the result
|
|
//
|
|
// Returns (0, false) for unknown models — callers should treat
|
|
// "unknown" as "don't budget" (the agent's existing iteration cap +
|
|
// timeout are the fallback safety nets).
|
|
|
|
package model
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// MaxContextTokens returns the model's max INPUT context-window size
|
|
// in tokens. Output / response tokens are NOT included — most models
|
|
// share input + output budget but cap them separately, and the practical
|
|
// concern is "how big can my prompt get before the model rejects".
|
|
//
|
|
// modelID accepts both the bare model name (`claude-sonnet-4-6`) and
|
|
// the prefixed form (`anthropic/claude-sonnet-4-6` or
|
|
// `ollama-cloud/qwen3-coder:480b`). The prefix is stripped before lookup.
|
|
//
|
|
// Returns (limit, true) on a known model; (0, false) otherwise.
|
|
//
|
|
// This function is pure (no I/O). For Ollama Cloud models that aren't
|
|
// in the static map, use MaxContextTokensWithCache which consults a
|
|
// CloudOllamaLimitCache populated at boot from /api/tags + /api/show.
|
|
func MaxContextTokens(modelID string) (int, bool) {
|
|
id := normalizeModelID(modelID)
|
|
if v, ok := staticContextLimits[id]; ok {
|
|
return v, true
|
|
}
|
|
// Anthropic 1M-context variant marker. Mort's llms tier system
|
|
// uses a `[1m]` suffix on the model ID (e.g.
|
|
// `claude-opus-4-7[1m]`) to opt into Anthropic's 1M beta context.
|
|
if strings.HasSuffix(id, "[1m]") {
|
|
return 1_000_000, true
|
|
}
|
|
// Local-ollama dynamic lookup is wired separately so it can
|
|
// query the daemon's /api/show endpoint. The static map covers
|
|
// known cloud models.
|
|
return 0, false
|
|
}
|
|
|
|
// MaxContextTokensWithCache is the cache-aware variant of
|
|
// MaxContextTokens. It tries the static map first; on miss, if the
|
|
// model is an Ollama Cloud spec (the `ollama-cloud/` prefix), it
|
|
// consults the supplied CloudOllamaLimitCache. Pass nil cache for
|
|
// static-only behaviour (equivalent to MaxContextTokens).
|
|
//
|
|
// This function never makes HTTP calls — the cache must be
|
|
// pre-populated (typically via cache.RefreshAll at boot). Callers in
|
|
// the hot path can rely on a single map lookup per call. Prefer
|
|
// MaxContextTokensResolving when a context is available — it makes a
|
|
// single /api/show call to fill the cache on miss, which is essential
|
|
// for Cloud aliases that /api/tags doesn't enumerate (e.g. :cloud).
|
|
func MaxContextTokensWithCache(modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
|
|
if v, ok := MaxContextTokens(modelID); ok {
|
|
return v, true
|
|
}
|
|
if cloud == nil {
|
|
return 0, false
|
|
}
|
|
// Only ollama-cloud/* models are eligible for the cache.
|
|
id := strings.TrimSpace(modelID)
|
|
if !strings.HasPrefix(id, "ollama-cloud/") {
|
|
// Also allow bare model:tag form when the caller has already
|
|
// stripped the prefix (some test paths).
|
|
if strings.Contains(id, "/") {
|
|
return 0, false
|
|
}
|
|
}
|
|
return cloud.Lookup(id)
|
|
}
|
|
|
|
// MaxContextTokensResolving is the cache-aware variant that ALSO
|
|
// performs a live /api/show fetch on cache miss (with negative caching
|
|
// to prevent thrash). Use this in run-setup paths where one HTTP call
|
|
// per unseen model is acceptable — typically the skill executor's
|
|
// compaction threshold computation. The fetched result is cached for
|
|
// future calls, so subsequent runs hit the in-memory map.
|
|
//
|
|
// Falls back to the static-only path when the model isn't an
|
|
// ollama-cloud/* spec or cache is nil. ctx cancellation aborts the
|
|
// fetch and returns (0, false) without writing a negative entry.
|
|
func MaxContextTokensResolving(ctx context.Context, modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
|
|
if v, ok := MaxContextTokens(modelID); ok {
|
|
return v, true
|
|
}
|
|
if cloud == nil {
|
|
return 0, false
|
|
}
|
|
id := strings.TrimSpace(modelID)
|
|
if !strings.HasPrefix(id, "ollama-cloud/") {
|
|
if strings.Contains(id, "/") {
|
|
return 0, false
|
|
}
|
|
}
|
|
return cloud.LookupOrFetch(ctx, id)
|
|
}
|
|
|
|
// normalizeModelID reduces a model spec to the base name used as the
// lookup key, so callers may pass any of the accepted forms.
//
// Steps, in order:
//  1. surrounding whitespace is trimmed;
//  2. everything up to and including the FIRST "/" (the provider prefix)
//     is dropped;
//  3. a single trailing reasoning-effort suffix, one of ":low", ":medium"
//     or ":high", is removed.
//
// Examples:
//   - "anthropic/claude-sonnet-4-6"   → "claude-sonnet-4-6"
//   - "ollama-cloud/qwen3-coder:480b" → "qwen3-coder:480b"
//   - "claude-opus-4-7:high"          → "claude-opus-4-7"
func normalizeModelID(id string) string {
	name := strings.TrimSpace(id)
	if _, rest, hasPrefix := strings.Cut(name, "/"); hasPrefix {
		name = rest
	}

	// The effort words contain no ':' themselves, so a recognised suffix
	// always begins at the last colon. Other tags such as ":480b" fall
	// through the switch untouched.
	if colon := strings.LastIndexByte(name, ':'); colon >= 0 {
		switch name[colon:] {
		case ":low", ":medium", ":high":
			name = name[:colon]
		}
	}
	return name
}
|
|
|
|
// staticContextLimits is the source of truth for known cloud models,
// keyed by the normalized model ID (no provider prefix, no reasoning
// suffix). Add new entries when adding a model to the llms tier system.
//
// CRITICAL: keep these in sync with the actual provider docs. A wrong
// number here causes EITHER premature compaction (too low, degrades
// agent quality unnecessarily) OR HTTP 400 mid-run (too high). The
// 410K-token failure on `qwen3-coder:480b` is the kind of bug a
// mistyped value would reintroduce. When in doubt, round DOWN.
var staticContextLimits = map[string]int{
	// Anthropic Claude 4.x — default 200K input. The 1M variant is
	// selected with a `[1m]` suffix, handled in MaxContextTokens.
	"claude-opus-4-7":           200_000,
	"claude-opus-4-6":           200_000,
	"claude-opus-4-5":           200_000,
	"claude-sonnet-4-6":         200_000,
	"claude-sonnet-4-5":         200_000,
	"claude-haiku-4-5":          200_000,
	"claude-haiku-4-5-20251001": 200_000,

	// OpenAI — model-specific: 128K for the GPT-4 family and o1-mini,
	// 200K for o1 / o3-mini, 400K for the GPT-5 family.
	"gpt-4o":      128_000,
	"gpt-4o-mini": 128_000,
	"gpt-4-turbo": 128_000,
	"o1":          200_000,
	"o1-mini":     128_000,
	"o3-mini":     200_000,
	"gpt-5":       400_000,
	"gpt-5-mini":  400_000,

	// Gemini — varies by model. The 2.5 family (including 2.5 Pro) is
	// published with a 1,048,576-token input limit; only 1.5 Pro has the
	// 2M window. Values are rounded down to stay under the real cap.
	"gemini-2.5-pro":        1_000_000,
	"gemini-2.5-flash":      1_000_000,
	"gemini-2.5-flash-lite": 1_000_000,
	"gemini-1.5-pro":        2_000_000,
	"gemini-1.5-flash":      1_000_000,

	// Ollama Cloud (turbo). Limits per https://ollama.com/cloud/models
	// — verified against the Ollama API show output for each model.
	// Update when Ollama publishes new models or extends contexts.
	"qwen3-coder:480b":   262_144, // 262K — matches the v15.2 trace
	"qwen3:235b":         262_144,
	"qwen3:32b":          131_072,
	"qwen2.5:72b":        131_072,
	"gpt-oss:120b":       131_072,
	"gpt-oss:20b":        131_072,
	"deepseek-v3.1:671b": 131_072,
	"glm-4.6:355b":       131_072,
	"kimi-k2:1t":         262_144,
	"llama4:scout":       10_000_000, // Llama 4 Scout claims 10M
	"llama4:maverick":    1_000_000,
}
|
|
|
|
// LocalOllamaLimitCache holds the resolved /api/show context_length per
// local-ollama model. Entries are added via Set and never invalidated
// (changing num_ctx requires an ollama restart anyway). Process-wide,
// no per-tenant scoping needed.
//
// All methods are guarded by an internal RWMutex. The zero value is an
// empty, ready-to-use cache; because the struct embeds a mutex it must be
// shared by pointer and never copied after first use.
type LocalOllamaLimitCache struct {
	mu    sync.RWMutex
	limit map[string]int // model name → context length in tokens; nil until the first Set on a zero value
}

// NewLocalOllamaLimitCache constructs a fresh, empty cache.
func NewLocalOllamaLimitCache() *LocalOllamaLimitCache {
	return &LocalOllamaLimitCache{limit: make(map[string]int)}
}

// Get returns the cached limit for model, or (0, false) when the model
// has not been recorded. The caller is expected to follow up with a
// lookup against the live daemon on a miss.
func (c *LocalOllamaLimitCache) Get(model string) (int, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	// Reading from a nil map is well-defined in Go, so a zero-value
	// cache simply reports a miss here.
	v, ok := c.limit[model]
	return v, ok
}

// Set records a resolved limit for model, overwriting any previous value.
// Idempotent; a no-op when n is <= 0 so that failed or empty resolutions
// are never stored as real limits.
func (c *LocalOllamaLimitCache) Set(model string, n int) {
	if n <= 0 {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	// Allocate lazily so a cache declared as a zero value (rather than
	// built with NewLocalOllamaLimitCache) does not panic on a write to
	// a nil map.
	if c.limit == nil {
		c.limit = make(map[string]int)
	}
	c.limit[model] = n
}
|