package llm // RequestOption configures a single completion request. type RequestOption func(*requestConfig) type requestConfig struct { tools *ToolBox temperature *float64 maxTokens *int topP *float64 stop []string cacheConfig *cacheConfig reasoning ReasoningLevel } // ReasoningLevel selects how much reasoning effort/budget the provider should // spend before answering. Empty string is the default (no reasoning, identical // to historical behavior). Each provider translates this to its native // parameter; models that don't support reasoning silently ignore it. type ReasoningLevel string const ( // ReasoningLow asks for a small amount of extra reasoning. Maps to // reasoning_effort="low" on OpenAI/xAI, ~1k thinking budget on // Anthropic/Google. ReasoningLow ReasoningLevel = "low" // ReasoningMedium asks for a moderate amount. Maps to reasoning_effort // ="medium" on OpenAI, ~8k thinking budget on Anthropic/Google. xAI // remaps medium to its only-other-option, "high". ReasoningMedium ReasoningLevel = "medium" // ReasoningHigh asks for the most reasoning the provider exposes. // Maps to reasoning_effort="high" on OpenAI/xAI, ~24k thinking budget // on Anthropic/Google. ReasoningHigh ReasoningLevel = "high" ) // cacheConfig holds prompt-caching settings. nil = disabled. type cacheConfig struct { enabled bool } // WithTools attaches a toolbox to the request. func WithTools(tb *ToolBox) RequestOption { return func(c *requestConfig) { c.tools = tb } } // WithTemperature sets the sampling temperature. func WithTemperature(t float64) RequestOption { return func(c *requestConfig) { c.temperature = &t } } // WithMaxTokens sets the maximum number of tokens to generate. func WithMaxTokens(n int) RequestOption { return func(c *requestConfig) { c.maxTokens = &n } } // WithTopP sets the nucleus sampling parameter. func WithTopP(p float64) RequestOption { return func(c *requestConfig) { c.topP = &p } } // WithStop sets stop sequences. 
func WithStop(sequences ...string) RequestOption {
	return func(c *requestConfig) {
		// Copy at the API boundary: when the caller invokes
		// WithStop(s...), the variadic slice aliases s, so storing it
		// directly would let a later mutation of s silently change
		// this request's stop sequences. Appending to a nil slice
		// preserves nil-ness for the zero-argument call.
		c.stop = append([]string(nil), sequences...)
	}
}

// WithReasoning asks the model to spend extra reasoning budget on the
// response. Each provider maps the level to its native shape:
//
//   - Anthropic: thinking.budget_tokens (low ~ 1024, medium ~ 8000, high ~ 24000)
//   - OpenAI / xAI / Groq: reasoning_effort string (xAI remaps medium to high)
//   - Google: thinking_config.thinking_budget (same budget as Anthropic)
//   - DeepSeek (reasoner): always-on regardless; this option is a no-op
//   - Models without reasoning support: silently ignored
//
// Reasoning content (when surfaced by the provider) appears on
// Response.Thinking, and is also streamed as StreamEventThinking events.
func WithReasoning(level ReasoningLevel) RequestOption {
	return func(c *requestConfig) {
		c.reasoning = level
	}
}

// WithPromptCaching enables automatic prompt-caching markers on providers
// that support it (currently Anthropic). On providers that don't support
// explicit cache markers (OpenAI, Google), this is a no-op.
//
// When enabled, the library places cache breakpoints at natural seams:
//   - the last tool definition (caches all tools)
//   - the last system message (caches tools + system)
//   - the last non-system message in the history (caches tools + system +
//     conversation so far)
//
// Breakpoints are placed only when the corresponding section is non-empty.
// Up to 3 markers are emitted per request, leaving one of Anthropic's 4
// marker slots for future use.
//
// Cache hits give a 90% discount on cached input tokens (5-minute ephemeral
// tier). Cache writes cost 25% more than normal input tokens, so this option
// is only worth enabling for prompts whose cacheable prefix exceeds the
// minimum (1024 tokens on Opus/Sonnet, 2048 tokens on Haiku) AND is re-sent
// at least twice within the 5-minute TTL.
func WithPromptCaching() RequestOption {
	return func(c *requestConfig) {
		c.cacheConfig = &cacheConfig{enabled: true}
	}
}