cbaf41f50c
Introduces an opt-in level-based reasoning toggle (low/medium/high) that each provider translates to its native parameter:

- Anthropic: thinking.budget_tokens (1024/8000/24000), with temperature forced to default and MaxTokens auto-grown above the budget.
- OpenAI/xAI/Groq via openaicompat: reasoning_effort string, gated by a new Rules.SupportsReasoning predicate so non-reasoning models don't receive the parameter. xAI uses Rules.MapReasoningEffort to remap "medium" to "high" since its API only accepts low|high.
- Google: thinking_config.thinking_budget + include_thoughts:true.
- DeepSeek: SupportsReasoning=false (reasoner is always-on; the reasoning_content trace was already extracted via openaicompat).

Reasoning content is surfaced as Response.Thinking on Complete and as StreamEventThinking deltas during streaming. Provider-side: extracted from Anthropic thinking content blocks, Google's part.Thought=true parts, and the non-standard reasoning_content field that DeepSeek and Groq emit (parsed out of raw JSON since openai-go doesn't type it).

Public API:

- llm.ReasoningLevel + ReasoningLow/Medium/High constants
- llm.WithReasoning(level) request option
- Model.WithReasoning(level) for baked-in defaults
- provider.Request.Reasoning, provider.Response.Thinking
- provider.StreamEventThinking

Tests cover Rules-based gating, MapReasoningEffort, reasoning_content extraction (Complete + Stream), Anthropic budget mapping, and temperature suppression when thinking is enabled. Existing behavior is unchanged when Reasoning is the empty string.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
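
A minimal usage sketch of the new option (the client value and the shape of the Complete call are assumptions for illustration; the llm identifiers come from this change):

    resp, err := client.Complete(ctx, messages, llm.WithReasoning(llm.ReasoningMedium))
    if err != nil {
        return err
    }
    if resp.Thinking != "" {
        fmt.Println("reasoning trace:", resp.Thinking)
    }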
108 lines
3.9 KiB
Go
package llm

// RequestOption configures a single completion request.
type RequestOption func(*requestConfig)

type requestConfig struct {
	tools       *ToolBox
	temperature *float64
	maxTokens   *int
	topP        *float64
	stop        []string
	cacheConfig *cacheConfig
	reasoning   ReasoningLevel
}
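
// How a Complete call might fold options into the config. requestConfig is
// internal, so this is an illustrative sketch of the standard
// functional-options pattern, not a quote of this package's implementation:
//
//	cfg := requestConfig{}
//	for _, opt := range opts {
//		opt(&cfg)
//	}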

// ReasoningLevel selects how much reasoning effort/budget the provider should
// spend before answering. Empty string is the default (no reasoning, identical
// to historical behavior). Each provider translates this to its native
// parameter; models that don't support reasoning silently ignore it.
type ReasoningLevel string

const (
	// ReasoningLow asks for a small amount of extra reasoning. Maps to
	// reasoning_effort="low" on OpenAI/xAI, ~1k thinking budget on
	// Anthropic/Google.
	ReasoningLow ReasoningLevel = "low"

	// ReasoningMedium asks for a moderate amount. Maps to
	// reasoning_effort="medium" on OpenAI, ~8k thinking budget on
	// Anthropic/Google. xAI remaps medium to its only other option, "high".
	ReasoningMedium ReasoningLevel = "medium"

	// ReasoningHigh asks for the most reasoning the provider exposes.
	// Maps to reasoning_effort="high" on OpenAI/xAI, ~24k thinking budget
	// on Anthropic/Google.
	ReasoningHigh ReasoningLevel = "high"
)
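
// How a provider might implement the xAI remap noted above. The change adds
// Rules.MapReasoningEffort for this; the body below is an assumed sketch,
// not the actual implementation:
//
//	func mapReasoningEffortForXAI(level ReasoningLevel) string {
//		if level == ReasoningMedium {
//			return string(ReasoningHigh) // xAI's API accepts only low|high
//		}
//		return string(level)
//	}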

// cacheConfig holds prompt-caching settings. nil = disabled.
type cacheConfig struct {
	enabled bool
}

// WithTools attaches a toolbox to the request.
func WithTools(tb *ToolBox) RequestOption {
	return func(c *requestConfig) { c.tools = tb }
}

// WithTemperature sets the sampling temperature.
func WithTemperature(t float64) RequestOption {
	return func(c *requestConfig) { c.temperature = &t }
}

// WithMaxTokens sets the maximum number of tokens to generate.
func WithMaxTokens(n int) RequestOption {
	return func(c *requestConfig) { c.maxTokens = &n }
}

// WithTopP sets the nucleus sampling parameter.
func WithTopP(p float64) RequestOption {
	return func(c *requestConfig) { c.topP = &p }
}

// WithStop sets stop sequences.
func WithStop(sequences ...string) RequestOption {
	return func(c *requestConfig) { c.stop = sequences }
}

// WithReasoning asks the model to spend extra reasoning budget on the
// response. Each provider maps the level to its native shape:
//
//   - Anthropic: thinking.budget_tokens (low ~ 1024, medium ~ 8000, high ~ 24000)
//   - OpenAI / xAI / Groq: reasoning_effort string (xAI remaps medium to high)
//   - Google: thinking_config.thinking_budget (same budget as Anthropic)
//   - DeepSeek (reasoner): always-on regardless; this option is a no-op
//   - Models without reasoning support: silently ignored
//
// Reasoning content (when surfaced by the provider) appears on
// Response.Thinking, and is also streamed as StreamEventThinking events.
func WithReasoning(level ReasoningLevel) RequestOption {
	return func(c *requestConfig) { c.reasoning = level }
}
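
// Streaming sketch: thinking deltas arrive as provider.StreamEventThinking
// events per the change description. The stream handle and event field names
// below are assumptions for illustration:
//
//	for ev := range stream.Events() {
//		if ev.Type == provider.StreamEventThinking {
//			fmt.Print(ev.Text) // incremental reasoning trace
//		}
//	}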

// WithPromptCaching enables automatic prompt-caching markers on providers
// that support it (currently Anthropic). On providers that don't support
// explicit cache markers (OpenAI, Google), this is a no-op.
//
// When enabled, the library places cache breakpoints at natural seams:
//   - the last tool definition (caches all tools)
//   - the last system message (caches tools + system)
//   - the last non-system message in the history (caches tools + system +
//     conversation so far)
//
// Breakpoints are placed only when the corresponding section is non-empty.
// Up to 3 markers are emitted per request, leaving one of Anthropic's 4
// marker slots for future use.
//
// Cache hits give a 90% discount on cached input tokens (5-minute ephemeral
// tier). Cache writes cost 25% more than normal input tokens, so this option
// is only worth enabling for prompts whose cacheable prefix exceeds the
// minimum (1024 tokens on Opus/Sonnet, 2048 tokens on Haiku) AND is re-sent
// at least twice within the 5-minute TTL.
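//
// Break-even sketch (illustrative arithmetic from the rates above): a
// cacheable prefix of P tokens re-sent N times costs 1.25*P for the initial
// write plus 0.10*P per subsequent hit, versus N*P uncached:
//
//	N=2: 1.25P + 0.10P = 1.35P vs 2.00P (~33% saved)
//	N=5: 1.25P + 0.40P = 1.65P vs 5.00P (~67% saved)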
func WithPromptCaching() RequestOption {
	return func(c *requestConfig) {
		c.cacheConfig = &cacheConfig{enabled: true}
	}
}