feat(v2): add ReasoningLevel option; thinking/reasoning across providers

Introduces an opt-in level-based reasoning toggle (low/medium/high) that
each provider translates to its native parameter:

- Anthropic: thinking.budget_tokens (1024/8000/24000), with temperature
  forced to default and MaxTokens auto-grown above the budget.
- OpenAI/xAI/Groq via openaicompat: reasoning_effort string, gated by a
  new Rules.SupportsReasoning predicate so non-reasoning models don't
  receive the parameter. xAI uses Rules.MapReasoningEffort to remap
  "medium" to "high" since its API only accepts low|high (see the sketch
  after this list).
- Google: thinking_config.thinking_budget + include_thoughts:true.
- DeepSeek: SupportsReasoning=false (reasoner is always-on; the
  reasoning_content trace was already extracted via openaicompat).
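
A rough sketch of how an openaicompat-backed provider might wire the two
new Rules hooks for xAI; the hook names come from this change, while the
function signatures and the model-name check are illustrative assumptions:

    rules := openaicompat.Rules{
        // Only attach reasoning_effort for models known to accept it
        // (hypothetical prefix check).
        SupportsReasoning: func(model string) bool {
            return strings.HasPrefix(model, "grok-3-mini")
        },
        // xAI's API accepts only low|high, so remap medium upward.
        MapReasoningEffort: func(level string) string {
            if level == "medium" {
                return "high"
            }
            return level
        },
    }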

Reasoning content is surfaced as Response.Thinking on Complete and as
StreamEventThinking deltas during streaming. Provider-side: extracted
from Anthropic thinking content blocks, Google's part.Thought=true
parts, and the non-standard reasoning_content field that DeepSeek and
Groq emit (parsed out of raw JSON since openai-go doesn't type it).
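
For illustration, a consumer-side sketch of the streaming surface. The
Stream signature and StreamEvent fields appear in the diff below; the
model id, channel ownership, and error handling here are assumptions:

    events := make(chan provider.StreamEvent)
    go func() {
        defer close(events) // assumes the caller owns the channel
        _ = p.Stream(ctx, provider.Request{
            Model:     "claude-sonnet-4-5", // illustrative model id
            Reasoning: "high",
        }, events)
    }()
    for ev := range events {
        switch ev.Type {
        case provider.StreamEventThinking:
            fmt.Print("[thinking] ", ev.Text)
        case provider.StreamEventText:
            fmt.Print(ev.Text)
        }
    }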

Public API (usage sketched below):
  - llm.ReasoningLevel + ReasoningLow/Medium/High constants
  - llm.WithReasoning(level) request option
  - Model.WithReasoning(level) for baked-in defaults
  - provider.Request.Reasoning, provider.Response.Thinking
  - provider.StreamEventThinking
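
A usage sketch of that surface; the client construction, Complete
signature, and response shape are assumed, and only the option names and
constants come from this commit:

    // Per-request: opt into deep reasoning for a single call.
    resp, err := client.Complete(ctx, prompt, llm.WithReasoning(llm.ReasoningHigh))
    if err != nil {
        return err
    }
    log.Println("thinking:", resp.Thinking) // reasoning trace, if any
    log.Println("answer:", resp.Text)

    // Per-model: bake a default level into the model handle.
    m := baseModel.WithReasoning(llm.ReasoningMedium)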

Tests cover Rules-based gating, MapReasoningEffort, reasoning_content
extraction (Complete + Stream), Anthropic budget mapping, and
temperature suppression when thinking is enabled. Existing behavior is
unchanged when Reasoning is the empty string.
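
The Anthropic budget mapping, for instance, reduces to a small table test
along these lines (a hypothetical shape; the real tests may be organized
differently):

    func TestThinkingBudget(t *testing.T) {
        cases := map[string]int{
            "":       0, // empty string means reasoning disabled
            "low":    1024,
            "medium": 8000,
            "high":   24000,
        }
        for level, want := range cases {
            if got := thinkingBudget(level); got != want {
                t.Errorf("thinkingBudget(%q) = %d, want %d", level, got, want)
            }
        }
    }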

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-25 03:58:42 +00:00
parent 34119e5a00
commit cbaf41f50c
16 changed files with 602 additions and 32 deletions
@@ -49,10 +49,20 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan
 	resp, err := cl.CreateMessagesStream(ctx, anth.MessagesStreamRequest{
 		MessagesRequest: anthReq,
 		OnContentBlockDelta: func(data anth.MessagesEventContentBlockDeltaData) {
-			if data.Delta.Type == "text_delta" && data.Delta.Text != nil {
-				events <- provider.StreamEvent{
-					Type: provider.StreamEventText,
-					Text: *data.Delta.Text,
-				}
+			switch data.Delta.Type {
+			case anth.MessagesContentTypeTextDelta:
+				if data.Delta.Text != nil {
+					events <- provider.StreamEvent{
+						Type: provider.StreamEventText,
+						Text: *data.Delta.Text,
+					}
+				}
+			case anth.MessagesContentTypeThinkingDelta:
+				if data.Delta.MessageContentThinking != nil {
+					events <- provider.StreamEvent{
+						Type: provider.StreamEventThinking,
+						Text: data.Delta.Thinking,
+					}
+				}
 			}
 		},
@@ -71,6 +81,28 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan
 	return nil
 }
 
+// Thinking budgets used by Anthropic for low/medium/high reasoning levels.
+// Must each be >= 1024 (Anthropic minimum) and strictly less than MaxTokens.
+const (
+	thinkingBudgetLow    = 1024
+	thinkingBudgetMedium = 8000
+	thinkingBudgetHigh   = 24000
+)
+
+// thinkingBudget returns the Anthropic budget_tokens value for a go-llm
+// ReasoningLevel string. Returns 0 to mean "no thinking" / pass-through.
+func thinkingBudget(level string) int {
+	switch level {
+	case "low":
+		return thinkingBudgetLow
+	case "medium":
+		return thinkingBudgetMedium
+	case "high":
+		return thinkingBudgetHigh
+	}
+	return 0
+}
+
 func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
 	anthReq := anth.MessagesRequest{
 		Model: anth.Model(req.Model),
@@ -81,6 +113,20 @@ func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
 		anthReq.MaxTokens = *req.MaxTokens
 	}
 
+	// Extended thinking. Setting Thinking forces temperature to be unset
+	// (Anthropic only allows the default of 1.0) and requires MaxTokens to
+	// strictly exceed BudgetTokens. We grow MaxTokens if the caller's value
+	// is too small, so callers don't have to reason about budget arithmetic.
+	if budget := thinkingBudget(req.Reasoning); budget > 0 {
+		anthReq.Thinking = &anth.Thinking{
+			Type:         anth.ThinkingTypeEnabled,
+			BudgetTokens: budget,
+		}
+		if anthReq.MaxTokens <= budget {
+			anthReq.MaxTokens = budget + 4096
+		}
+	}
+
 	var msgs []anth.Message
 	var systemText string
@@ -259,7 +305,11 @@ func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
 		}
 	}
 
-	if req.Temperature != nil {
+	// Anthropic rejects a non-default temperature when extended thinking is
+	// enabled. Drop the caller's value silently in that case rather than
+	// erroring — the alternative is forcing every caller to reset
+	// temperature when they enable thinking.
+	if req.Temperature != nil && anthReq.Thinking == nil {
 		f := float32(*req.Temperature)
 		anthReq.Temperature = &f
 	}
@@ -307,6 +357,7 @@ func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
 func (p *Provider) convertResponse(resp anth.MessagesResponse) provider.Response {
 	var res provider.Response
 	var textParts []string
+	var thinkingParts []string
 
 	for _, block := range resp.Content {
 		switch block.Type {
@@ -314,6 +365,10 @@ func (p *Provider) convertResponse(resp anth.MessagesResponse) provider.Response
 			if block.Text != nil {
 				textParts = append(textParts, *block.Text)
 			}
+		case anth.MessagesContentTypeThinking:
+			if block.MessageContentThinking != nil {
+				thinkingParts = append(thinkingParts, block.Thinking)
+			}
 		case anth.MessagesContentTypeToolUse:
 			if block.MessageContentToolUse != nil {
 				args, _ := json.Marshal(block.MessageContentToolUse.Input)
@@ -327,6 +382,7 @@ func (p *Provider) convertResponse(resp anth.MessagesResponse) provider.Response
 	}
 	res.Text = strings.Join(textParts, "")
+	res.Thinking = strings.Join(thinkingParts, "")
 	res.Usage = &provider.Usage{
 		InputTokens: resp.Usage.InputTokens,