feat: comprehensive token usage tracking for V2
Add provider-specific usage details, fix streaming usage, and return usage from all high-level APIs (Chat.Send, Generate[T], Agent.Run). Breaking changes: - Chat.Send/SendMessage/SendWithImages now return (string, *Usage, error) - Generate[T]/GenerateWith[T] now return (T, *Usage, error) - Agent.Run/RunMessages now return (string, *Usage, error) New features: - Usage.Details map for provider-specific token breakdowns (reasoning, cached, audio, thoughts tokens) - OpenAI streaming now captures usage via StreamOptions.IncludeUsage - Google streaming now captures UsageMetadata from final chunk - UsageTracker.Details() for accumulated detail totals - ModelPricing and PricingRegistry for cost computation Closes #2 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
// coder.AsTool("code", "Write and run code"),
|
||||
// )),
|
||||
// )
|
||||
// result, err := orchestrator.Run(ctx, "Build a fibonacci function in Go")
|
||||
// result, _, err := orchestrator.Run(ctx, "Build a fibonacci function in Go")
|
||||
package agent
|
||||
|
||||
import (
|
||||
@@ -64,13 +64,15 @@ func New(model *llm.Model, system string, opts ...Option) *Agent {
|
||||
|
||||
// Run executes the agent with a user prompt. Each call is a fresh conversation.
|
||||
// The agent loops tool calls automatically until it produces a text response.
|
||||
func (a *Agent) Run(ctx context.Context, prompt string) (string, error) {
|
||||
// Returns the text response, accumulated token usage, and any error.
|
||||
func (a *Agent) Run(ctx context.Context, prompt string) (string, *llm.Usage, error) {
|
||||
return a.RunMessages(ctx, []llm.Message{llm.UserMessage(prompt)})
|
||||
}
|
||||
|
||||
// RunMessages executes the agent with full message control.
|
||||
// Each call is a fresh conversation. The agent loops tool calls automatically.
|
||||
func (a *Agent) RunMessages(ctx context.Context, messages []llm.Message) (string, error) {
|
||||
// Returns the text response, accumulated token usage, and any error.
|
||||
func (a *Agent) RunMessages(ctx context.Context, messages []llm.Message) (string, *llm.Usage, error) {
|
||||
chat := llm.NewChat(a.model, a.reqOpts...)
|
||||
if a.system != "" {
|
||||
chat.SetSystem(a.system)
|
||||
@@ -107,7 +109,8 @@ type delegateParams struct {
|
||||
func (a *Agent) AsTool(name, description string) llm.Tool {
|
||||
return llm.Define[delegateParams](name, description,
|
||||
func(ctx context.Context, p delegateParams) (string, error) {
|
||||
return a.Run(ctx, p.Input)
|
||||
text, _, err := a.Run(ctx, p.Input)
|
||||
return text, err
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -29,15 +29,6 @@ func (m *mockProvider) Stream(ctx context.Context, req provider.Request, events
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockProvider) lastRequest() provider.Request {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if len(m.requests) == 0 {
|
||||
return provider.Request{}
|
||||
}
|
||||
return m.requests[len(m.requests)-1]
|
||||
}
|
||||
|
||||
func newMockModel(fn func(ctx context.Context, req provider.Request) (provider.Response, error)) *llm.Model {
|
||||
mp := &mockProvider{completeFunc: fn}
|
||||
return llm.NewClient(mp).Model("mock-model")
|
||||
@@ -53,7 +44,7 @@ func TestAgent_Run(t *testing.T) {
|
||||
model := newSimpleMockModel("Hello from agent!")
|
||||
a := New(model, "You are a helpful assistant.")
|
||||
|
||||
result, err := a.Run(context.Background(), "Say hello")
|
||||
result, _, err := a.Run(context.Background(), "Say hello")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -83,7 +74,7 @@ func TestAgent_Run_WithTools(t *testing.T) {
|
||||
})
|
||||
|
||||
a := New(model, "You are helpful.", WithTools(llm.NewToolBox(tool)))
|
||||
result, err := a.Run(context.Background(), "Use the greet tool")
|
||||
result, _, err := a.Run(context.Background(), "Use the greet tool")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -147,7 +138,7 @@ func TestAgent_AsTool_ParentChild(t *testing.T) {
|
||||
)),
|
||||
)
|
||||
|
||||
result, err := parent.Run(context.Background(), "Tell me about Go generics")
|
||||
result, _, err := parent.Run(context.Background(), "Tell me about Go generics")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -169,7 +160,7 @@ func TestAgent_RunMessages(t *testing.T) {
|
||||
llm.UserMessage("Follow up"),
|
||||
}
|
||||
|
||||
result, err := a.RunMessages(context.Background(), messages)
|
||||
result, _, err := a.RunMessages(context.Background(), messages)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -187,7 +178,7 @@ func TestAgent_ContextCancellation(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // Cancel immediately
|
||||
|
||||
_, err := a.Run(ctx, "This should fail")
|
||||
_, _, err := a.Run(ctx, "This should fail")
|
||||
if err == nil {
|
||||
t.Fatal("expected error from cancelled context")
|
||||
}
|
||||
@@ -204,7 +195,7 @@ func TestAgent_WithRequestOptions(t *testing.T) {
|
||||
WithRequestOptions(llm.WithTemperature(0.3), llm.WithMaxTokens(100)),
|
||||
)
|
||||
|
||||
_, err := a.Run(context.Background(), "test")
|
||||
_, _, err := a.Run(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -224,7 +215,7 @@ func TestAgent_Run_Error(t *testing.T) {
|
||||
})
|
||||
a := New(model, "You are helpful.")
|
||||
|
||||
_, err := a.Run(context.Background(), "test")
|
||||
_, _, err := a.Run(context.Background(), "test")
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
@@ -234,7 +225,7 @@ func TestAgent_EmptySystem(t *testing.T) {
|
||||
model := newSimpleMockModel("no system prompt")
|
||||
a := New(model, "") // Empty system prompt
|
||||
|
||||
result, err := a.Run(context.Background(), "test")
|
||||
result, _, err := a.Run(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -242,3 +233,34 @@ func TestAgent_EmptySystem(t *testing.T) {
|
||||
t.Errorf("unexpected result: %q", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgent_Run_ReturnsUsage(t *testing.T) {
|
||||
model := newMockModel(func(ctx context.Context, req provider.Request) (provider.Response, error) {
|
||||
return provider.Response{
|
||||
Text: "result",
|
||||
Usage: &provider.Usage{
|
||||
InputTokens: 100,
|
||||
OutputTokens: 50,
|
||||
TotalTokens: 150,
|
||||
},
|
||||
}, nil
|
||||
})
|
||||
|
||||
a := New(model, "You are helpful.")
|
||||
result, usage, err := a.Run(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if result != "result" {
|
||||
t.Errorf("expected 'result', got %q", result)
|
||||
}
|
||||
if usage == nil {
|
||||
t.Fatal("expected usage, got nil")
|
||||
}
|
||||
if usage.InputTokens != 100 {
|
||||
t.Errorf("expected input 100, got %d", usage.InputTokens)
|
||||
}
|
||||
if usage.OutputTokens != 50 {
|
||||
t.Errorf("expected output 50, got %d", usage.OutputTokens)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ func Example_researcher() {
|
||||
agent.WithRequestOptions(llm.WithTemperature(0.3)),
|
||||
)
|
||||
|
||||
result, err := researcher.Run(context.Background(), "What are the latest developments in Go generics?")
|
||||
result, _, err := researcher.Run(context.Background(), "What are the latest developments in Go generics?")
|
||||
if err != nil {
|
||||
fmt.Println("Error:", err)
|
||||
return
|
||||
@@ -50,7 +50,7 @@ func Example_coder() {
|
||||
)),
|
||||
)
|
||||
|
||||
result, err := coder.Run(context.Background(),
|
||||
result, _, err := coder.Run(context.Background(),
|
||||
"Create a Go program that prints the first 10 Fibonacci numbers. Save it and run it.")
|
||||
if err != nil {
|
||||
fmt.Println("Error:", err)
|
||||
@@ -97,7 +97,7 @@ func Example_orchestrator() {
|
||||
)),
|
||||
)
|
||||
|
||||
result, err := orchestrator.Run(context.Background(),
|
||||
result, _, err := orchestrator.Run(context.Background(),
|
||||
"Research how to implement a binary search tree in Go, then create one with insert and search operations.")
|
||||
if err != nil {
|
||||
fmt.Println("Error:", err)
|
||||
|
||||
@@ -270,6 +270,16 @@ func (p *Provider) convertResponse(resp anth.MessagesResponse) provider.Response
|
||||
OutputTokens: resp.Usage.OutputTokens,
|
||||
TotalTokens: resp.Usage.InputTokens + resp.Usage.OutputTokens,
|
||||
}
|
||||
details := map[string]int{}
|
||||
if resp.Usage.CacheCreationInputTokens > 0 {
|
||||
details[provider.UsageDetailCacheCreationTokens] = resp.Usage.CacheCreationInputTokens
|
||||
}
|
||||
if resp.Usage.CacheReadInputTokens > 0 {
|
||||
details[provider.UsageDetailCachedInputTokens] = resp.Usage.CacheReadInputTokens
|
||||
}
|
||||
if len(details) > 0 {
|
||||
res.Usage.Details = details
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
24
v2/chat.go
24
v2/chat.go
@@ -38,44 +38,50 @@ func (c *Chat) SetTools(tb *ToolBox) {
|
||||
c.tools = tb
|
||||
}
|
||||
|
||||
// Send sends a user message and returns the assistant's text response.
|
||||
// Send sends a user message and returns the assistant's text response along with
|
||||
// accumulated token usage from all iterations of the tool-call loop.
|
||||
// If the model calls tools, they are executed automatically and the loop
|
||||
// continues until the model produces a text response (the "agent loop").
|
||||
func (c *Chat) Send(ctx context.Context, text string) (string, error) {
|
||||
func (c *Chat) Send(ctx context.Context, text string) (string, *Usage, error) {
|
||||
return c.SendMessage(ctx, UserMessage(text))
|
||||
}
|
||||
|
||||
// SendWithImages sends a user message with images attached.
|
||||
func (c *Chat) SendWithImages(ctx context.Context, text string, images ...Image) (string, error) {
|
||||
func (c *Chat) SendWithImages(ctx context.Context, text string, images ...Image) (string, *Usage, error) {
|
||||
return c.SendMessage(ctx, UserMessageWithImages(text, images...))
|
||||
}
|
||||
|
||||
// SendMessage sends an arbitrary message and returns the final text response.
|
||||
// SendMessage sends an arbitrary message and returns the final text response along with
|
||||
// accumulated token usage from all iterations of the tool-call loop.
|
||||
// Handles the full tool-call loop automatically.
|
||||
func (c *Chat) SendMessage(ctx context.Context, msg Message) (string, error) {
|
||||
func (c *Chat) SendMessage(ctx context.Context, msg Message) (string, *Usage, error) {
|
||||
c.messages = append(c.messages, msg)
|
||||
|
||||
opts := c.buildOpts()
|
||||
|
||||
var totalUsage *Usage
|
||||
|
||||
for {
|
||||
resp, err := c.model.Complete(ctx, c.messages, opts...)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("completion failed: %w", err)
|
||||
return "", totalUsage, fmt.Errorf("completion failed: %w", err)
|
||||
}
|
||||
|
||||
totalUsage = addUsage(totalUsage, resp.Usage)
|
||||
|
||||
c.messages = append(c.messages, resp.Message())
|
||||
|
||||
if !resp.HasToolCalls() {
|
||||
return resp.Text, nil
|
||||
return resp.Text, totalUsage, nil
|
||||
}
|
||||
|
||||
if c.tools == nil {
|
||||
return "", ErrNoToolsConfigured
|
||||
return "", totalUsage, ErrNoToolsConfigured
|
||||
}
|
||||
|
||||
toolResults, err := c.tools.ExecuteAll(ctx, resp.ToolCalls)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("tool execution failed: %w", err)
|
||||
return "", totalUsage, fmt.Errorf("tool execution failed: %w", err)
|
||||
}
|
||||
|
||||
c.messages = append(c.messages, toolResults...)
|
||||
|
||||
132
v2/chat_test.go
132
v2/chat_test.go
@@ -14,7 +14,7 @@ func TestChat_Send(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
text, err := chat.Send(context.Background(), "Hi")
|
||||
text, _, err := chat.Send(context.Background(), "Hi")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -28,7 +28,7 @@ func TestChat_SendMessage(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
_, err := chat.SendMessage(context.Background(), UserMessage("msg1"))
|
||||
_, _, err := chat.SendMessage(context.Background(), UserMessage("msg1"))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -79,7 +79,7 @@ func TestChat_SetSystem(t *testing.T) {
|
||||
}
|
||||
|
||||
// System message stays first even after adding other messages
|
||||
_, _ = chat.Send(context.Background(), "Hi")
|
||||
_, _, _ = chat.Send(context.Background(), "Hi")
|
||||
chat.SetSystem("New system")
|
||||
msgs = chat.Messages()
|
||||
if msgs[0].Role != RoleSystem {
|
||||
@@ -113,7 +113,7 @@ func TestChat_ToolCallLoop(t *testing.T) {
|
||||
})
|
||||
chat.SetTools(NewToolBox(tool))
|
||||
|
||||
text, err := chat.Send(context.Background(), "test")
|
||||
text, _, err := chat.Send(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -158,7 +158,7 @@ func TestChat_ToolCallLoop_NoTools(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
_, err := chat.Send(context.Background(), "test")
|
||||
_, _, err := chat.Send(context.Background(), "test")
|
||||
if !errors.Is(err, ErrNoToolsConfigured) {
|
||||
t.Errorf("expected ErrNoToolsConfigured, got %v", err)
|
||||
}
|
||||
@@ -248,7 +248,7 @@ func TestChat_Messages(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
_, _ = chat.Send(context.Background(), "test")
|
||||
_, _, _ = chat.Send(context.Background(), "test")
|
||||
|
||||
msgs := chat.Messages()
|
||||
// Verify it's a copy — modifying returned slice shouldn't affect chat
|
||||
@@ -265,7 +265,7 @@ func TestChat_Reset(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
_, _ = chat.Send(context.Background(), "test")
|
||||
_, _, _ = chat.Send(context.Background(), "test")
|
||||
if len(chat.Messages()) == 0 {
|
||||
t.Fatal("expected messages before reset")
|
||||
}
|
||||
@@ -281,7 +281,7 @@ func TestChat_Fork(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
_, _ = chat.Send(context.Background(), "msg1")
|
||||
_, _, _ = chat.Send(context.Background(), "msg1")
|
||||
|
||||
fork := chat.Fork()
|
||||
|
||||
@@ -291,14 +291,14 @@ func TestChat_Fork(t *testing.T) {
|
||||
}
|
||||
|
||||
// Adding to fork should not affect original
|
||||
_, _ = fork.Send(context.Background(), "msg2")
|
||||
_, _, _ = fork.Send(context.Background(), "msg2")
|
||||
if len(fork.Messages()) == len(chat.Messages()) {
|
||||
t.Error("fork messages should be independent of original")
|
||||
}
|
||||
|
||||
// Adding to original should not affect fork
|
||||
originalLen := len(chat.Messages())
|
||||
_, _ = chat.Send(context.Background(), "msg3")
|
||||
_, _, _ = chat.Send(context.Background(), "msg3")
|
||||
if len(chat.Messages()) == originalLen {
|
||||
t.Error("original should have more messages after send")
|
||||
}
|
||||
@@ -310,7 +310,7 @@ func TestChat_SendWithImages(t *testing.T) {
|
||||
chat := NewChat(model)
|
||||
|
||||
img := Image{URL: "https://example.com/image.png"}
|
||||
text, err := chat.SendWithImages(context.Background(), "What's in this image?", img)
|
||||
text, _, err := chat.SendWithImages(context.Background(), "What's in this image?", img)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -355,7 +355,7 @@ func TestChat_MultipleToolCallRounds(t *testing.T) {
|
||||
})
|
||||
chat.SetTools(NewToolBox(tool))
|
||||
|
||||
text, err := chat.Send(context.Background(), "count three times")
|
||||
text, _, err := chat.Send(context.Background(), "count three times")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -378,7 +378,7 @@ func TestChat_SendError(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
_, err := chat.Send(context.Background(), "test")
|
||||
_, _, err := chat.Send(context.Background(), "test")
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
@@ -392,7 +392,7 @@ func TestChat_WithRequestOptions(t *testing.T) {
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model, WithTemperature(0.5), WithMaxTokens(200))
|
||||
|
||||
_, err := chat.Send(context.Background(), "test")
|
||||
_, _, err := chat.Send(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -405,3 +405,107 @@ func TestChat_WithRequestOptions(t *testing.T) {
|
||||
t.Errorf("expected maxTokens 200, got %v", req.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChat_Send_UsageAccumulation(t *testing.T) {
|
||||
var callCount int32
|
||||
mp := newMockProviderFunc(func(ctx context.Context, req provider.Request) (provider.Response, error) {
|
||||
n := atomic.AddInt32(&callCount, 1)
|
||||
if n == 1 {
|
||||
return provider.Response{
|
||||
ToolCalls: []provider.ToolCall{
|
||||
{ID: "tc1", Name: "greet", Arguments: "{}"},
|
||||
},
|
||||
Usage: &provider.Usage{InputTokens: 10, OutputTokens: 5, TotalTokens: 15},
|
||||
}, nil
|
||||
}
|
||||
return provider.Response{
|
||||
Text: "done",
|
||||
Usage: &provider.Usage{InputTokens: 20, OutputTokens: 8, TotalTokens: 28},
|
||||
}, nil
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
tool := DefineSimple("greet", "Says hello", func(ctx context.Context) (string, error) {
|
||||
return "hello!", nil
|
||||
})
|
||||
chat.SetTools(NewToolBox(tool))
|
||||
|
||||
text, usage, err := chat.Send(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if text != "done" {
|
||||
t.Errorf("expected 'done', got %q", text)
|
||||
}
|
||||
if usage == nil {
|
||||
t.Fatal("expected usage, got nil")
|
||||
}
|
||||
if usage.InputTokens != 30 {
|
||||
t.Errorf("expected accumulated input 30, got %d", usage.InputTokens)
|
||||
}
|
||||
if usage.OutputTokens != 13 {
|
||||
t.Errorf("expected accumulated output 13, got %d", usage.OutputTokens)
|
||||
}
|
||||
if usage.TotalTokens != 43 {
|
||||
t.Errorf("expected accumulated total 43, got %d", usage.TotalTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChat_Send_UsageWithDetails(t *testing.T) {
|
||||
var callCount int32
|
||||
mp := newMockProviderFunc(func(ctx context.Context, req provider.Request) (provider.Response, error) {
|
||||
n := atomic.AddInt32(&callCount, 1)
|
||||
if n == 1 {
|
||||
return provider.Response{
|
||||
ToolCalls: []provider.ToolCall{
|
||||
{ID: "tc1", Name: "greet", Arguments: "{}"},
|
||||
},
|
||||
Usage: &provider.Usage{
|
||||
InputTokens: 10,
|
||||
OutputTokens: 5,
|
||||
TotalTokens: 15,
|
||||
Details: map[string]int{
|
||||
"cached_input_tokens": 3,
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
return provider.Response{
|
||||
Text: "done",
|
||||
Usage: &provider.Usage{
|
||||
InputTokens: 20,
|
||||
OutputTokens: 8,
|
||||
TotalTokens: 28,
|
||||
Details: map[string]int{
|
||||
"cached_input_tokens": 7,
|
||||
"reasoning_tokens": 2,
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
chat := NewChat(model)
|
||||
|
||||
tool := DefineSimple("greet", "Says hello", func(ctx context.Context) (string, error) {
|
||||
return "hello!", nil
|
||||
})
|
||||
chat.SetTools(NewToolBox(tool))
|
||||
|
||||
_, usage, err := chat.Send(context.Background(), "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if usage == nil {
|
||||
t.Fatal("expected usage, got nil")
|
||||
}
|
||||
if usage.Details == nil {
|
||||
t.Fatal("expected usage details, got nil")
|
||||
}
|
||||
if usage.Details["cached_input_tokens"] != 10 {
|
||||
t.Errorf("expected cached_input_tokens=10, got %d", usage.Details["cached_input_tokens"])
|
||||
}
|
||||
if usage.Details["reasoning_tokens"] != 2 {
|
||||
t.Errorf("expected reasoning_tokens=2, got %d", usage.Details["reasoning_tokens"])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,14 +13,16 @@ const structuredOutputToolName = "structured_output"
|
||||
// Generate sends a single user prompt to the model and parses the response into T.
|
||||
// T must be a struct. The model is forced to return structured output matching T's schema
|
||||
// by using a hidden tool call internally.
|
||||
func Generate[T any](ctx context.Context, model *Model, prompt string, opts ...RequestOption) (T, error) {
|
||||
// Returns the parsed value, token usage, and any error.
|
||||
func Generate[T any](ctx context.Context, model *Model, prompt string, opts ...RequestOption) (T, *Usage, error) {
|
||||
return GenerateWith[T](ctx, model, []Message{UserMessage(prompt)}, opts...)
|
||||
}
|
||||
|
||||
// GenerateWith sends the given messages to the model and parses the response into T.
|
||||
// T must be a struct. The model is forced to return structured output matching T's schema
|
||||
// by using a hidden tool call internally.
|
||||
func GenerateWith[T any](ctx context.Context, model *Model, messages []Message, opts ...RequestOption) (T, error) {
|
||||
// Returns the parsed value, token usage, and any error.
|
||||
func GenerateWith[T any](ctx context.Context, model *Model, messages []Message, opts ...RequestOption) (T, *Usage, error) {
|
||||
var zero T
|
||||
|
||||
s := schema.FromStruct(zero)
|
||||
@@ -36,7 +38,7 @@ func GenerateWith[T any](ctx context.Context, model *Model, messages []Message,
|
||||
|
||||
resp, err := model.Complete(ctx, messages, opts...)
|
||||
if err != nil {
|
||||
return zero, err
|
||||
return zero, nil, err
|
||||
}
|
||||
|
||||
// Find the structured_output tool call in the response.
|
||||
@@ -44,11 +46,11 @@ func GenerateWith[T any](ctx context.Context, model *Model, messages []Message,
|
||||
if tc.Name == structuredOutputToolName {
|
||||
var result T
|
||||
if err := json.Unmarshal([]byte(tc.Arguments), &result); err != nil {
|
||||
return zero, fmt.Errorf("failed to parse structured output: %w", err)
|
||||
return zero, resp.Usage, fmt.Errorf("failed to parse structured output: %w", err)
|
||||
}
|
||||
return result, nil
|
||||
return result, resp.Usage, nil
|
||||
}
|
||||
}
|
||||
|
||||
return zero, ErrNoStructuredOutput
|
||||
return zero, resp.Usage, ErrNoStructuredOutput
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ func TestGenerate(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
result, err := Generate[testPerson](context.Background(), model, "Tell me about Alice")
|
||||
result, _, err := Generate[testPerson](context.Background(), model, "Tell me about Alice")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -63,7 +63,7 @@ func TestGenerateWith(t *testing.T) {
|
||||
UserMessage("Tell me about Bob"),
|
||||
}
|
||||
|
||||
result, err := GenerateWith[testPerson](context.Background(), model, messages)
|
||||
result, _, err := GenerateWith[testPerson](context.Background(), model, messages)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -90,7 +90,7 @@ func TestGenerate_NoToolCall(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
_, err := Generate[testPerson](context.Background(), model, "Tell me about someone")
|
||||
_, _, err := Generate[testPerson](context.Background(), model, "Tell me about someone")
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
@@ -111,7 +111,7 @@ func TestGenerate_InvalidJSON(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
_, err := Generate[testPerson](context.Background(), model, "Tell me about someone")
|
||||
_, _, err := Generate[testPerson](context.Background(), model, "Tell me about someone")
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
@@ -143,7 +143,7 @@ func TestGenerate_NestedStruct(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
result, err := Generate[testPersonWithAddress](context.Background(), model, "Tell me about Carol")
|
||||
result, _, err := Generate[testPersonWithAddress](context.Background(), model, "Tell me about Carol")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -170,7 +170,7 @@ func TestGenerate_WithOptions(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
_, err := Generate[testPerson](context.Background(), model, "Tell me about Dave",
|
||||
_, _, err := Generate[testPerson](context.Background(), model, "Tell me about Dave",
|
||||
WithTemperature(0.5),
|
||||
WithMaxTokens(200),
|
||||
)
|
||||
@@ -207,7 +207,7 @@ func TestGenerate_WithMiddleware(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp).WithMiddleware(mw)
|
||||
|
||||
result, err := Generate[testPerson](context.Background(), model, "Tell me about Eve")
|
||||
result, _, err := Generate[testPerson](context.Background(), model, "Tell me about Eve")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -231,7 +231,7 @@ func TestGenerate_WrongToolName(t *testing.T) {
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
_, err := Generate[testPerson](context.Background(), model, "Tell me about Frank")
|
||||
_, _, err := Generate[testPerson](context.Background(), model, "Tell me about Frank")
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
@@ -239,3 +239,44 @@ func TestGenerate_WrongToolName(t *testing.T) {
|
||||
t.Errorf("expected ErrNoStructuredOutput, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGenerate_ReturnsUsage(t *testing.T) {
|
||||
mp := newMockProvider(provider.Response{
|
||||
ToolCalls: []provider.ToolCall{
|
||||
{
|
||||
ID: "call_1",
|
||||
Name: "structured_output",
|
||||
Arguments: `{"name":"Grace","age":22}`,
|
||||
},
|
||||
},
|
||||
Usage: &provider.Usage{
|
||||
InputTokens: 50,
|
||||
OutputTokens: 20,
|
||||
TotalTokens: 70,
|
||||
Details: map[string]int{
|
||||
"reasoning_tokens": 5,
|
||||
},
|
||||
},
|
||||
})
|
||||
model := newMockModel(mp)
|
||||
|
||||
result, usage, err := Generate[testPerson](context.Background(), model, "Tell me about Grace")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if result.Name != "Grace" {
|
||||
t.Errorf("expected name 'Grace', got %q", result.Name)
|
||||
}
|
||||
if usage == nil {
|
||||
t.Fatal("expected usage, got nil")
|
||||
}
|
||||
if usage.InputTokens != 50 {
|
||||
t.Errorf("expected input 50, got %d", usage.InputTokens)
|
||||
}
|
||||
if usage.OutputTokens != 20 {
|
||||
t.Errorf("expected output 20, got %d", usage.OutputTokens)
|
||||
}
|
||||
if usage.Details["reasoning_tokens"] != 5 {
|
||||
t.Errorf("expected reasoning_tokens=5, got %d", usage.Details["reasoning_tokens"])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,12 +59,32 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan
|
||||
|
||||
var fullText strings.Builder
|
||||
var toolCalls []provider.ToolCall
|
||||
var usage *provider.Usage
|
||||
|
||||
for resp, err := range cl.Models.GenerateContentStream(ctx, req.Model, contents, cfg) {
|
||||
if err != nil {
|
||||
return fmt.Errorf("google stream error: %w", err)
|
||||
}
|
||||
|
||||
// Track usage from the last chunk (final chunk has cumulative counts)
|
||||
if resp.UsageMetadata != nil {
|
||||
usage = &provider.Usage{
|
||||
InputTokens: int(resp.UsageMetadata.PromptTokenCount),
|
||||
OutputTokens: int(resp.UsageMetadata.CandidatesTokenCount),
|
||||
TotalTokens: int(resp.UsageMetadata.TotalTokenCount),
|
||||
}
|
||||
details := map[string]int{}
|
||||
if resp.UsageMetadata.CachedContentTokenCount > 0 {
|
||||
details[provider.UsageDetailCachedInputTokens] = int(resp.UsageMetadata.CachedContentTokenCount)
|
||||
}
|
||||
if resp.UsageMetadata.ThoughtsTokenCount > 0 {
|
||||
details[provider.UsageDetailThoughtsTokens] = int(resp.UsageMetadata.ThoughtsTokenCount)
|
||||
}
|
||||
if len(details) > 0 {
|
||||
usage.Details = details
|
||||
}
|
||||
}
|
||||
|
||||
for _, c := range resp.Candidates {
|
||||
if c.Content == nil {
|
||||
continue
|
||||
@@ -105,6 +125,7 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan
|
||||
Response: &provider.Response{
|
||||
Text: fullText.String(),
|
||||
ToolCalls: toolCalls,
|
||||
Usage: usage,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -284,6 +305,16 @@ func (p *Provider) convertResponse(resp *genai.GenerateContentResponse) (provide
|
||||
OutputTokens: int(resp.UsageMetadata.CandidatesTokenCount),
|
||||
TotalTokens: int(resp.UsageMetadata.TotalTokenCount),
|
||||
}
|
||||
details := map[string]int{}
|
||||
if resp.UsageMetadata.CachedContentTokenCount > 0 {
|
||||
details[provider.UsageDetailCachedInputTokens] = int(resp.UsageMetadata.CachedContentTokenCount)
|
||||
}
|
||||
if resp.UsageMetadata.ThoughtsTokenCount > 0 {
|
||||
details[provider.UsageDetailThoughtsTokens] = int(resp.UsageMetadata.ThoughtsTokenCount)
|
||||
}
|
||||
if len(details) > 0 {
|
||||
res.Usage.Details = details
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
|
||||
@@ -177,6 +177,7 @@ func convertProviderResponse(resp provider.Response) Response {
|
||||
InputTokens: resp.Usage.InputTokens,
|
||||
OutputTokens: resp.Usage.OutputTokens,
|
||||
TotalTokens: resp.Usage.TotalTokens,
|
||||
Details: resp.Usage.Details,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -82,6 +82,7 @@ type UsageTracker struct {
|
||||
TotalInput int64
|
||||
TotalOutput int64
|
||||
TotalRequests int64
|
||||
TotalDetails map[string]int64
|
||||
}
|
||||
|
||||
// Add records usage from a single request.
|
||||
@@ -94,6 +95,14 @@ func (ut *UsageTracker) Add(u *Usage) {
|
||||
ut.TotalInput += int64(u.InputTokens)
|
||||
ut.TotalOutput += int64(u.OutputTokens)
|
||||
ut.TotalRequests++
|
||||
if len(u.Details) > 0 {
|
||||
if ut.TotalDetails == nil {
|
||||
ut.TotalDetails = make(map[string]int64)
|
||||
}
|
||||
for k, v := range u.Details {
|
||||
ut.TotalDetails[k] += int64(v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Summary returns the accumulated totals.
|
||||
@@ -103,6 +112,20 @@ func (ut *UsageTracker) Summary() (input, output, requests int64) {
|
||||
return ut.TotalInput, ut.TotalOutput, ut.TotalRequests
|
||||
}
|
||||
|
||||
// Details returns a copy of the accumulated detail totals.
|
||||
func (ut *UsageTracker) Details() map[string]int64 {
|
||||
ut.mu.Lock()
|
||||
defer ut.mu.Unlock()
|
||||
if ut.TotalDetails == nil {
|
||||
return nil
|
||||
}
|
||||
cp := make(map[string]int64, len(ut.TotalDetails))
|
||||
for k, v := range ut.TotalDetails {
|
||||
cp[k] = v
|
||||
}
|
||||
return cp
|
||||
}
|
||||
|
||||
// WithUsageTracking returns middleware that accumulates token usage across calls.
|
||||
func WithUsageTracking(tracker *UsageTracker) Middleware {
|
||||
return func(next CompletionFunc) CompletionFunc {
|
||||
|
||||
@@ -280,3 +280,80 @@ func TestWithLogging_Error(t *testing.T) {
|
||||
t.Errorf("expected provider error, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUsageTracker_Details(t *testing.T) {
|
||||
tracker := &UsageTracker{}
|
||||
|
||||
tracker.Add(&Usage{
|
||||
InputTokens: 100,
|
||||
OutputTokens: 50,
|
||||
TotalTokens: 150,
|
||||
Details: map[string]int{
|
||||
"cached_input_tokens": 20,
|
||||
"reasoning_tokens": 10,
|
||||
},
|
||||
})
|
||||
|
||||
tracker.Add(&Usage{
|
||||
InputTokens: 80,
|
||||
OutputTokens: 40,
|
||||
TotalTokens: 120,
|
||||
Details: map[string]int{
|
||||
"cached_input_tokens": 15,
|
||||
},
|
||||
})
|
||||
|
||||
details := tracker.Details()
|
||||
if details == nil {
|
||||
t.Fatal("expected details, got nil")
|
||||
}
|
||||
if details["cached_input_tokens"] != 35 {
|
||||
t.Errorf("expected cached_input_tokens=35, got %d", details["cached_input_tokens"])
|
||||
}
|
||||
if details["reasoning_tokens"] != 10 {
|
||||
t.Errorf("expected reasoning_tokens=10, got %d", details["reasoning_tokens"])
|
||||
}
|
||||
|
||||
// Verify returned map is a copy
|
||||
details["cached_input_tokens"] = 999
|
||||
fresh := tracker.Details()
|
||||
if fresh["cached_input_tokens"] != 35 {
|
||||
t.Error("Details() did not return a copy")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUsageTracker_Details_Nil(t *testing.T) {
|
||||
tracker := &UsageTracker{}
|
||||
tracker.Add(&Usage{InputTokens: 10, OutputTokens: 5, TotalTokens: 15})
|
||||
|
||||
details := tracker.Details()
|
||||
if details != nil {
|
||||
t.Errorf("expected nil details for usage without details, got %v", details)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWithUsageTracking_WithDetails(t *testing.T) {
|
||||
mp := newMockProvider(provider.Response{
|
||||
Text: "ok",
|
||||
Usage: &provider.Usage{
|
||||
InputTokens: 100,
|
||||
OutputTokens: 50,
|
||||
TotalTokens: 150,
|
||||
Details: map[string]int{
|
||||
"cached_input_tokens": 30,
|
||||
},
|
||||
},
|
||||
})
|
||||
tracker := &UsageTracker{}
|
||||
model := newMockModel(mp).WithMiddleware(WithUsageTracking(tracker))
|
||||
|
||||
_, err := model.Complete(context.Background(), []Message{UserMessage("test")})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
details := tracker.Details()
|
||||
if details["cached_input_tokens"] != 30 {
|
||||
t.Errorf("expected cached_input_tokens=30, got %d", details["cached_input_tokens"])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,15 +58,30 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan
|
||||
|
||||
cl := openai.NewClient(opts...)
|
||||
oaiReq := p.buildRequest(req)
|
||||
oaiReq.StreamOptions = openai.ChatCompletionStreamOptionsParam{
|
||||
IncludeUsage: openai.Bool(true),
|
||||
}
|
||||
|
||||
stream := cl.Chat.Completions.NewStreaming(ctx, oaiReq)
|
||||
|
||||
var fullText strings.Builder
|
||||
var toolCalls []provider.ToolCall
|
||||
toolCallArgs := map[int]*strings.Builder{}
|
||||
var usage *provider.Usage
|
||||
|
||||
for stream.Next() {
|
||||
chunk := stream.Current()
|
||||
|
||||
// Capture usage from the final chunk (present when StreamOptions.IncludeUsage is true)
|
||||
if chunk.Usage.TotalTokens > 0 {
|
||||
usage = &provider.Usage{
|
||||
InputTokens: int(chunk.Usage.PromptTokens),
|
||||
OutputTokens: int(chunk.Usage.CompletionTokens),
|
||||
TotalTokens: int(chunk.Usage.TotalTokens),
|
||||
Details: extractUsageDetails(chunk.Usage),
|
||||
}
|
||||
}
|
||||
|
||||
for _, choice := range chunk.Choices {
|
||||
// Text delta
|
||||
if choice.Delta.Content != "" {
|
||||
@@ -138,6 +153,7 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan
|
||||
Response: &provider.Response{
|
||||
Text: fullText.String(),
|
||||
ToolCalls: toolCalls,
|
||||
Usage: usage,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -363,6 +379,7 @@ func (p *Provider) convertResponse(resp *openai.ChatCompletion) provider.Respons
|
||||
OutputTokens: int(resp.Usage.CompletionTokens),
|
||||
TotalTokens: int(resp.Usage.TotalTokens),
|
||||
}
|
||||
res.Usage.Details = extractUsageDetails(resp.Usage)
|
||||
}
|
||||
|
||||
return res
|
||||
@@ -381,6 +398,27 @@ func audioFormat(contentType string) string {
|
||||
}
|
||||
}
|
||||
|
||||
// extractUsageDetails extracts provider-specific detail tokens from an OpenAI CompletionUsage.
|
||||
func extractUsageDetails(usage openai.CompletionUsage) map[string]int {
|
||||
details := map[string]int{}
|
||||
if usage.CompletionTokensDetails.ReasoningTokens > 0 {
|
||||
details[provider.UsageDetailReasoningTokens] = int(usage.CompletionTokensDetails.ReasoningTokens)
|
||||
}
|
||||
if usage.CompletionTokensDetails.AudioTokens > 0 {
|
||||
details[provider.UsageDetailAudioOutputTokens] = int(usage.CompletionTokensDetails.AudioTokens)
|
||||
}
|
||||
if usage.PromptTokensDetails.CachedTokens > 0 {
|
||||
details[provider.UsageDetailCachedInputTokens] = int(usage.PromptTokensDetails.CachedTokens)
|
||||
}
|
||||
if usage.PromptTokensDetails.AudioTokens > 0 {
|
||||
details[provider.UsageDetailAudioInputTokens] = int(usage.PromptTokensDetails.AudioTokens)
|
||||
}
|
||||
if len(details) == 0 {
|
||||
return nil
|
||||
}
|
||||
return details
|
||||
}
|
||||
|
||||
// audioFormatFromURL guesses the audio format from a URL's file extension.
|
||||
func audioFormatFromURL(u string) string {
|
||||
ext := strings.ToLower(path.Ext(u))
|
||||
|
||||
83
v2/pricing.go
Normal file
83
v2/pricing.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package llm
|
||||
|
||||
import "sync"
|
||||
|
||||
// ModelPricing defines per-token pricing for a model.
// All prices are expressed in USD per single token
// (e.g. $3 per million tokens == 0.000003).
type ModelPricing struct {
	InputPricePerToken       float64 // USD per input token
	OutputPricePerToken      float64 // USD per output token
	CachedInputPricePerToken float64 // USD per cached input token (0 = same as input)
}
|
||||
|
||||
// Cost computes the total USD cost from a Usage.
|
||||
// When CachedInputPricePerToken is set and the usage includes cached_input_tokens,
|
||||
// those tokens are charged at the cached rate instead of the regular input rate.
|
||||
func (mp ModelPricing) Cost(u *Usage) float64 {
|
||||
if u == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
inputTokens := u.InputTokens
|
||||
cachedTokens := 0
|
||||
if u.Details != nil {
|
||||
cachedTokens = u.Details[UsageDetailCachedInputTokens]
|
||||
}
|
||||
|
||||
var cost float64
|
||||
|
||||
if mp.CachedInputPricePerToken > 0 && cachedTokens > 0 {
|
||||
regularInput := inputTokens - cachedTokens
|
||||
if regularInput < 0 {
|
||||
regularInput = 0
|
||||
}
|
||||
cost += float64(regularInput) * mp.InputPricePerToken
|
||||
cost += float64(cachedTokens) * mp.CachedInputPricePerToken
|
||||
} else {
|
||||
cost += float64(inputTokens) * mp.InputPricePerToken
|
||||
}
|
||||
|
||||
cost += float64(u.OutputTokens) * mp.OutputPricePerToken
|
||||
|
||||
return cost
|
||||
}
|
||||
|
||||
// PricingRegistry maps model names to their pricing.
// Callers populate it with the models and prices relevant to their use case.
// All methods lock mu, so a registry is safe for concurrent use.
type PricingRegistry struct {
	mu     sync.RWMutex // guards models
	models map[string]ModelPricing
}
|
||||
|
||||
// NewPricingRegistry creates an empty pricing registry.
|
||||
func NewPricingRegistry() *PricingRegistry {
|
||||
return &PricingRegistry{
|
||||
models: make(map[string]ModelPricing),
|
||||
}
|
||||
}
|
||||
|
||||
// Set registers pricing for a model.
|
||||
func (pr *PricingRegistry) Set(model string, pricing ModelPricing) {
|
||||
pr.mu.Lock()
|
||||
defer pr.mu.Unlock()
|
||||
pr.models[model] = pricing
|
||||
}
|
||||
|
||||
// Has returns true if pricing is registered for the given model.
|
||||
func (pr *PricingRegistry) Has(model string) bool {
|
||||
pr.mu.RLock()
|
||||
defer pr.mu.RUnlock()
|
||||
_, ok := pr.models[model]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Cost computes the USD cost for the given model and usage.
|
||||
// Returns 0 if the model is not registered.
|
||||
func (pr *PricingRegistry) Cost(model string, u *Usage) float64 {
|
||||
pr.mu.RLock()
|
||||
pricing, ok := pr.models[model]
|
||||
pr.mu.RUnlock()
|
||||
if !ok {
|
||||
return 0
|
||||
}
|
||||
return pricing.Cost(u)
|
||||
}
|
||||
128
v2/pricing_test.go
Normal file
128
v2/pricing_test.go
Normal file
@@ -0,0 +1,128 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestModelPricing_Cost(t *testing.T) {
|
||||
pricing := ModelPricing{
|
||||
InputPricePerToken: 0.000003, // $3/MTok
|
||||
OutputPricePerToken: 0.000015, // $15/MTok
|
||||
}
|
||||
|
||||
usage := &Usage{
|
||||
InputTokens: 1000,
|
||||
OutputTokens: 500,
|
||||
TotalTokens: 1500,
|
||||
}
|
||||
|
||||
cost := pricing.Cost(usage)
|
||||
expected := 1000*0.000003 + 500*0.000015
|
||||
if math.Abs(cost-expected) > 1e-10 {
|
||||
t.Errorf("expected cost %f, got %f", expected, cost)
|
||||
}
|
||||
}
|
||||
|
||||
func TestModelPricing_Cost_WithCachedTokens(t *testing.T) {
|
||||
pricing := ModelPricing{
|
||||
InputPricePerToken: 0.000003, // $3/MTok
|
||||
OutputPricePerToken: 0.000015, // $15/MTok
|
||||
CachedInputPricePerToken: 0.0000015, // $1.50/MTok (50% discount)
|
||||
}
|
||||
|
||||
usage := &Usage{
|
||||
InputTokens: 1000,
|
||||
OutputTokens: 500,
|
||||
TotalTokens: 1500,
|
||||
Details: map[string]int{
|
||||
UsageDetailCachedInputTokens: 400,
|
||||
},
|
||||
}
|
||||
|
||||
cost := pricing.Cost(usage)
|
||||
// 600 regular input tokens + 400 cached tokens + 500 output tokens
|
||||
expected := 600*0.000003 + 400*0.0000015 + 500*0.000015
|
||||
if math.Abs(cost-expected) > 1e-10 {
|
||||
t.Errorf("expected cost %f, got %f", expected, cost)
|
||||
}
|
||||
}
|
||||
|
||||
func TestModelPricing_Cost_NilUsage(t *testing.T) {
|
||||
pricing := ModelPricing{
|
||||
InputPricePerToken: 0.000003,
|
||||
OutputPricePerToken: 0.000015,
|
||||
}
|
||||
|
||||
cost := pricing.Cost(nil)
|
||||
if cost != 0 {
|
||||
t.Errorf("expected 0 for nil usage, got %f", cost)
|
||||
}
|
||||
}
|
||||
|
||||
func TestModelPricing_Cost_NoCachedPrice(t *testing.T) {
|
||||
// When CachedInputPricePerToken is 0, all input tokens use InputPricePerToken
|
||||
pricing := ModelPricing{
|
||||
InputPricePerToken: 0.000003,
|
||||
OutputPricePerToken: 0.000015,
|
||||
}
|
||||
|
||||
usage := &Usage{
|
||||
InputTokens: 1000,
|
||||
OutputTokens: 500,
|
||||
TotalTokens: 1500,
|
||||
Details: map[string]int{
|
||||
UsageDetailCachedInputTokens: 400,
|
||||
},
|
||||
}
|
||||
|
||||
cost := pricing.Cost(usage)
|
||||
expected := 1000*0.000003 + 500*0.000015
|
||||
if math.Abs(cost-expected) > 1e-10 {
|
||||
t.Errorf("expected cost %f, got %f", expected, cost)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPricingRegistry(t *testing.T) {
|
||||
registry := NewPricingRegistry()
|
||||
|
||||
registry.Set("gpt-4o", ModelPricing{
|
||||
InputPricePerToken: 0.0000025,
|
||||
OutputPricePerToken: 0.00001,
|
||||
})
|
||||
|
||||
if !registry.Has("gpt-4o") {
|
||||
t.Error("expected Has('gpt-4o') to be true")
|
||||
}
|
||||
if registry.Has("gpt-3.5-turbo") {
|
||||
t.Error("expected Has('gpt-3.5-turbo') to be false")
|
||||
}
|
||||
|
||||
usage := &Usage{InputTokens: 1000, OutputTokens: 200, TotalTokens: 1200}
|
||||
|
||||
cost := registry.Cost("gpt-4o", usage)
|
||||
expected := 1000*0.0000025 + 200*0.00001
|
||||
if math.Abs(cost-expected) > 1e-10 {
|
||||
t.Errorf("expected cost %f, got %f", expected, cost)
|
||||
}
|
||||
|
||||
// Unknown model returns 0
|
||||
cost = registry.Cost("unknown-model", usage)
|
||||
if cost != 0 {
|
||||
t.Errorf("expected 0 for unknown model, got %f", cost)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPricingRegistry_Override(t *testing.T) {
|
||||
registry := NewPricingRegistry()
|
||||
|
||||
registry.Set("model-a", ModelPricing{InputPricePerToken: 0.001, OutputPricePerToken: 0.002})
|
||||
registry.Set("model-a", ModelPricing{InputPricePerToken: 0.003, OutputPricePerToken: 0.004})
|
||||
|
||||
usage := &Usage{InputTokens: 100, OutputTokens: 50, TotalTokens: 150}
|
||||
cost := registry.Cost("model-a", usage)
|
||||
expected := 100*0.003 + 50*0.004
|
||||
if math.Abs(cost-expected) > 1e-10 {
|
||||
t.Errorf("expected overridden cost %f, got %f", expected, cost)
|
||||
}
|
||||
}
|
||||
@@ -64,8 +64,19 @@ type Usage struct {
|
||||
InputTokens int
|
||||
OutputTokens int
|
||||
TotalTokens int
|
||||
Details map[string]int // provider-specific breakdown (e.g., cached, reasoning tokens)
|
||||
}
|
||||
|
||||
// Standardized detail keys for provider-specific token breakdowns.
// Providers populate Usage.Details with these keys so callers can read
// breakdowns without provider-specific switch logic.
const (
	UsageDetailReasoningTokens     = "reasoning_tokens"      // reasoning tokens in the output breakdown
	UsageDetailCachedInputTokens   = "cached_input_tokens"   // input tokens served from a prompt cache
	UsageDetailCacheCreationTokens = "cache_creation_tokens" // presumably tokens written into a prompt cache — confirm against the provider that emits it
	UsageDetailAudioInputTokens    = "audio_input_tokens"    // audio tokens counted against input
	UsageDetailAudioOutputTokens   = "audio_output_tokens"   // audio tokens counted against output
	UsageDetailThoughtsTokens      = "thoughts_tokens"       // Google-style "thoughts" token count
)
|
||||
|
||||
// StreamEventType identifies the kind of stream event.
|
||||
type StreamEventType int
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
package llm
|
||||
|
||||
import "gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
|
||||
|
||||
// Response represents the result of a completion request.
|
||||
type Response struct {
|
||||
// Text is the assistant's text content. Empty if only tool calls.
|
||||
@@ -31,4 +33,45 @@ type Usage struct {
|
||||
InputTokens int
|
||||
OutputTokens int
|
||||
TotalTokens int
|
||||
Details map[string]int // provider-specific breakdown (e.g., cached, reasoning tokens)
|
||||
}
|
||||
|
||||
// addUsage merges usage u into the receiver, accumulating token counts and details.
|
||||
// If the receiver is nil, it returns a copy of u. If u is nil, it returns the receiver unchanged.
|
||||
func addUsage(total *Usage, u *Usage) *Usage {
|
||||
if u == nil {
|
||||
return total
|
||||
}
|
||||
if total == nil {
|
||||
cp := *u
|
||||
if u.Details != nil {
|
||||
cp.Details = make(map[string]int, len(u.Details))
|
||||
for k, v := range u.Details {
|
||||
cp.Details[k] = v
|
||||
}
|
||||
}
|
||||
return &cp
|
||||
}
|
||||
total.InputTokens += u.InputTokens
|
||||
total.OutputTokens += u.OutputTokens
|
||||
total.TotalTokens += u.TotalTokens
|
||||
if u.Details != nil {
|
||||
if total.Details == nil {
|
||||
total.Details = make(map[string]int, len(u.Details))
|
||||
}
|
||||
for k, v := range u.Details {
|
||||
total.Details[k] += v
|
||||
}
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// Re-export detail key constants from provider package for convenience.
// This lets users of the high-level llm package index Usage.Details
// without importing the provider package directly.
const (
	UsageDetailReasoningTokens     = provider.UsageDetailReasoningTokens
	UsageDetailCachedInputTokens   = provider.UsageDetailCachedInputTokens
	UsageDetailCacheCreationTokens = provider.UsageDetailCacheCreationTokens
	UsageDetailAudioInputTokens    = provider.UsageDetailAudioInputTokens
	UsageDetailAudioOutputTokens   = provider.UsageDetailAudioOutputTokens
	UsageDetailThoughtsTokens      = provider.UsageDetailThoughtsTokens
)
|
||||
|
||||
Reference in New Issue
Block a user