feat: add audio input support to v2 providers
Add an Audio struct alongside Image for sending audio attachments to
multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3), Google
Gemini uses genai.NewPartFromBytes, and Anthropic skips audio gracefully
since it is not supported.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
@@ -8,6 +8,7 @@ type Message struct {
 	Role       string
 	Content    string
 	Images     []Image
+	Audio      []Audio
 	ToolCalls  []ToolCall
 	ToolCallID string
 }
@@ -19,6 +20,13 @@ type Image struct {
 	ContentType string
 }
+
+// Audio represents an audio attachment at the provider level.
+type Audio struct {
+	URL         string
+	Base64      string
+	ContentType string
+}
 
 // ToolCall represents a tool invocation requested by the model.
 type ToolCall struct {
 	ID string
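To illustrate how a provider might consume the new Audio struct, here is a minimal Go sketch of the OpenAI side. The helper name toOpenAIPart and the ContentType-to-format mapping are assumptions for illustration (not code from this commit); only the input_audio content-part shape with base64 data and a "wav"/"mp3" format field follows OpenAI's documented audio input format. Unsupported formats are skipped, mirroring how the Anthropic path skips audio entirely.

```go
package main

import (
	"encoding/base64"
	"encoding/json"
	"fmt"
)

// Audio mirrors the struct added in this commit.
type Audio struct {
	URL         string
	Base64      string
	ContentType string
}

// toOpenAIPart is a hypothetical helper that converts an Audio attachment
// into the map shape OpenAI expects for an "input_audio" content part.
// OpenAI accepts only "wav" and "mp3"; anything else reports ok=false so
// the caller can skip the attachment instead of sending a bad request.
func toOpenAIPart(a Audio) (part map[string]any, ok bool) {
	var format string
	switch a.ContentType {
	case "audio/wav", "audio/x-wav":
		format = "wav"
	case "audio/mpeg", "audio/mp3":
		format = "mp3"
	default:
		return nil, false // unsupported content type: skip
	}
	return map[string]any{
		"type": "input_audio",
		"input_audio": map[string]any{
			"data":   a.Base64, // base64-encoded audio bytes
			"format": format,
		},
	}, true
}

func main() {
	raw := []byte("fake-wav-bytes") // stand-in for real audio data
	part, ok := toOpenAIPart(Audio{
		Base64:      base64.StdEncoding.EncodeToString(raw),
		ContentType: "audio/wav",
	})
	out, _ := json.Marshal(part)
	fmt.Println(ok, string(out))
}
```

The (part, ok) return keeps the skip decision at the call site, which is the same graceful-degradation pattern the commit message describes for providers without audio support.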