feat: add audio input support to v2 providers

Add an Audio struct alongside Image for sending audio attachments to multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3), Google Gemini uses genai.NewPartFromBytes, and Anthropic skips audio gracefully since it does not support audio input.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 21:00:56 -05:00
parent fc2218b5fe
commit 7e1705c385
6 changed files with 137 additions and 1 deletions
--- a/v2/message.go
+++ b/v2/message.go
@@ -18,10 +18,19 @@ type Image struct {
 	ContentType string // MIME type (e.g., "image/png"), required for Base64
 }

-// Content represents message content with optional text and images.
+// Audio describes one audio attachment that can be carried in Content.Audio.
+type Audio struct {
+	// Set exactly one of URL or Base64 to supply the audio data.
+	URL         string // HTTP(S) URL of the audio file
+	Base64      string // Raw base64-encoded audio data
+	ContentType string // MIME type (e.g., "audio/wav", "audio/mp3"); NOTE(review): Image requires this for Base64 — confirm whether Audio does too
+}
+
+// Content holds a message's payload: optional text plus image and audio attachments.
 type Content struct {
 	Text   string
 	Images []Image
+	Audio  []Audio
 }

 // ToolCall represents a tool invocation requested by the assistant.
@@ -53,6 +62,11 @@ func UserMessageWithImages(text string, images ...Image) Message {
 	return Message{Role: RoleUser, Content: Content{Text: text, Images: images}}
 }

+// UserMessageWithAudio returns a RoleUser message carrying text and the given audio attachments.
+func UserMessageWithAudio(text string, audio ...Audio) Message {
+	return Message{Role: RoleUser, Content: Content{Text: text, Audio: audio}}
+}
+
 // SystemMessage creates a system prompt message.
 func SystemMessage(text string) Message {
 	return Message{Role: RoleSystem, Content: Content{Text: text}}