From 7e1705c385af42bbb3dfe8b8fffa59a61b072305 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Sun, 8 Feb 2026 21:00:56 -0500
Subject: [PATCH] feat: add audio input support to v2 providers

Add Audio struct alongside Image for sending audio attachments to
multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3),
Google Gemini uses genai.NewPartFromBytes, and Anthropic skips audio
gracefully since it's not supported.

Co-Authored-By: Claude Opus 4.6
---
 v2/anthropic/anthropic.go |  2 ++
 v2/google/google.go       | 33 ++++++++++++++++++
 v2/llm.go                 |  7 ++++
 v2/message.go             | 16 ++++++++-
 v2/openai/openai.go       | 72 +++++++++++++++++++++++++++++++++++++++
 v2/provider/provider.go   |  8 +++++
 6 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/v2/anthropic/anthropic.go b/v2/anthropic/anthropic.go
index beaf64b..201092b 100644
--- a/v2/anthropic/anthropic.go
+++ b/v2/anthropic/anthropic.go
@@ -204,6 +204,8 @@ func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
 			}
 		}
 
+		// Audio is not supported by Anthropic — skip silently.
+
 		// Merge consecutive same-role messages (Anthropic requires alternating)
 		if len(msgs) > 0 && msgs[len(msgs)-1].Role == role {
 			msgs[len(msgs)-1].Content = append(msgs[len(msgs)-1].Content, m.Content...)
diff --git a/v2/google/google.go b/v2/google/google.go
index 180f721..2f99060 100644
--- a/v2/google/google.go
+++ b/v2/google/google.go
@@ -217,6 +217,39 @@ func (p *Provider) buildRequest(req provider.Request) ([]*genai.Content, *genai.
 			}
 		}
 
+		for _, aud := range msg.Audio {
+			if aud.URL != "" {
+				resp, err := http.Get(aud.URL)
+				if err != nil {
+					continue
+				}
+				data, err := io.ReadAll(resp.Body)
+				resp.Body.Close()
+				if err != nil {
+					continue
+				}
+
+				mimeType := resp.Header.Get("Content-Type")
+				if mimeType == "" {
+					mimeType = aud.ContentType
+				}
+				if mimeType == "" {
+					mimeType = "audio/wav"
+				}
+				parts = append(parts, genai.NewPartFromBytes(data, mimeType))
+			} else if aud.Base64 != "" {
+				data, err := base64.StdEncoding.DecodeString(aud.Base64)
+				if err != nil {
+					continue
+				}
+				ct := aud.ContentType
+				if ct == "" {
+					ct = "audio/wav"
+				}
+				parts = append(parts, genai.NewPartFromBytes(data, ct))
+			}
+		}
+
 		contents = append(contents, genai.NewContentFromParts(parts, role))
 	}
 
diff --git a/v2/llm.go b/v2/llm.go
index 74075ec..388565d 100644
--- a/v2/llm.go
+++ b/v2/llm.go
@@ -140,6 +140,13 @@ func convertMessages(msgs []Message) []provider.Message {
 				ContentType: img.ContentType,
 			})
 		}
+		for _, aud := range m.Content.Audio {
+			pm.Audio = append(pm.Audio, provider.Audio{
+				URL:         aud.URL,
+				Base64:      aud.Base64,
+				ContentType: aud.ContentType,
+			})
+		}
 		for _, tc := range m.ToolCalls {
 			pm.ToolCalls = append(pm.ToolCalls, provider.ToolCall{
 				ID:        tc.ID,
diff --git a/v2/message.go b/v2/message.go
index 43185b5..1a24148 100644
--- a/v2/message.go
+++ b/v2/message.go
@@ -18,10 +18,19 @@ type Image struct {
 	ContentType string // MIME type (e.g., "image/png"), required for Base64
 }
 
-// Content represents message content with optional text and images.
+// Audio represents an audio attachment.
+type Audio struct {
+	// Provide exactly one of URL or Base64.
+	URL         string // HTTP(S) URL to audio file
+	Base64      string // Raw base64-encoded audio data
+	ContentType string // MIME type (e.g., "audio/wav", "audio/mp3")
+}
+
+// Content represents message content with optional text, images, and audio.
 type Content struct {
 	Text   string
 	Images []Image
+	Audio  []Audio
 }
 
 // ToolCall represents a tool invocation requested by the assistant.
@@ -53,6 +62,11 @@ func UserMessageWithImages(text string, images ...Image) Message {
 	return Message{Role: RoleUser, Content: Content{Text: text, Images: images}}
 }
 
+// UserMessageWithAudio creates a user message with text and audio attachments.
+func UserMessageWithAudio(text string, audio ...Audio) Message {
+	return Message{Role: RoleUser, Content: Content{Text: text, Audio: audio}}
+}
+
 // SystemMessage creates a system prompt message.
 func SystemMessage(text string) Message {
 	return Message{Role: RoleSystem, Content: Content{Text: text}}
diff --git a/v2/openai/openai.go b/v2/openai/openai.go
index ab20adf..2676de6 100644
--- a/v2/openai/openai.go
+++ b/v2/openai/openai.go
@@ -3,7 +3,11 @@ package openai
 
 import (
 	"context"
+	"encoding/base64"
 	"fmt"
+	"io"
+	"net/http"
+	"path"
 	"strings"
 
 	"github.com/openai/openai-go"
@@ -206,6 +210,48 @@ func convertMessage(msg provider.Message, model string) openai.ChatCompletionMes
 		}
 	}
 
+	for _, aud := range msg.Audio {
+		var b64Data string
+		var format string
+
+		if aud.Base64 != "" {
+			b64Data = aud.Base64
+			format = audioFormat(aud.ContentType)
+		} else if aud.URL != "" {
+			resp, err := http.Get(aud.URL)
+			if err != nil {
+				continue
+			}
+			data, err := io.ReadAll(resp.Body)
+			resp.Body.Close()
+			if err != nil {
+				continue
+			}
+			b64Data = base64.StdEncoding.EncodeToString(data)
+			ct := resp.Header.Get("Content-Type")
+			if ct == "" {
+				ct = aud.ContentType
+			}
+			if ct == "" {
+				ct = audioFormatFromURL(aud.URL)
+			}
+			format = audioFormat(ct)
+		}
+
+		if b64Data != "" && format != "" {
+			arrayOfContentParts = append(arrayOfContentParts,
+				openai.ChatCompletionContentPartUnionParam{
+					OfInputAudio: &openai.ChatCompletionContentPartInputAudioParam{
+						InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{
+							Data:   b64Data,
+							Format: format,
+						},
+					},
+				},
+			)
+		}
+	}
+
 	if msg.Content != "" {
 		if len(arrayOfContentParts) > 0 {
 			arrayOfContentParts = append(arrayOfContentParts,
@@ -321,3 +367,29 @@ func (p *Provider) convertResponse(resp *openai.ChatCompletion) provider.Respons
 
 	return res
 }
+
+// audioFormat converts a MIME type to an OpenAI audio format string ("wav" or "mp3").
+func audioFormat(contentType string) string {
+	ct := strings.ToLower(contentType)
+	switch {
+	case strings.Contains(ct, "wav"):
+		return "wav"
+	case strings.Contains(ct, "mp3"), strings.Contains(ct, "mpeg"):
+		return "mp3"
+	default:
+		return "wav"
+	}
+}
+
+// audioFormatFromURL guesses the audio MIME type from a URL's file extension.
+func audioFormatFromURL(u string) string {
+	ext := strings.ToLower(path.Ext(u))
+	switch ext {
+	case ".mp3":
+		return "audio/mp3"
+	case ".wav":
+		return "audio/wav"
+	default:
+		return "audio/wav"
+	}
+}
diff --git a/v2/provider/provider.go b/v2/provider/provider.go
index 02e79d7..084ee83 100644
--- a/v2/provider/provider.go
+++ b/v2/provider/provider.go
@@ -8,6 +8,7 @@ type Message struct {
 	Role       string
 	Content    string
 	Images     []Image
+	Audio      []Audio
 	ToolCalls  []ToolCall
 	ToolCallID string
 }
@@ -19,6 +20,13 @@ type Image struct {
 	ContentType string
 }
 
+// Audio represents an audio attachment at the provider level.
+type Audio struct {
+	URL         string
+	Base64      string
+	ContentType string
+}
+
 // ToolCall represents a tool invocation requested by the model.
 type ToolCall struct {
 	ID        string
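
For reference, below is a minimal caller-side sketch of the surface this patch adds. The import path "example.com/yourmodule/v2" is a placeholder for the real module path and the local file name is hypothetical; the patch only defines the Audio struct and the UserMessageWithAudio helper, so how the resulting messages are sent depends on the existing v2 client API, which this change does not touch.

package main

import (
	"encoding/base64"
	"os"

	llm "example.com/yourmodule/v2" // placeholder import path for the v2 package
)

func main() {
	// Attach audio by URL; per this patch, the OpenAI and Google providers
	// download the file themselves, and Anthropic silently drops audio.
	byURL := llm.UserMessageWithAudio(
		"Transcribe this clip.",
		llm.Audio{URL: "https://example.com/clip.wav", ContentType: "audio/wav"},
	)

	// Or attach raw bytes as base64; ContentType is required in this case.
	raw, err := os.ReadFile("clip.mp3") // hypothetical local file
	if err != nil {
		panic(err)
	}
	byBytes := llm.UserMessageWithAudio(
		"Summarize this recording.",
		llm.Audio{Base64: base64.StdEncoding.EncodeToString(raw), ContentType: "audio/mp3"},
	)

	// Pass byURL / byBytes to the v2 client's request alongside any other messages.
	_, _ = byURL, byBytes
}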