feat: add audio input support to v2 providers
All checks were successful
CI / Lint (push) Successful in 9m37s
CI / Root Module (push) Successful in 10m53s
CI / V2 Module (push) Successful in 11m9s

Add Audio struct alongside Image for sending audio attachments to
multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3),
Google Gemini uses genai.NewPartFromBytes, and Anthropic skips
audio gracefully since it's not supported.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 21:00:56 -05:00
parent fc2218b5fe
commit 7e1705c385
6 changed files with 137 additions and 1 deletions

View File

@@ -204,6 +204,8 @@ func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
} }
} }
// Audio is not supported by Anthropic — skip silently.
// Merge consecutive same-role messages (Anthropic requires alternating) // Merge consecutive same-role messages (Anthropic requires alternating)
if len(msgs) > 0 && msgs[len(msgs)-1].Role == role { if len(msgs) > 0 && msgs[len(msgs)-1].Role == role {
msgs[len(msgs)-1].Content = append(msgs[len(msgs)-1].Content, m.Content...) msgs[len(msgs)-1].Content = append(msgs[len(msgs)-1].Content, m.Content...)

View File

@@ -217,6 +217,39 @@ func (p *Provider) buildRequest(req provider.Request) ([]*genai.Content, *genai.
} }
} }
// Convert audio attachments into inline genai parts. Any attachment
// that cannot be fetched or decoded is skipped silently rather than
// failing the whole request.
for _, aud := range msg.Audio {
	if aud.URL != "" {
		// NOTE(review): uses the default client (no timeout) and does
		// not check resp.StatusCode — an error page body would be sent
		// to Gemini as "audio". Confirm this matches the image path.
		resp, err := http.Get(aud.URL)
		if err != nil {
			continue // unreachable URL: drop the attachment
		}
		data, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			continue // truncated body: drop the attachment
		}
		// MIME type preference: server-reported, then caller-supplied,
		// then assume WAV.
		mimeType := resp.Header.Get("Content-Type")
		if mimeType == "" {
			mimeType = aud.ContentType
		}
		if mimeType == "" {
			mimeType = "audio/wav"
		}
		parts = append(parts, genai.NewPartFromBytes(data, mimeType))
	} else if aud.Base64 != "" {
		data, err := base64.StdEncoding.DecodeString(aud.Base64)
		if err != nil {
			continue // malformed base64: drop the attachment
		}
		ct := aud.ContentType
		if ct == "" {
			ct = "audio/wav" // default when the caller gave no type
		}
		parts = append(parts, genai.NewPartFromBytes(data, ct))
	}
}
contents = append(contents, genai.NewContentFromParts(parts, role)) contents = append(contents, genai.NewContentFromParts(parts, role))
} }

View File

@@ -140,6 +140,13 @@ func convertMessages(msgs []Message) []provider.Message {
ContentType: img.ContentType, ContentType: img.ContentType,
}) })
} }
// Copy audio attachments through to the provider-level message,
// mirroring the image conversion above.
for _, aud := range m.Content.Audio {
	pm.Audio = append(pm.Audio, provider.Audio{
		URL:         aud.URL,
		Base64:      aud.Base64,
		ContentType: aud.ContentType,
	})
}
for _, tc := range m.ToolCalls { for _, tc := range m.ToolCalls {
pm.ToolCalls = append(pm.ToolCalls, provider.ToolCall{ pm.ToolCalls = append(pm.ToolCalls, provider.ToolCall{
ID: tc.ID, ID: tc.ID,

View File

@@ -18,10 +18,19 @@ type Image struct {
ContentType string // MIME type (e.g., "image/png"), required for Base64 ContentType string // MIME type (e.g., "image/png"), required for Base64
} }
// Content represents message content with optional text and images. // Audio represents an audio attachment.
type Audio struct {
// Provide exactly one of URL or Base64.
URL string // HTTP(S) URL to audio file
Base64 string // Raw base64-encoded audio data
ContentType string // MIME type (e.g., "audio/wav", "audio/mp3")
}
// Content represents message content with optional text, images, and audio.
type Content struct { type Content struct {
Text string Text string
Images []Image Images []Image
Audio []Audio
} }
// ToolCall represents a tool invocation requested by the assistant. // ToolCall represents a tool invocation requested by the assistant.
@@ -53,6 +62,11 @@ func UserMessageWithImages(text string, images ...Image) Message {
return Message{Role: RoleUser, Content: Content{Text: text, Images: images}} return Message{Role: RoleUser, Content: Content{Text: text, Images: images}}
} }
// UserMessageWithAudio creates a user message with text and audio attachments.
func UserMessageWithAudio(text string, audio ...Audio) Message {
	content := Content{Text: text, Audio: audio}
	return Message{Role: RoleUser, Content: content}
}
// SystemMessage creates a system prompt message. // SystemMessage creates a system prompt message.
func SystemMessage(text string) Message { func SystemMessage(text string) Message {
return Message{Role: RoleSystem, Content: Content{Text: text}} return Message{Role: RoleSystem, Content: Content{Text: text}}

View File

@@ -3,7 +3,11 @@ package openai
import ( import (
"context" "context"
"encoding/base64"
"fmt" "fmt"
"io"
"net/http"
"path"
"strings" "strings"
"github.com/openai/openai-go" "github.com/openai/openai-go"
@@ -206,6 +210,48 @@ func convertMessage(msg provider.Message, model string) openai.ChatCompletionMes
} }
} }
// Convert audio attachments into OpenAI input_audio content parts.
// OpenAI requires base64 data plus a format tag ("wav"/"mp3"), so URL
// attachments are downloaded and re-encoded; failures drop the
// attachment silently.
for _, aud := range msg.Audio {
	var b64Data string
	var format string
	if aud.Base64 != "" {
		// Already base64 — only the format needs deriving.
		b64Data = aud.Base64
		format = audioFormat(aud.ContentType)
	} else if aud.URL != "" {
		// NOTE(review): uses the default client (no timeout/context)
		// and does not check resp.StatusCode — an error page would be
		// encoded and sent as audio. Confirm intended.
		resp, err := http.Get(aud.URL)
		if err != nil {
			continue // unreachable URL: drop the attachment
		}
		data, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			continue // truncated body: drop the attachment
		}
		b64Data = base64.StdEncoding.EncodeToString(data)
		// MIME type preference: server-reported, then caller-supplied,
		// then guessed from the URL's file extension.
		ct := resp.Header.Get("Content-Type")
		if ct == "" {
			ct = aud.ContentType
		}
		if ct == "" {
			ct = audioFormatFromURL(aud.URL)
		}
		format = audioFormat(ct)
	}
	// Only emit a part when both pieces were resolved (an Audio with
	// neither URL nor Base64 set leaves both empty).
	if b64Data != "" && format != "" {
		arrayOfContentParts = append(arrayOfContentParts,
			openai.ChatCompletionContentPartUnionParam{
				OfInputAudio: &openai.ChatCompletionContentPartInputAudioParam{
					InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{
						Data:   b64Data,
						Format: format,
					},
				},
			},
		)
	}
}
if msg.Content != "" { if msg.Content != "" {
if len(arrayOfContentParts) > 0 { if len(arrayOfContentParts) > 0 {
arrayOfContentParts = append(arrayOfContentParts, arrayOfContentParts = append(arrayOfContentParts,
@@ -321,3 +367,29 @@ func (p *Provider) convertResponse(resp *openai.ChatCompletion) provider.Respons
return res return res
} }
// audioFormat converts a MIME type to an OpenAI audio format string ("wav" or "mp3").
// audioFormat converts a MIME type to an OpenAI audio format string ("wav" or "mp3").
// Unrecognized or empty content types fall back to "wav".
func audioFormat(contentType string) string {
	lowered := strings.ToLower(contentType)
	if strings.Contains(lowered, "wav") {
		return "wav"
	}
	if strings.Contains(lowered, "mp3") || strings.Contains(lowered, "mpeg") {
		return "mp3"
	}
	// Default: treat anything else as WAV.
	return "wav"
}
// audioFormatFromURL guesses the audio format from a URL's file extension.
func audioFormatFromURL(u string) string {
ext := strings.ToLower(path.Ext(u))
switch ext {
case ".mp3":
return "audio/mp3"
case ".wav":
return "audio/wav"
default:
return "audio/wav"
}
}

View File

@@ -8,6 +8,7 @@ type Message struct {
Role string Role string
Content string Content string
Images []Image Images []Image
Audio []Audio
ToolCalls []ToolCall ToolCalls []ToolCall
ToolCallID string ToolCallID string
} }
@@ -19,6 +20,13 @@ type Image struct {
ContentType string ContentType string
} }
// Audio represents an audio attachment at the provider level.
// Callers should set exactly one of URL or Base64.
type Audio struct {
	URL string // HTTP(S) URL to the audio file
	Base64 string // raw base64-encoded audio data
	ContentType string // MIME type, e.g. "audio/wav" or "audio/mp3"
}
// ToolCall represents a tool invocation requested by the model. // ToolCall represents a tool invocation requested by the model.
type ToolCall struct { type ToolCall struct {
ID string ID string