feat: add audio input support to v2 providers
Add Audio struct alongside Image for sending audio attachments to multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3), Google Gemini uses genai.NewPartFromBytes, and Anthropic skips audio gracefully since it's not supported. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,11 @@ package openai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/openai/openai-go"
|
||||
@@ -206,6 +210,48 @@ func convertMessage(msg provider.Message, model string) openai.ChatCompletionMes
|
||||
}
|
||||
}
|
||||
|
||||
for _, aud := range msg.Audio {
|
||||
var b64Data string
|
||||
var format string
|
||||
|
||||
if aud.Base64 != "" {
|
||||
b64Data = aud.Base64
|
||||
format = audioFormat(aud.ContentType)
|
||||
} else if aud.URL != "" {
|
||||
resp, err := http.Get(aud.URL)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
b64Data = base64.StdEncoding.EncodeToString(data)
|
||||
ct := resp.Header.Get("Content-Type")
|
||||
if ct == "" {
|
||||
ct = aud.ContentType
|
||||
}
|
||||
if ct == "" {
|
||||
ct = audioFormatFromURL(aud.URL)
|
||||
}
|
||||
format = audioFormat(ct)
|
||||
}
|
||||
|
||||
if b64Data != "" && format != "" {
|
||||
arrayOfContentParts = append(arrayOfContentParts,
|
||||
openai.ChatCompletionContentPartUnionParam{
|
||||
OfInputAudio: &openai.ChatCompletionContentPartInputAudioParam{
|
||||
InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{
|
||||
Data: b64Data,
|
||||
Format: format,
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if msg.Content != "" {
|
||||
if len(arrayOfContentParts) > 0 {
|
||||
arrayOfContentParts = append(arrayOfContentParts,
|
||||
@@ -321,3 +367,29 @@ func (p *Provider) convertResponse(resp *openai.ChatCompletion) provider.Respons
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
// audioFormat maps a MIME content type onto the audio format name placed in
// the OpenAI input_audio content part. Matching is case-insensitive and
// substring-based, so parameterised or vendor-prefixed types such as
// "audio/x-wav" or "audio/mpeg; charset=binary" are recognised. Anything that
// names neither wav nor mp3/mpeg (including the empty string) is reported as
// "wav".
func audioFormat(contentType string) string {
	lowered := strings.ToLower(contentType)

	// The wav check runs first, so a type mentioning both formats is wav.
	if strings.Contains(lowered, "wav") {
		return "wav"
	}
	if strings.Contains(lowered, "mp3") || strings.Contains(lowered, "mpeg") {
		return "mp3"
	}

	// Unrecognised types fall back to wav.
	return "wav"
}
|
||||
|
||||
// audioFormatFromURL guesses an audio MIME type from the file extension in u.
//
// The extension is taken from the part of u that precedes any query string
// ("?...") or fragment ("#..."), so a signed or parameterised URL such as
// "https://host/clip.mp3?sig=abc" is still recognised as mp3. If that part
// carries no recognised extension, the raw string is examined as a fallback so
// that inputs like "https://host/download?file=clip.mp3" keep resolving the
// way they did before query strings were stripped.
//
// It returns "audio/mp3" for ".mp3" and "audio/wav" for ".wav" (both matched
// case-insensitively); every other input, including the empty string, yields
// "audio/wav".
func audioFormatFromURL(u string) string {
	// path.Ext works on the text after the last '.', so a trailing query or
	// fragment would otherwise become part of the "extension".
	pathOnly := u
	if i := strings.IndexAny(u, "?#"); i >= 0 {
		pathOnly = u[:i]
	}

	for _, candidate := range []string{pathOnly, u} {
		switch strings.ToLower(path.Ext(candidate)) {
		case ".mp3":
			return "audio/mp3"
		case ".wav":
			return "audio/wav"
		}
	}

	// Unknown or missing extension: default to wav.
	return "audio/wav"
}
|
||||
|
||||
Reference in New Issue
Block a user