From 7e1705c385af42bbb3dfe8b8fffa59a61b072305 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Sun, 8 Feb 2026 21:00:56 -0500
Subject: [PATCH] feat: add audio input support to v2 providers

Add Audio struct alongside Image for sending audio attachments to
multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3),
Google Gemini uses genai.NewPartFromBytes, and Anthropic skips audio
gracefully since it's not supported.

Co-Authored-By: Claude Opus 4.6
---
 v2/anthropic/anthropic.go |  2 ++
 v2/google/google.go       | 33 ++++++++++++++++++
 v2/llm.go                 |  7 ++++
 v2/message.go             | 16 ++++++++-
 v2/openai/openai.go       | 72 +++++++++++++++++++++++++++++++++++++++
 v2/provider/provider.go   |  8 +++++
 6 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/v2/anthropic/anthropic.go b/v2/anthropic/anthropic.go
index beaf64b..201092b 100644
--- a/v2/anthropic/anthropic.go
+++ b/v2/anthropic/anthropic.go
@@ -204,6 +204,8 @@ func (p *Provider) buildRequest(req provider.Request) anth.MessagesRequest {
 			}
 		}
 
+		// Audio is not supported by Anthropic — skip silently.
+
 		// Merge consecutive same-role messages (Anthropic requires alternating)
 		if len(msgs) > 0 && msgs[len(msgs)-1].Role == role {
 			msgs[len(msgs)-1].Content = append(msgs[len(msgs)-1].Content, m.Content...)
diff --git a/v2/google/google.go b/v2/google/google.go
index 180f721..2f99060 100644
--- a/v2/google/google.go
+++ b/v2/google/google.go
@@ -217,6 +217,39 @@ func (p *Provider) buildRequest(req provider.Request) ([]*genai.Content, *genai.
 			}
 		}
 
+		for _, aud := range msg.Audio {
+			if aud.URL != "" {
+				resp, err := http.Get(aud.URL)
+				if err != nil {
+					continue
+				}
+				data, err := io.ReadAll(resp.Body)
+				resp.Body.Close()
+				if err != nil {
+					continue
+				}
+
+				mimeType := resp.Header.Get("Content-Type")
+				if mimeType == "" {
+					mimeType = aud.ContentType
+				}
+				if mimeType == "" {
+					mimeType = "audio/wav"
+				}
+				parts = append(parts, genai.NewPartFromBytes(data, mimeType))
+			} else if aud.Base64 != "" {
+				data, err := base64.StdEncoding.DecodeString(aud.Base64)
+				if err != nil {
+					continue
+				}
+				ct := aud.ContentType
+				if ct == "" {
+					ct = "audio/wav"
+				}
+				parts = append(parts, genai.NewPartFromBytes(data, ct))
+			}
+		}
+
 		contents = append(contents, genai.NewContentFromParts(parts, role))
 	}
 
diff --git a/v2/llm.go b/v2/llm.go
index 74075ec..388565d 100644
--- a/v2/llm.go
+++ b/v2/llm.go
@@ -140,6 +140,13 @@ func convertMessages(msgs []Message) []provider.Message {
 				ContentType: img.ContentType,
 			})
 		}
+		for _, aud := range m.Content.Audio {
+			pm.Audio = append(pm.Audio, provider.Audio{
+				URL:         aud.URL,
+				Base64:      aud.Base64,
+				ContentType: aud.ContentType,
+			})
+		}
 		for _, tc := range m.ToolCalls {
 			pm.ToolCalls = append(pm.ToolCalls, provider.ToolCall{
 				ID:        tc.ID,
diff --git a/v2/message.go b/v2/message.go
index 43185b5..1a24148 100644
--- a/v2/message.go
+++ b/v2/message.go
@@ -18,10 +18,19 @@ type Image struct {
 	ContentType string // MIME type (e.g., "image/png"), required for Base64
 }
 
-// Content represents message content with optional text and images.
+// Audio represents an audio attachment.
+type Audio struct {
+	// Provide exactly one of URL or Base64.
+	URL         string // HTTP(S) URL to audio file
+	Base64      string // Raw base64-encoded audio data
+	ContentType string // MIME type (e.g., "audio/wav", "audio/mp3")
+}
+
+// Content represents message content with optional text, images, and audio.
 type Content struct {
 	Text   string
 	Images []Image
+	Audio  []Audio
 }
 
 // ToolCall represents a tool invocation requested by the assistant.
@@ -53,6 +62,11 @@ func UserMessageWithImages(text string, images ...Image) Message {
 	return Message{Role: RoleUser, Content: Content{Text: text, Images: images}}
 }
 
+// UserMessageWithAudio creates a user message with text and audio attachments.
+func UserMessageWithAudio(text string, audio ...Audio) Message {
+	return Message{Role: RoleUser, Content: Content{Text: text, Audio: audio}}
+}
+
 // SystemMessage creates a system prompt message.
 func SystemMessage(text string) Message {
 	return Message{Role: RoleSystem, Content: Content{Text: text}}
diff --git a/v2/openai/openai.go b/v2/openai/openai.go
index ab20adf..2676de6 100644
--- a/v2/openai/openai.go
+++ b/v2/openai/openai.go
@@ -3,7 +3,11 @@ package openai
 
 import (
 	"context"
+	"encoding/base64"
 	"fmt"
+	"io"
+	"net/http"
+	"path"
 	"strings"
 
 	"github.com/openai/openai-go"
@@ -206,6 +210,48 @@ func convertMessage(msg provider.Message, model string) openai.ChatCompletionMes
 		}
 	}
 
+	for _, aud := range msg.Audio {
+		var b64Data string
+		var format string
+
+		if aud.Base64 != "" {
+			b64Data = aud.Base64
+			format = audioFormat(aud.ContentType)
+		} else if aud.URL != "" {
+			resp, err := http.Get(aud.URL)
+			if err != nil {
+				continue
+			}
+			data, err := io.ReadAll(resp.Body)
+			resp.Body.Close()
+			if err != nil {
+				continue
+			}
+			b64Data = base64.StdEncoding.EncodeToString(data)
+			ct := resp.Header.Get("Content-Type")
+			if ct == "" {
+				ct = aud.ContentType
+			}
+			if ct == "" {
+				ct = audioFormatFromURL(aud.URL)
+			}
+			format = audioFormat(ct)
+		}
+
+		if b64Data != "" && format != "" {
+			arrayOfContentParts = append(arrayOfContentParts,
+				openai.ChatCompletionContentPartUnionParam{
+					OfInputAudio: &openai.ChatCompletionContentPartInputAudioParam{
+						InputAudio: openai.ChatCompletionContentPartInputAudioInputAudioParam{
+							Data:   b64Data,
+							Format: format,
+						},
+					},
+				},
+			)
+		}
+	}
+
 	if msg.Content != "" {
 		if len(arrayOfContentParts) > 0 {
 			arrayOfContentParts = append(arrayOfContentParts,
@@ -321,3 +367,29 @@ func (p *Provider) convertResponse(resp *openai.ChatCompletion) provider.Respons
 
 	return res
 }
+
+// audioFormat converts a MIME type to an OpenAI audio format string ("wav" or "mp3").
+func audioFormat(contentType string) string {
+	ct := strings.ToLower(contentType)
+	switch {
+	case strings.Contains(ct, "wav"):
+		return "wav"
+	case strings.Contains(ct, "mp3"), strings.Contains(ct, "mpeg"):
+		return "mp3"
+	default:
+		return "wav"
+	}
+}
+
+// audioFormatFromURL guesses the audio MIME type from a URL's file extension.
+func audioFormatFromURL(u string) string {
+	ext := strings.ToLower(path.Ext(u))
+	switch ext {
+	case ".mp3":
+		return "audio/mp3"
+	case ".wav":
+		return "audio/wav"
+	default:
+		return "audio/wav"
+	}
+}
diff --git a/v2/provider/provider.go b/v2/provider/provider.go
index 02e79d7..084ee83 100644
--- a/v2/provider/provider.go
+++ b/v2/provider/provider.go
@@ -8,6 +8,7 @@ type Message struct {
 	Role       string
 	Content    string
 	Images     []Image
+	Audio      []Audio
 	ToolCalls  []ToolCall
 	ToolCallID string
 }
@@ -19,6 +20,13 @@ type Image struct {
 	ContentType string
 }
 
+// Audio represents an audio attachment at the provider level.
+type Audio struct {
+	URL         string
+	Base64      string
+	ContentType string
+}
+
 // ToolCall represents a tool invocation requested by the model.
 type ToolCall struct {
 	ID        string
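
For reference, below is a minimal caller-side sketch of the surface this patch adds. The import path "example.com/yourmodule/v2" is a placeholder for the real module path and the local file name is hypothetical; the patch only defines the Audio struct and the UserMessageWithAudio helper, so how the resulting messages are sent depends on the existing v2 client API, which this change does not touch.

package main

import (
	"encoding/base64"
	"os"

	llm "example.com/yourmodule/v2" // placeholder import path for the v2 package
)

func main() {
	// Attach audio by URL; per this patch, the OpenAI and Google providers
	// download the file themselves, and Anthropic silently drops audio.
	byURL := llm.UserMessageWithAudio(
		"Transcribe this clip.",
		llm.Audio{URL: "https://example.com/clip.wav", ContentType: "audio/wav"},
	)

	// Or attach raw bytes as base64; ContentType is required in this case.
	raw, err := os.ReadFile("clip.mp3") // hypothetical local file
	if err != nil {
		panic(err)
	}
	byBytes := llm.UserMessageWithAudio(
		"Summarize this recording.",
		llm.Audio{Base64: base64.StdEncoding.EncodeToString(raw), ContentType: "audio/mp3"},
	)

	// Pass byURL / byBytes to the v2 client's request alongside any other messages.
	_, _ = byURL, byBytes
}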