feat: add audio input support to v2 providers
Add Audio struct alongside Image for sending audio attachments to multimodal LLMs. OpenAI uses input_audio content parts (wav/mp3), Google Gemini uses genai.NewPartFromBytes, and Anthropic skips audio gracefully since it's not supported. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -217,6 +217,39 @@ func (p *Provider) buildRequest(req provider.Request) ([]*genai.Content, *genai.
|
||||
}
|
||||
}
|
||||
|
||||
// Attach each audio payload on the message as an inline byte part.
// An attachment may arrive either as a remote URL or as base64 data;
// any attachment that fails to load is skipped so that one bad audio
// clip does not abort the whole request.
for _, aud := range msg.Audio {
	switch {
	case aud.URL != "":
		resp, err := http.Get(aud.URL)
		if err != nil {
			continue
		}
		data, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		// Reject read failures and non-2xx responses: an error page
		// body is not audio and must not be sent to the model.
		if err != nil || resp.StatusCode < 200 || resp.StatusCode >= 300 {
			continue
		}

		// Prefer the server-reported MIME type, then the caller-supplied
		// content type, and finally fall back to audio/wav.
		mimeType := resp.Header.Get("Content-Type")
		if mimeType == "" {
			mimeType = aud.ContentType
		}
		if mimeType == "" {
			mimeType = "audio/wav"
		}
		parts = append(parts, genai.NewPartFromBytes(data, mimeType))

	case aud.Base64 != "":
		data, err := base64.StdEncoding.DecodeString(aud.Base64)
		if err != nil {
			continue
		}
		// No response header to consult here; trust the caller's
		// content type, defaulting to audio/wav.
		ct := aud.ContentType
		if ct == "" {
			ct = "audio/wav"
		}
		parts = append(parts, genai.NewPartFromBytes(data, ct))
	}
}
|
||||
|
||||
contents = append(contents, genai.NewContentFromParts(parts, role))
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user