package openai

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/openai/openai-go"
	"github.com/openai/openai-go/option"

	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
)

// Transcriber implements the provider.Transcriber interface using OpenAI's audio models.
type Transcriber struct {
	key     string
	model   string
	baseURL string
}

var _ provider.Transcriber = (*Transcriber)(nil)

// NewTranscriber creates a transcriber backed by OpenAI's audio models.
// If model is empty, "whisper-1" is used by default.
func NewTranscriber(key string, model string) *Transcriber {
	if strings.TrimSpace(model) == "" {
		model = "whisper-1"
	}

	return &Transcriber{
		key:   key,
		model: model,
	}
}

// NewTranscriberWithBaseURL creates a transcriber with a custom API base URL.
func NewTranscriberWithBaseURL(key, model, baseURL string) *Transcriber {
	t := NewTranscriber(key, model)
	t.baseURL = baseURL
	return t
}

// Transcribe performs speech-to-text transcription of WAV audio data.
func (t *Transcriber) Transcribe(ctx context.Context, wav []byte, opts provider.TranscriptionOptions) (provider.Transcription, error) {
	if len(wav) == 0 {
		return provider.Transcription{}, fmt.Errorf("wav data is empty")
	}

	// Default the response format: gpt-4o transcription models only support
	// json, while whisper-1 supports verbose_json with segment/word detail.
	format := opts.ResponseFormat
	if format == "" {
		if strings.HasPrefix(t.model, "gpt-4o") {
			format = provider.TranscriptionResponseFormatJSON
		} else {
			format = provider.TranscriptionResponseFormatVerboseJSON
		}
	}

	if format != provider.TranscriptionResponseFormatJSON && format != provider.TranscriptionResponseFormatVerboseJSON {
		return provider.Transcription{}, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
	}

	if len(opts.TimestampGranularities) > 0 && format != provider.TranscriptionResponseFormatVerboseJSON {
		return provider.Transcription{}, fmt.Errorf("timestamp granularities require response_format=verbose_json")
	}

	params := openai.AudioTranscriptionNewParams{
		File:  openai.File(bytes.NewReader(wav), "audio.wav", "audio/wav"),
		Model: openai.AudioModel(t.model),
	}

	if opts.Language != "" {
		params.Language = openai.String(opts.Language)
	}

	if opts.Prompt != "" {
		params.Prompt = openai.String(opts.Prompt)
	}

	if opts.Temperature != nil {
		params.Temperature = openai.Float(*opts.Temperature)
	}

	params.ResponseFormat = openai.AudioResponseFormat(format)

	if opts.IncludeLogprobs {
		params.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
	}

	if len(opts.TimestampGranularities) > 0 {
		for _, granularity := range opts.TimestampGranularities {
			params.TimestampGranularities = append(params.TimestampGranularities, string(granularity))
		}
	}

	clientOptions := []option.RequestOption{
		option.WithAPIKey(t.key),
	}

	if t.baseURL != "" {
		clientOptions = append(clientOptions, option.WithBaseURL(t.baseURL))
	}

	client := openai.NewClient(clientOptions...)
	resp, err := client.Audio.Transcriptions.New(ctx, params)
	if err != nil {
		return provider.Transcription{}, fmt.Errorf("openai transcription failed: %w", err)
	}

	return transcriptionToResult(t.model, resp), nil
}

// verboseTranscription mirrors the verbose_json response body so segment and
// word timing data can be recovered from the raw JSON payload.
type verboseTranscription struct {
	Text     string           `json:"text"`
	Language string           `json:"language"`
	Duration float64          `json:"duration"`
	Segments []verboseSegment `json:"segments"`
	Words    []verboseWord    `json:"words"`
}

type verboseSegment struct {
	ID               int           `json:"id"`
	Start            float64       `json:"start"`
	End              float64       `json:"end"`
	Text             string        `json:"text"`
	Tokens           []int         `json:"tokens"`
	AvgLogprob       *float64      `json:"avg_logprob"`
	CompressionRatio *float64      `json:"compression_ratio"`
	NoSpeechProb     *float64      `json:"no_speech_prob"`
	Words            []verboseWord `json:"words"`
}

type verboseWord struct {
	Word  string  `json:"word"`
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}

// transcriptionToResult converts the SDK response into the provider-agnostic
// Transcription type, re-parsing the raw JSON to recover verbose_json fields
// (language, duration, segments, words) that the SDK does not surface directly.
func transcriptionToResult(model string, resp *openai.Transcription) provider.Transcription {
	result := provider.Transcription{
		Provider: "openai",
		Model:    model,
	}

	if resp == nil {
		return result
	}

	result.Text = resp.Text
	result.RawJSON = resp.RawJSON()

	for _, logprob := range resp.Logprobs {
		result.Logprobs = append(result.Logprobs, provider.TranscriptionTokenLogprob{
			Token:   logprob.Token,
			Bytes:   logprob.Bytes,
			Logprob: logprob.Logprob,
		})
	}

	if usage := usageToTranscriptionUsage(resp.Usage); usage.Type != "" {
		result.Usage = usage
	}

	if result.RawJSON == "" {
		return result
	}

	var verbose verboseTranscription
	if err := json.Unmarshal([]byte(result.RawJSON), &verbose); err != nil {
		return result
	}

	if verbose.Text != "" {
		result.Text = verbose.Text
	}

	result.Language = verbose.Language
	result.DurationSeconds = verbose.Duration

	for _, seg := range verbose.Segments {
		segment := provider.TranscriptionSegment{
			ID:               seg.ID,
			Start:            seg.Start,
			End:              seg.End,
			Text:             seg.Text,
			Tokens:           append([]int(nil), seg.Tokens...),
			AvgLogprob:       seg.AvgLogprob,
			CompressionRatio: seg.CompressionRatio,
			NoSpeechProb:     seg.NoSpeechProb,
		}

		for _, word := range seg.Words {
			segment.Words = append(segment.Words, provider.TranscriptionWord{
				Word:  word.Word,
				Start: word.Start,
				End:   word.End,
			})
		}

		result.Segments = append(result.Segments, segment)
	}

	for _, word := range verbose.Words {
		result.Words = append(result.Words, provider.TranscriptionWord{
			Word:  word.Word,
			Start: word.Start,
			End:   word.End,
		})
	}

	return result
}

// usageToTranscriptionUsage flattens the SDK's usage union into the provider's
// usage type; unknown usage kinds map to the zero value, which callers treat
// as "no usage reported".
func usageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) provider.TranscriptionUsage {
	switch usage.Type {
	case "tokens":
		tokens := usage.AsTokens()
		return provider.TranscriptionUsage{
			Type:         usage.Type,
			InputTokens:  tokens.InputTokens,
			OutputTokens: tokens.OutputTokens,
			TotalTokens:  tokens.TotalTokens,
			AudioTokens:  tokens.InputTokenDetails.AudioTokens,
			TextTokens:   tokens.InputTokenDetails.TextTokens,
		}
	case "duration":
		duration := usage.AsDuration()
		return provider.TranscriptionUsage{
			Type:    usage.Type,
			Seconds: duration.Seconds,
		}
	default:
		return provider.TranscriptionUsage{}
	}
}
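
// Usage sketch (illustrative only): reading the key from the OPENAI_API_KEY
// environment variable and loading a local "speech.wav" file are assumptions
// made for this example, not requirements of the package.
//
//	t := openai.NewTranscriber(os.Getenv("OPENAI_API_KEY"), "whisper-1")
//	wav, err := os.ReadFile("speech.wav")
//	if err != nil {
//		log.Fatal(err)
//	}
//	tr, err := t.Transcribe(context.Background(), wav, provider.TranscriptionOptions{
//		Language: "en",
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(tr.Text) // segments/words are also populated for verbose_json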