package llm

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/openai/openai-go"
	"github.com/openai/openai-go/option"
)

// openaiTranscriber implements Transcriber on top of OpenAI's audio
// transcription endpoint.
type openaiTranscriber struct {
	key     string // API key supplied to the client via option.WithAPIKey
	model   string // audio model name, e.g. "whisper-1" or "gpt-4o-transcribe"
	baseUrl string // optional base-URL override; empty means the SDK default
}

var _ Transcriber = openaiTranscriber{}

// OpenAITranscriber creates a transcriber backed by OpenAI's audio models.
// If model is empty, whisper-1 is used by default.
func OpenAITranscriber(key string, model string) Transcriber {
	if strings.TrimSpace(model) == "" {
		model = "whisper-1"
	}
	return openaiTranscriber{
		key:   key,
		model: model,
	}
}

// Transcribe uploads the WAV payload to OpenAI's transcription endpoint and
// returns a normalized Transcription. Only json and verbose_json response
// formats are accepted, because the result must be structured; timestamp
// granularities additionally require verbose_json (an OpenAI API constraint).
func (o openaiTranscriber) Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error) {
	if len(wav) == 0 {
		return Transcription{}, fmt.Errorf("wav data is empty")
	}

	format := opts.ResponseFormat
	if format == "" {
		// gpt-4o audio models only support plain json; whisper-style models
		// can return verbose_json with segments/words, so prefer that.
		if strings.HasPrefix(o.model, "gpt-4o") {
			format = TranscriptionResponseFormatJSON
		} else {
			format = TranscriptionResponseFormatVerboseJSON
		}
	}
	if format != TranscriptionResponseFormatJSON && format != TranscriptionResponseFormatVerboseJSON {
		return Transcription{}, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
	}
	if len(opts.TimestampGranularities) > 0 && format != TranscriptionResponseFormatVerboseJSON {
		return Transcription{}, fmt.Errorf("timestamp granularities require response_format=verbose_json")
	}

	// The OpenAI API detects the audio container from the uploaded filename's
	// extension, so the multipart part must carry a name — a bare
	// bytes.NewReader would be sent anonymously and can be rejected as an
	// invalid file format.
	params := openai.AudioTranscriptionNewParams{
		File:  openai.File(bytes.NewReader(wav), "audio.wav", "audio/wav"),
		Model: openai.AudioModel(o.model),
	}
	if opts.Language != "" {
		params.Language = openai.String(opts.Language)
	}
	if opts.Prompt != "" {
		params.Prompt = openai.String(opts.Prompt)
	}
	if opts.Temperature != nil {
		params.Temperature = openai.Float(*opts.Temperature)
	}
	params.ResponseFormat = openai.AudioResponseFormat(format)
	if opts.IncludeLogprobs {
		params.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
	}
	// Ranging over an empty slice is a no-op, so no length guard is needed.
	for _, granularity := range opts.TimestampGranularities {
		params.TimestampGranularities = append(params.TimestampGranularities, string(granularity))
	}

	clientOptions := []option.RequestOption{
		option.WithAPIKey(o.key),
	}
	if o.baseUrl != "" {
		clientOptions = append(clientOptions, option.WithBaseURL(o.baseUrl))
	}
	client := openai.NewClient(clientOptions...)

	resp, err := client.Audio.Transcriptions.New(ctx, params)
	if err != nil {
		return Transcription{}, fmt.Errorf("openai transcription failed: %w", err)
	}
	return openaiTranscriptionToResult(o.model, resp), nil
}

// openaiVerboseTranscription mirrors the verbose_json response body so the raw
// JSON can be decoded into segments, words, language and duration.
type openaiVerboseTranscription struct {
	Text     string                 `json:"text"`
	Language string                 `json:"language"`
	Duration float64                `json:"duration"`
	Segments []openaiVerboseSegment `json:"segments"`
	Words    []openaiVerboseWord    `json:"words"`
}

// openaiVerboseSegment is one segment entry of a verbose_json response.
// The probability fields are pointers so that "absent" is distinguishable
// from zero.
type openaiVerboseSegment struct {
	ID               int                 `json:"id"`
	Start            float64             `json:"start"`
	End              float64             `json:"end"`
	Text             string              `json:"text"`
	Tokens           []int               `json:"tokens"`
	AvgLogprob       *float64            `json:"avg_logprob"`
	CompressionRatio *float64            `json:"compression_ratio"`
	NoSpeechProb     *float64            `json:"no_speech_prob"`
	Words            []openaiVerboseWord `json:"words"`
}

// openaiVerboseWord is a single word with its start/end offsets in seconds.
type openaiVerboseWord struct {
	Word  string  `json:"word"`
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}

// openaiTranscriptionToResult converts the SDK response into the package's
// Transcription type. The SDK only surfaces text/logprobs/usage directly, so
// the verbose_json extras (language, duration, segments, words) are recovered
// by re-decoding the raw JSON body; decode failures are silently ignored and
// the partially-filled result is returned (best-effort enrichment).
func openaiTranscriptionToResult(model string, resp *openai.Transcription) Transcription {
	result := Transcription{
		Provider: "openai",
		Model:    model,
	}
	if resp == nil {
		return result
	}
	result.Text = resp.Text
	result.RawJSON = resp.RawJSON()
	for _, logprob := range resp.Logprobs {
		result.Logprobs = append(result.Logprobs, TranscriptionTokenLogprob{
			Token:   logprob.Token,
			Bytes:   logprob.Bytes,
			Logprob: logprob.Logprob,
		})
	}
	if usage := openaiUsageToTranscriptionUsage(resp.Usage); usage.Type != "" {
		result.Usage = usage
	}
	if result.RawJSON == "" {
		return result
	}
	var verbose openaiVerboseTranscription
	if err := json.Unmarshal([]byte(result.RawJSON), &verbose); err != nil {
		// Not verbose_json (or malformed) — keep what the SDK gave us.
		return result
	}
	if verbose.Text != "" {
		result.Text = verbose.Text
	}
	result.Language = verbose.Language
	result.DurationSeconds = verbose.Duration
	for _, seg := range verbose.Segments {
		segment := TranscriptionSegment{
			ID:    seg.ID,
			Start: seg.Start,
			End:   seg.End,
			Text:  seg.Text,
			// Copy so the result does not alias the decode buffer.
			Tokens:           append([]int(nil), seg.Tokens...),
			AvgLogprob:       seg.AvgLogprob,
			CompressionRatio: seg.CompressionRatio,
			NoSpeechProb:     seg.NoSpeechProb,
		}
		for _, word := range seg.Words {
			segment.Words = append(segment.Words, TranscriptionWord{
				Word:  word.Word,
				Start: word.Start,
				End:   word.End,
			})
		}
		result.Segments = append(result.Segments, segment)
	}
	for _, word := range verbose.Words {
		result.Words = append(result.Words, TranscriptionWord{
			Word:  word.Word,
			Start: word.Start,
			End:   word.End,
		})
	}
	return result
}

// openaiUsageToTranscriptionUsage flattens the SDK's usage union into the
// package's TranscriptionUsage. Unknown union types yield the zero value,
// which callers detect via an empty Type field.
func openaiUsageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) TranscriptionUsage {
	switch usage.Type {
	case "tokens":
		tokens := usage.AsTokens()
		return TranscriptionUsage{
			Type:         usage.Type,
			InputTokens:  tokens.InputTokens,
			OutputTokens: tokens.OutputTokens,
			TotalTokens:  tokens.TotalTokens,
			AudioTokens:  tokens.InputTokenDetails.AudioTokens,
			TextTokens:   tokens.InputTokenDetails.TextTokens,
		}
	case "duration":
		duration := usage.AsDuration()
		return TranscriptionUsage{
			Type:    usage.Type,
			Seconds: duration.Seconds,
		}
	default:
		return TranscriptionUsage{}
	}
}