Migrate speech-to-text transcription types and OpenAI transcriber implementation from v1. Types are defined in provider/ to avoid import cycles and re-exported via type aliases from the root package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
231 lines
6.2 KiB
Go
package openai
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/openai/openai-go"
|
|
"github.com/openai/openai-go/option"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
|
|
)
|
|
|
|
// Transcriber implements the provider.Transcriber interface using OpenAI's audio models.
//
// It is a small value type: each call to Transcribe builds a fresh SDK client
// from the stored credentials, so a Transcriber is safe to copy and reuse.
type Transcriber struct {
	key     string // OpenAI API key used to authenticate requests
	model   string // audio model name, e.g. "whisper-1" or a "gpt-4o*" transcription model
	baseURL string // optional custom API base URL; empty means the SDK default endpoint
}

// Compile-time assertion that *Transcriber satisfies provider.Transcriber.
var _ provider.Transcriber = (*Transcriber)(nil)
|
|
|
// NewTranscriber creates a transcriber backed by OpenAI's audio models.
|
|
// If model is empty, "whisper-1" is used by default.
|
|
func NewTranscriber(key string, model string) *Transcriber {
|
|
if strings.TrimSpace(model) == "" {
|
|
model = "whisper-1"
|
|
}
|
|
return &Transcriber{
|
|
key: key,
|
|
model: model,
|
|
}
|
|
}
|
|
|
|
// NewTranscriberWithBaseURL creates a transcriber with a custom API base URL.
|
|
func NewTranscriberWithBaseURL(key, model, baseURL string) *Transcriber {
|
|
t := NewTranscriber(key, model)
|
|
t.baseURL = baseURL
|
|
return t
|
|
}
|
|
|
|
// Transcribe performs speech-to-text transcription of WAV audio data.
|
|
func (t *Transcriber) Transcribe(ctx context.Context, wav []byte, opts provider.TranscriptionOptions) (provider.Transcription, error) {
|
|
if len(wav) == 0 {
|
|
return provider.Transcription{}, fmt.Errorf("wav data is empty")
|
|
}
|
|
|
|
format := opts.ResponseFormat
|
|
if format == "" {
|
|
if strings.HasPrefix(t.model, "gpt-4o") {
|
|
format = provider.TranscriptionResponseFormatJSON
|
|
} else {
|
|
format = provider.TranscriptionResponseFormatVerboseJSON
|
|
}
|
|
}
|
|
|
|
if format != provider.TranscriptionResponseFormatJSON && format != provider.TranscriptionResponseFormatVerboseJSON {
|
|
return provider.Transcription{}, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
|
|
}
|
|
|
|
if len(opts.TimestampGranularities) > 0 && format != provider.TranscriptionResponseFormatVerboseJSON {
|
|
return provider.Transcription{}, fmt.Errorf("timestamp granularities require response_format=verbose_json")
|
|
}
|
|
|
|
params := openai.AudioTranscriptionNewParams{
|
|
File: openai.File(bytes.NewReader(wav), "audio.wav", "audio/wav"),
|
|
Model: openai.AudioModel(t.model),
|
|
}
|
|
|
|
if opts.Language != "" {
|
|
params.Language = openai.String(opts.Language)
|
|
}
|
|
if opts.Prompt != "" {
|
|
params.Prompt = openai.String(opts.Prompt)
|
|
}
|
|
if opts.Temperature != nil {
|
|
params.Temperature = openai.Float(*opts.Temperature)
|
|
}
|
|
|
|
params.ResponseFormat = openai.AudioResponseFormat(format)
|
|
|
|
if opts.IncludeLogprobs {
|
|
params.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
|
|
}
|
|
|
|
if len(opts.TimestampGranularities) > 0 {
|
|
for _, granularity := range opts.TimestampGranularities {
|
|
params.TimestampGranularities = append(params.TimestampGranularities, string(granularity))
|
|
}
|
|
}
|
|
|
|
clientOptions := []option.RequestOption{
|
|
option.WithAPIKey(t.key),
|
|
}
|
|
if t.baseURL != "" {
|
|
clientOptions = append(clientOptions, option.WithBaseURL(t.baseURL))
|
|
}
|
|
|
|
client := openai.NewClient(clientOptions...)
|
|
resp, err := client.Audio.Transcriptions.New(ctx, params)
|
|
if err != nil {
|
|
return provider.Transcription{}, fmt.Errorf("openai transcription failed: %w", err)
|
|
}
|
|
|
|
return transcriptionToResult(t.model, resp), nil
|
|
}
|
|
|
|
// verboseTranscription mirrors the subset of OpenAI's verbose_json
// transcription response body that this package consumes when re-parsing the
// raw response JSON.
type verboseTranscription struct {
	Text     string           `json:"text"`     // full transcript text
	Language string           `json:"language"` // language reported by the API
	Duration float64          `json:"duration"` // audio duration in seconds
	Segments []verboseSegment `json:"segments"` // per-segment transcription details
	Words    []verboseWord    `json:"words"`    // word-level timestamps, present when requested
}
|
|
|
|
// verboseSegment mirrors one entry of the "segments" array in a verbose_json
// transcription response. Probability/ratio fields are pointers so that a
// field absent from the JSON stays nil rather than becoming 0.
type verboseSegment struct {
	ID               int           `json:"id"`                // segment index within the transcript
	Start            float64       `json:"start"`             // segment start time in seconds
	End              float64       `json:"end"`               // segment end time in seconds
	Text             string        `json:"text"`              // text of this segment
	Tokens           []int         `json:"tokens"`            // model token IDs for the segment
	AvgLogprob       *float64      `json:"avg_logprob"`       // nil when not provided by the API
	CompressionRatio *float64      `json:"compression_ratio"` // nil when not provided by the API
	NoSpeechProb     *float64      `json:"no_speech_prob"`    // nil when not provided by the API
	Words            []verboseWord `json:"words"`             // word timings scoped to this segment
}
|
|
|
|
// verboseWord mirrors one entry of a "words" array in a verbose_json
// transcription response: a single word with its start/end times in seconds.
type verboseWord struct {
	Word  string  `json:"word"`
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}
|
|
|
|
func transcriptionToResult(model string, resp *openai.Transcription) provider.Transcription {
|
|
result := provider.Transcription{
|
|
Provider: "openai",
|
|
Model: model,
|
|
}
|
|
if resp == nil {
|
|
return result
|
|
}
|
|
|
|
result.Text = resp.Text
|
|
result.RawJSON = resp.RawJSON()
|
|
|
|
for _, logprob := range resp.Logprobs {
|
|
result.Logprobs = append(result.Logprobs, provider.TranscriptionTokenLogprob{
|
|
Token: logprob.Token,
|
|
Bytes: logprob.Bytes,
|
|
Logprob: logprob.Logprob,
|
|
})
|
|
}
|
|
|
|
if usage := usageToTranscriptionUsage(resp.Usage); usage.Type != "" {
|
|
result.Usage = usage
|
|
}
|
|
|
|
if result.RawJSON == "" {
|
|
return result
|
|
}
|
|
|
|
var verbose verboseTranscription
|
|
if err := json.Unmarshal([]byte(result.RawJSON), &verbose); err != nil {
|
|
return result
|
|
}
|
|
|
|
if verbose.Text != "" {
|
|
result.Text = verbose.Text
|
|
}
|
|
result.Language = verbose.Language
|
|
result.DurationSeconds = verbose.Duration
|
|
|
|
for _, seg := range verbose.Segments {
|
|
segment := provider.TranscriptionSegment{
|
|
ID: seg.ID,
|
|
Start: seg.Start,
|
|
End: seg.End,
|
|
Text: seg.Text,
|
|
Tokens: append([]int(nil), seg.Tokens...),
|
|
AvgLogprob: seg.AvgLogprob,
|
|
CompressionRatio: seg.CompressionRatio,
|
|
NoSpeechProb: seg.NoSpeechProb,
|
|
}
|
|
|
|
for _, word := range seg.Words {
|
|
segment.Words = append(segment.Words, provider.TranscriptionWord{
|
|
Word: word.Word,
|
|
Start: word.Start,
|
|
End: word.End,
|
|
})
|
|
}
|
|
|
|
result.Segments = append(result.Segments, segment)
|
|
}
|
|
|
|
for _, word := range verbose.Words {
|
|
result.Words = append(result.Words, provider.TranscriptionWord{
|
|
Word: word.Word,
|
|
Start: word.Start,
|
|
End: word.End,
|
|
})
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
func usageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) provider.TranscriptionUsage {
|
|
switch usage.Type {
|
|
case "tokens":
|
|
tokens := usage.AsTokens()
|
|
return provider.TranscriptionUsage{
|
|
Type: usage.Type,
|
|
InputTokens: tokens.InputTokens,
|
|
OutputTokens: tokens.OutputTokens,
|
|
TotalTokens: tokens.TotalTokens,
|
|
AudioTokens: tokens.InputTokenDetails.AudioTokens,
|
|
TextTokens: tokens.InputTokenDetails.TextTokens,
|
|
}
|
|
case "duration":
|
|
duration := usage.AsDuration()
|
|
return provider.TranscriptionUsage{
|
|
Type: usage.Type,
|
|
Seconds: duration.Seconds,
|
|
}
|
|
default:
|
|
return provider.TranscriptionUsage{}
|
|
}
|
|
}
|