From 9e288954f227eb2940a1a57a0be4f8e98def0450 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Sat, 7 Feb 2026 20:24:20 -0500
Subject: [PATCH] Add transcription API to v2 module

Migrate speech-to-text transcription types and OpenAI transcriber
implementation from v1. Types are defined in provider/ to avoid import
cycles and re-exported via type aliases from the root package.

Co-Authored-By: Claude Opus 4.6
---
 v2/openai/transcriber.go     | 230 +++++++++++++++++++++++++++++++++++
 v2/provider/transcription.go |  90 ++++++++++++++
 v2/transcriber.go            | 100 +++++++++++++++
 3 files changed, 420 insertions(+)
 create mode 100644 v2/openai/transcriber.go
 create mode 100644 v2/provider/transcription.go
 create mode 100644 v2/transcriber.go

diff --git a/v2/openai/transcriber.go b/v2/openai/transcriber.go
new file mode 100644
index 0000000..67e2496
--- /dev/null
+++ b/v2/openai/transcriber.go
@@ -0,0 +1,230 @@
+package openai
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/openai/openai-go"
+	"github.com/openai/openai-go/option"
+
+	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
+)
+
+// Transcriber implements the provider.Transcriber interface using OpenAI's audio models.
+type Transcriber struct {
+	key     string // API key, applied via option.WithAPIKey on each request
+	model   string // audio model name, e.g. "whisper-1" or "gpt-4o-transcribe"
+	baseURL string // optional API base URL override; empty means SDK default
+}
+
+// Compile-time assertion that *Transcriber satisfies provider.Transcriber.
+var _ provider.Transcriber = (*Transcriber)(nil)
+
+// NewTranscriber creates a transcriber backed by OpenAI's audio models.
+// If model is empty, "whisper-1" is used by default.
+func NewTranscriber(key string, model string) *Transcriber {
+	// A whitespace-only model counts as empty; otherwise the original
+	// (untrimmed) model string is kept as-is.
+	if strings.TrimSpace(model) == "" {
+		model = "whisper-1"
+	}
+	return &Transcriber{
+		key:   key,
+		model: model,
+	}
+}
+
+// NewTranscriberWithBaseURL creates a transcriber with a custom API base URL.
+func NewTranscriberWithBaseURL(key, model, baseURL string) *Transcriber {
+	t := NewTranscriber(key, model)
+	t.baseURL = baseURL
+	return t
+}
+
+// Transcribe performs speech-to-text transcription of WAV audio data.
+// Transcribe performs speech-to-text transcription of the given WAV bytes.
+// Only json and verbose_json response formats are supported, because the
+// other formats (text/srt/vtt) cannot be parsed into a structured result.
+func (t *Transcriber) Transcribe(ctx context.Context, wav []byte, opts provider.TranscriptionOptions) (provider.Transcription, error) {
+	if len(wav) == 0 {
+		return provider.Transcription{}, fmt.Errorf("wav data is empty")
+	}
+
+	// Default the response format per model family: the gpt-4o audio models
+	// only support "json", while whisper models support "verbose_json"
+	// (which carries language, duration, segments and word timestamps).
+	format := opts.ResponseFormat
+	if format == "" {
+		if strings.HasPrefix(t.model, "gpt-4o") {
+			format = provider.TranscriptionResponseFormatJSON
+		} else {
+			format = provider.TranscriptionResponseFormatVerboseJSON
+		}
+	}
+
+	if format != provider.TranscriptionResponseFormatJSON && format != provider.TranscriptionResponseFormatVerboseJSON {
+		return provider.Transcription{}, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
+	}
+
+	if len(opts.TimestampGranularities) > 0 && format != provider.TranscriptionResponseFormatVerboseJSON {
+		return provider.Transcription{}, fmt.Errorf("timestamp granularities require response_format=verbose_json")
+	}
+
+	// The OpenAI API only honors include[]=logprobs with response_format=json;
+	// reject the combination up front rather than letting the API error later.
+	if opts.IncludeLogprobs && format != provider.TranscriptionResponseFormatJSON {
+		return provider.Transcription{}, fmt.Errorf("logprobs require response_format=json")
+	}
+
+	params := openai.AudioTranscriptionNewParams{
+		File:  openai.File(bytes.NewReader(wav), "audio.wav", "audio/wav"),
+		Model: openai.AudioModel(t.model),
+	}
+
+	if opts.Language != "" {
+		params.Language = openai.String(opts.Language)
+	}
+	if opts.Prompt != "" {
+		params.Prompt = openai.String(opts.Prompt)
+	}
+	if opts.Temperature != nil {
+		params.Temperature = openai.Float(*opts.Temperature)
+	}
+
+	params.ResponseFormat = openai.AudioResponseFormat(format)
+
+	if opts.IncludeLogprobs {
+		params.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
+	}
+
+	// Ranging over an empty slice is a no-op, so no length guard is needed.
+	for _, granularity := range opts.TimestampGranularities {
+		params.TimestampGranularities = append(params.TimestampGranularities, string(granularity))
+	}
+
+	clientOptions := []option.RequestOption{
+		option.WithAPIKey(t.key),
+	}
+	if t.baseURL != "" {
+		clientOptions = append(clientOptions, option.WithBaseURL(t.baseURL))
+	}
+
+	client := openai.NewClient(clientOptions...)
+	resp, err := client.Audio.Transcriptions.New(ctx, params)
+	if err != nil {
+		return provider.Transcription{}, fmt.Errorf("openai transcription failed: %w", err)
+	}
+
+	return transcriptionToResult(t.model, resp), nil
+}
+
+// verboseTranscription mirrors the fields of OpenAI's verbose_json
+// transcription payload that this package consumes.
+type verboseTranscription struct {
+	Text     string           `json:"text"`
+	Language string           `json:"language"`
+	Duration float64          `json:"duration"`
+	Segments []verboseSegment `json:"segments"`
+	Words    []verboseWord    `json:"words"`
+}
+
+// verboseSegment is a single time-sliced segment of a verbose_json payload.
+type verboseSegment struct {
+	ID               int           `json:"id"`
+	Start            float64       `json:"start"`
+	End              float64       `json:"end"`
+	Text             string        `json:"text"`
+	Tokens           []int         `json:"tokens"`
+	AvgLogprob       *float64      `json:"avg_logprob"`
+	CompressionRatio *float64      `json:"compression_ratio"`
+	NoSpeechProb     *float64      `json:"no_speech_prob"`
+	Words            []verboseWord `json:"words"`
+}
+
+// verboseWord is a word-level timestamp entry of a verbose_json payload.
+type verboseWord struct {
+	Word  string  `json:"word"`
+	Start float64 `json:"start"`
+	End   float64 `json:"end"`
+}
+
+// transcriptionToResult normalizes an SDK transcription response into the
+// provider-agnostic Transcription type. When the raw payload is verbose_json
+// it additionally extracts language, duration, segments and word timestamps.
+func transcriptionToResult(model string, resp *openai.Transcription) provider.Transcription {
+	result := provider.Transcription{
+		Provider: "openai",
+		Model:    model,
+	}
+	if resp == nil {
+		return result
+	}
+
+	result.Text = resp.Text
+	result.RawJSON = resp.RawJSON()
+
+	for _, logprob := range resp.Logprobs {
+		result.Logprobs = append(result.Logprobs, provider.TranscriptionTokenLogprob{
+			Token:   logprob.Token,
+			Bytes:   logprob.Bytes,
+			Logprob: logprob.Logprob,
+		})
+	}
+
+	if usage := usageToTranscriptionUsage(resp.Usage); usage.Type != "" {
+		result.Usage = usage
+	}
+
+	if result.RawJSON == "" {
+		return result
+	}
+
+	// Best effort: a plain "json" payload simply lacks the verbose fields,
+	// and a malformed payload leaves the SDK-provided Text/Logprobs intact.
+	var verbose verboseTranscription
+	if err := json.Unmarshal([]byte(result.RawJSON), &verbose); err != nil {
+		return result
+	}
+
+	if verbose.Text != "" {
+		result.Text = verbose.Text
+	}
+	result.Language = verbose.Language
+	result.DurationSeconds = verbose.Duration
+
+	for _, seg := range verbose.Segments {
+		segment := provider.TranscriptionSegment{
+			ID:    seg.ID,
+			Start: seg.Start,
+			End:   seg.End,
+			Text:  seg.Text,
+			// Clone so the result does not alias the decoder's slice.
+			Tokens:           append([]int(nil), seg.Tokens...),
+			AvgLogprob:       seg.AvgLogprob,
+			CompressionRatio: seg.CompressionRatio,
+			NoSpeechProb:     seg.NoSpeechProb,
+		}
+
+		for _, word := range seg.Words {
+			segment.Words = append(segment.Words, provider.TranscriptionWord{
+				Word:  word.Word,
+				Start: word.Start,
+				End:   word.End,
+			})
+		}
+
+		result.Segments = append(result.Segments, segment)
+	}
+
+	for _, word := range verbose.Words {
+		result.Words = append(result.Words, provider.TranscriptionWord{
+			Word:  word.Word,
+			Start: word.Start,
+			End:   word.End,
+		})
+	}
+
+	return result
+}
+
+// usageToTranscriptionUsage flattens the SDK usage union into the provider
+// usage struct. An unrecognized union type yields the zero value (Type == "").
+func usageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) provider.TranscriptionUsage {
+	switch usage.Type {
+	case "tokens":
+		tokens := usage.AsTokens()
+		return provider.TranscriptionUsage{
+			Type:         usage.Type,
+			InputTokens:  tokens.InputTokens,
+			OutputTokens: tokens.OutputTokens,
+			TotalTokens:  tokens.TotalTokens,
+			AudioTokens:  tokens.InputTokenDetails.AudioTokens,
+			TextTokens:   tokens.InputTokenDetails.TextTokens,
+		}
+	case "duration":
+		duration := usage.AsDuration()
+		return provider.TranscriptionUsage{
+			Type:    usage.Type,
+			Seconds: duration.Seconds,
+		}
+	default:
+		return provider.TranscriptionUsage{}
+	}
+}
diff --git a/v2/provider/transcription.go b/v2/provider/transcription.go
new file mode 100644
index 0000000..4c09019
--- /dev/null
+++ b/v2/provider/transcription.go
@@ -0,0 +1,90 @@
+package provider
+
+import "context"
+
+// Transcriber abstracts a speech-to-text model implementation.
+// Implementations receive complete WAV audio bytes and return a
+// normalized Transcription.
+type Transcriber interface {
+	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
+}
+
+// TranscriptionResponseFormat controls the output format requested from a transcriber.
+type TranscriptionResponseFormat string
+
+const (
+	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"         // structured text-only result
+	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json" // adds language, duration, segments, words
+	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"         // plain text (no structure)
+	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"          // SubRip subtitle format
+	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"          // WebVTT subtitle format
+)
+
+// TranscriptionTimestampGranularity defines the requested timestamp detail.
+type TranscriptionTimestampGranularity string
+
+const (
+	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
+	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
+)
+
+// TranscriptionOptions configures transcription behavior.
+// The zero value requests provider defaults for every setting.
+type TranscriptionOptions struct {
+	Language               string                              // ISO-639-1 hint for the spoken language; empty = auto-detect
+	Prompt                 string                              // optional text to guide style/spelling
+	Temperature            *float64                            // sampling temperature; nil = provider default
+	ResponseFormat         TranscriptionResponseFormat         // empty = provider picks per model
+	TimestampGranularities []TranscriptionTimestampGranularity // word and/or segment timestamps
+	IncludeLogprobs        bool                                // request token log probabilities
+}
+
+// Transcription captures a normalized transcription result.
+type Transcription struct {
+	Provider        string  // provider name, e.g. "openai"
+	Model           string  // model that produced the result
+	Text            string  // full transcript text
+	Language        string  // detected language (verbose formats only)
+	DurationSeconds float64 // audio duration (verbose formats only)
+	Segments        []TranscriptionSegment
+	Words           []TranscriptionWord
+	Logprobs        []TranscriptionTokenLogprob
+	Usage           TranscriptionUsage
+	RawJSON         string // raw provider payload for callers needing extra fields
+}
+
+// TranscriptionSegment provides a coarse time-sliced transcription segment.
+type TranscriptionSegment struct {
+	ID               int
+	Start            float64 // segment start, seconds from audio start
+	End              float64 // segment end, seconds from audio start
+	Text             string
+	Tokens           []int
+	AvgLogprob       *float64 // nil when the provider omits the field
+	CompressionRatio *float64 // nil when the provider omits the field
+	NoSpeechProb     *float64 // nil when the provider omits the field
+	Words            []TranscriptionWord
+}
+
+// TranscriptionWord provides a word-level timestamp.
+type TranscriptionWord struct {
+	Word       string
+	Start      float64  // word start, seconds from audio start
+	End        float64  // word end, seconds from audio start
+	Confidence *float64 // nil when the provider omits the field
+}
+
+// TranscriptionTokenLogprob captures token-level log probability details.
+type TranscriptionTokenLogprob struct {
+	Token   string    // token text as returned by the provider
+	Bytes   []float64 // raw byte values of the token
+	Logprob float64   // log probability of the token
+}
+
+// TranscriptionUsage captures token or duration usage details.
+type TranscriptionUsage struct {
+	Type         string // "tokens" or "duration"
+	InputTokens  int64
+	OutputTokens int64
+	TotalTokens  int64
+	AudioTokens  int64
+	TextTokens   int64
+	Seconds      float64 // populated only for duration-based usage
+}
diff --git a/v2/transcriber.go b/v2/transcriber.go
new file mode 100644
index 0000000..579dcb1
--- /dev/null
+++ b/v2/transcriber.go
@@ -0,0 +1,100 @@
+package llm
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
+)
+
+// The aliases below re-export the provider transcription API from the root
+// package so callers need not import provider directly.
+
+// Transcriber abstracts a speech-to-text model implementation.
+type Transcriber = provider.Transcriber
+
+// TranscriptionResponseFormat controls the output format requested from a transcriber.
+type TranscriptionResponseFormat = provider.TranscriptionResponseFormat
+
+const (
+	TranscriptionResponseFormatJSON        = provider.TranscriptionResponseFormatJSON
+	TranscriptionResponseFormatVerboseJSON = provider.TranscriptionResponseFormatVerboseJSON
+	TranscriptionResponseFormatText        = provider.TranscriptionResponseFormatText
+	TranscriptionResponseFormatSRT         = provider.TranscriptionResponseFormatSRT
+	TranscriptionResponseFormatVTT         = provider.TranscriptionResponseFormatVTT
+)
+
+// TranscriptionTimestampGranularity defines the requested timestamp detail.
+type TranscriptionTimestampGranularity = provider.TranscriptionTimestampGranularity
+
+const (
+	TranscriptionTimestampGranularityWord    = provider.TranscriptionTimestampGranularityWord
+	TranscriptionTimestampGranularitySegment = provider.TranscriptionTimestampGranularitySegment
+)
+
+// TranscriptionOptions configures transcription behavior.
+type TranscriptionOptions = provider.TranscriptionOptions
+
+// Transcription captures a normalized transcription result.
+type Transcription = provider.Transcription
+
+// TranscriptionSegment provides a coarse time-sliced transcription segment.
+type TranscriptionSegment = provider.TranscriptionSegment
+
+// TranscriptionWord provides a word-level timestamp.
+type TranscriptionWord = provider.TranscriptionWord
+
+// TranscriptionTokenLogprob captures token-level log probability details.
+type TranscriptionTokenLogprob = provider.TranscriptionTokenLogprob
+
+// TranscriptionUsage captures token or duration usage details.
+type TranscriptionUsage = provider.TranscriptionUsage
+
+// TranscribeFile converts an audio file to WAV (via ffmpeg) and transcribes it.
+func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
+	if transcriber == nil {
+		return Transcription{}, fmt.Errorf("transcriber is nil")
+	}
+	wav, err := audioFileToWav(ctx, filename)
+	if err != nil {
+		return Transcription{}, err
+	}
+	return transcriber.Transcribe(ctx, wav, opts)
+}
+
+// audioFileToWav returns the contents of filename as WAV bytes. Files whose
+// extension is already .wav are read back verbatim; anything else is
+// converted through an ffmpeg subprocess (which must be on PATH).
+func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
+	if filename == "" {
+		return nil, fmt.Errorf("filename is empty")
+	}
+
+	// Already WAV? Hand the bytes back without invoking ffmpeg.
+	if strings.EqualFold(filepath.Ext(filename), ".wav") {
+		data, err := os.ReadFile(filename)
+		if err != nil {
+			return nil, fmt.Errorf("read wav file: %w", err)
+		}
+		return data, nil
+	}
+
+	out, err := os.CreateTemp("", "go-llm-audio-*.wav")
+	if err != nil {
+		return nil, fmt.Errorf("create temp wav file: %w", err)
+	}
+	outPath := out.Name()
+	_ = out.Close() // ffmpeg -y rewrites the file; only the path is needed
+	defer os.Remove(outPath)
+
+	args := []string{"-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", outPath}
+	cmd := exec.CommandContext(ctx, "ffmpeg", args...)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(output)))
+	}
+
+	data, err := os.ReadFile(outPath)
+	if err != nil {
+		return nil, fmt.Errorf("read converted wav file: %w", err)
+	}
+
+	return data, nil
+}