Add OpenAI-based transcriber implementation

- Introduce `openaiTranscriber`, a `Transcriber` implementation backed by OpenAI's Whisper audio transcription API.
- Define the `Transcriber` interface and its associated types (`Transcription`, `TranscriptionOptions`, plus segment, word, logprob, and usage types).
- Implement transcription options for language hints, prompts, temperature, logprobs, and timestamp granularities.
- Add an `audioFileToWav` utility that converts arbitrary audio files to WAV via `ffmpeg`.
- Parse both plain `json` and `verbose_json` responses into the normalized result; a usage sketch follows.
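For reviewers, a minimal end-to-end sketch of the new API (the module import path, env-var lookup, and input filename are illustrative assumptions, not part of this change):

```go
package main

import (
    "context"
    "fmt"
    "log"
    "os"

    "example.com/go-llm/llm" // hypothetical import path for this package
)

func main() {
    // An empty model argument falls back to whisper-1.
    t := llm.OpenAITranscriber(os.Getenv("OPENAI_API_KEY"), "")

    opts := llm.TranscriptionOptions{
        Language: "en",
        // Word timestamps require verbose_json, which is the default
        // format for whisper-1, so no explicit ResponseFormat is needed.
        TimestampGranularities: []llm.TranscriptionTimestampGranularity{
            llm.TranscriptionTimestampGranularityWord,
        },
    }

    // TranscribeFile shells out to ffmpeg for non-WAV inputs.
    result, err := llm.TranscribeFile(context.Background(), "meeting.mp3", t, opts)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println(result.Text)
    for _, w := range result.Words {
        fmt.Printf("%6.2f-%6.2f %s\n", w.Start, w.End, w.Word)
    }
}
```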
commit 8801ce5945 (parent 9c1b4f7e9f)
Date: 2026-01-25 01:46:29 -05:00
2 changed files with 364 additions and 0 deletions

openai_transcriber.go (new file, +219)

@@ -0,0 +1,219 @@
package llm

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "strings"

    "github.com/openai/openai-go"
    "github.com/openai/openai-go/option"
)

type openaiTranscriber struct {
    key     string
    model   string
    baseUrl string
}

var _ Transcriber = openaiTranscriber{}

// OpenAITranscriber creates a transcriber backed by OpenAI's audio models.
// If model is empty, whisper-1 is used by default.
func OpenAITranscriber(key string, model string) Transcriber {
    if strings.TrimSpace(model) == "" {
        model = "whisper-1"
    }
    return openaiTranscriber{
        key:   key,
        model: model,
    }
}

func (o openaiTranscriber) Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error) {
    if len(wav) == 0 {
        return Transcription{}, fmt.Errorf("wav data is empty")
    }
    format := opts.ResponseFormat
    if format == "" {
        if strings.HasPrefix(o.model, "gpt-4o") {
            format = TranscriptionResponseFormatJSON
        } else {
            format = TranscriptionResponseFormatVerboseJSON
        }
    }
    if format != TranscriptionResponseFormatJSON && format != TranscriptionResponseFormatVerboseJSON {
        return Transcription{}, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
    }
    if len(opts.TimestampGranularities) > 0 && format != TranscriptionResponseFormatVerboseJSON {
        return Transcription{}, fmt.Errorf("timestamp granularities require response_format=verbose_json")
    }
    params := openai.AudioTranscriptionNewParams{
        File:  bytes.NewReader(wav),
        Model: openai.AudioModel(o.model),
    }
    if opts.Language != "" {
        params.Language = openai.String(opts.Language)
    }
    if opts.Prompt != "" {
        params.Prompt = openai.String(opts.Prompt)
    }
    if opts.Temperature != nil {
        params.Temperature = openai.Float(*opts.Temperature)
    }
    params.ResponseFormat = openai.AudioResponseFormat(format)
    if opts.IncludeLogprobs {
        params.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
    }
    for _, granularity := range opts.TimestampGranularities {
        params.TimestampGranularities = append(params.TimestampGranularities, string(granularity))
    }
    clientOptions := []option.RequestOption{
        option.WithAPIKey(o.key),
    }
    if o.baseUrl != "" {
        clientOptions = append(clientOptions, option.WithBaseURL(o.baseUrl))
    }
    client := openai.NewClient(clientOptions...)
    resp, err := client.Audio.Transcriptions.New(ctx, params)
    if err != nil {
        return Transcription{}, fmt.Errorf("openai transcription failed: %w", err)
    }
    return openaiTranscriptionToResult(o.model, resp), nil
}

type openaiVerboseTranscription struct {
    Text     string                 `json:"text"`
    Language string                 `json:"language"`
    Duration float64                `json:"duration"`
    Segments []openaiVerboseSegment `json:"segments"`
    Words    []openaiVerboseWord    `json:"words"`
}

type openaiVerboseSegment struct {
    ID               int                 `json:"id"`
    Start            float64             `json:"start"`
    End              float64             `json:"end"`
    Text             string              `json:"text"`
    Tokens           []int               `json:"tokens"`
    AvgLogprob       *float64            `json:"avg_logprob"`
    CompressionRatio *float64            `json:"compression_ratio"`
    NoSpeechProb     *float64            `json:"no_speech_prob"`
    Words            []openaiVerboseWord `json:"words"`
}

type openaiVerboseWord struct {
    Word  string  `json:"word"`
    Start float64 `json:"start"`
    End   float64 `json:"end"`
}

func openaiTranscriptionToResult(model string, resp *openai.Transcription) Transcription {
    result := Transcription{
        Provider: "openai",
        Model:    model,
    }
    if resp == nil {
        return result
    }
    result.Text = resp.Text
    result.RawJSON = resp.RawJSON()
    for _, logprob := range resp.Logprobs {
        result.Logprobs = append(result.Logprobs, TranscriptionTokenLogprob{
            Token:   logprob.Token,
            Bytes:   logprob.Bytes,
            Logprob: logprob.Logprob,
        })
    }
    if usage := openaiUsageToTranscriptionUsage(resp.Usage); usage.Type != "" {
        result.Usage = usage
    }
    if result.RawJSON == "" {
        return result
    }
    var verbose openaiVerboseTranscription
    if err := json.Unmarshal([]byte(result.RawJSON), &verbose); err != nil {
        return result
    }
    if verbose.Text != "" {
        result.Text = verbose.Text
    }
    result.Language = verbose.Language
    result.DurationSeconds = verbose.Duration
    for _, seg := range verbose.Segments {
        segment := TranscriptionSegment{
            ID:               seg.ID,
            Start:            seg.Start,
            End:              seg.End,
            Text:             seg.Text,
            Tokens:           append([]int(nil), seg.Tokens...),
            AvgLogprob:       seg.AvgLogprob,
            CompressionRatio: seg.CompressionRatio,
            NoSpeechProb:     seg.NoSpeechProb,
        }
        for _, word := range seg.Words {
            segment.Words = append(segment.Words, TranscriptionWord{
                Word:  word.Word,
                Start: word.Start,
                End:   word.End,
            })
        }
        result.Segments = append(result.Segments, segment)
    }
    for _, word := range verbose.Words {
        result.Words = append(result.Words, TranscriptionWord{
            Word:  word.Word,
            Start: word.Start,
            End:   word.End,
        })
    }
    return result
}

func openaiUsageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) TranscriptionUsage {
    switch usage.Type {
    case "tokens":
        tokens := usage.AsTokens()
        return TranscriptionUsage{
            Type:         usage.Type,
            InputTokens:  tokens.InputTokens,
            OutputTokens: tokens.OutputTokens,
            TotalTokens:  tokens.TotalTokens,
            AudioTokens:  tokens.InputTokenDetails.AudioTokens,
            TextTokens:   tokens.InputTokenDetails.TextTokens,
        }
    case "duration":
        duration := usage.AsDuration()
        return TranscriptionUsage{
            Type:    usage.Type,
            Seconds: duration.Seconds,
        }
    default:
        return TranscriptionUsage{}
    }
}
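To make the `verbose_json` contract concrete, a hypothetical unit-test sketch of the parsing path (it would live in package `llm` next to the types above; the sample payload values are invented for illustration):

```go
package llm

import (
    "encoding/json"
    "testing"
)

// TestVerboseTranscriptionUnmarshal checks that a representative
// verbose_json payload maps onto openaiVerboseTranscription.
// The sample values below are invented for illustration.
func TestVerboseTranscriptionUnmarshal(t *testing.T) {
    raw := `{
        "text": "hello world",
        "language": "english",
        "duration": 1.28,
        "segments": [{
            "id": 0, "start": 0, "end": 1.28, "text": "hello world",
            "tokens": [50364, 2425, 1002],
            "avg_logprob": -0.21, "compression_ratio": 0.61, "no_speech_prob": 0.01
        }],
        "words": [
            {"word": "hello", "start": 0, "end": 0.52},
            {"word": "world", "start": 0.52, "end": 1.28}
        ]
    }`

    var v openaiVerboseTranscription
    if err := json.Unmarshal([]byte(raw), &v); err != nil {
        t.Fatal(err)
    }
    if v.Duration != 1.28 || len(v.Segments) != 1 || len(v.Words) != 2 {
        t.Fatalf("unexpected parse result: %+v", v)
    }
    if v.Segments[0].AvgLogprob == nil || *v.Segments[0].AvgLogprob != -0.21 {
        t.Fatalf("avg_logprob not decoded: %+v", v.Segments[0])
    }
}
```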

transcriber.go (new file, +145)

@@ -0,0 +1,145 @@
package llm

import (
    "context"
    "fmt"
    "os"
    "os/exec"
    "path/filepath"
    "strings"
)

// Transcriber abstracts a speech-to-text model implementation.
type Transcriber interface {
    Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}

// TranscriptionResponseFormat controls the output format requested from a transcriber.
type TranscriptionResponseFormat string

const (
    TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
    TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
    TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
    TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
    TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
type TranscriptionTimestampGranularity string

const (
    TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
    TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// TranscriptionOptions configures transcription behavior.
type TranscriptionOptions struct {
    Language               string
    Prompt                 string
    Temperature            *float64
    ResponseFormat         TranscriptionResponseFormat
    TimestampGranularities []TranscriptionTimestampGranularity
    IncludeLogprobs        bool
}

// Transcription captures a normalized transcription result.
type Transcription struct {
    Provider        string
    Model           string
    Text            string
    Language        string
    DurationSeconds float64
    Segments        []TranscriptionSegment
    Words           []TranscriptionWord
    Logprobs        []TranscriptionTokenLogprob
    Usage           TranscriptionUsage
    RawJSON         string
}

// TranscriptionSegment provides a coarse time-sliced transcription segment.
type TranscriptionSegment struct {
    ID               int
    Start            float64
    End              float64
    Text             string
    Tokens           []int
    AvgLogprob       *float64
    CompressionRatio *float64
    NoSpeechProb     *float64
    Words            []TranscriptionWord
}

// TranscriptionWord provides a word-level timestamp.
type TranscriptionWord struct {
    Word       string
    Start      float64
    End        float64
    Confidence *float64
}

// TranscriptionTokenLogprob captures token-level log probability details.
type TranscriptionTokenLogprob struct {
    Token   string
    Bytes   []float64
    Logprob float64
}

// TranscriptionUsage captures token or duration usage details.
type TranscriptionUsage struct {
    Type         string
    InputTokens  int64
    OutputTokens int64
    TotalTokens  int64
    AudioTokens  int64
    TextTokens   int64
    Seconds      float64
}

// TranscribeFile converts an audio file to WAV and transcribes it.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
    if transcriber == nil {
        return Transcription{}, fmt.Errorf("transcriber is nil")
    }
    wav, err := audioFileToWav(ctx, filename)
    if err != nil {
        return Transcription{}, err
    }
    return transcriber.Transcribe(ctx, wav, opts)
}

func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
    if filename == "" {
        return nil, fmt.Errorf("filename is empty")
    }
    if strings.EqualFold(filepath.Ext(filename), ".wav") {
        data, err := os.ReadFile(filename)
        if err != nil {
            return nil, fmt.Errorf("read wav file: %w", err)
        }
        return data, nil
    }
    tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
    if err != nil {
        return nil, fmt.Errorf("create temp wav file: %w", err)
    }
    tempPath := tempFile.Name()
    _ = tempFile.Close()
    defer os.Remove(tempPath)
    cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
    if output, err := cmd.CombinedOutput(); err != nil {
        return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(output)))
    }
    data, err := os.ReadFile(tempPath)
    if err != nil {
        return nil, fmt.Errorf("read converted wav file: %w", err)
    }
    return data, nil
}
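The conversion step is equivalent to running `ffmpeg -hide_banner -loglevel error -y -i INPUT -vn -f wav OUT.wav`, so `ffmpeg` must be on PATH for any non-WAV input. Callers may want to fail fast before recording or uploading; a small pre-flight sketch (the helper name is ours, not part of this commit):

```go
package llm

import (
    "fmt"
    "os/exec"
)

// requireFFmpeg reports an actionable error when ffmpeg is missing,
// so the failure doesn't first surface mid-conversion inside
// audioFileToWav. Illustrative helper, not part of this change.
func requireFFmpeg() error {
    if _, err := exec.LookPath("ffmpeg"); err != nil {
        return fmt.Errorf("ffmpeg not found on PATH (required to convert non-WAV audio): %w", err)
    }
    return nil
}
```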