Files
go-llm/transcriber.go
Steve Dudenhoeffer 8801ce5945 Add OpenAI-based transcriber implementation
- Introduce `openaiTranscriber` for integrating OpenAI's Whisper audio transcription capabilities.
- Define `Transcriber` interface and associated types (`Transcription`, `TranscriptionOptions`, segments, and words).
- Implement transcription logic supporting features like languages, prompts, temperature, and timestamp granularities.
- Add `audioFileToWav` utility using `ffmpeg` for audio file conversion to WAV format.
- Ensure response parsing for structured and verbose JSON outputs.
2026-01-25 01:46:29 -05:00

146 lines
4.1 KiB
Go

package llm
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
)
// Transcriber is the speech-to-text abstraction of this package: anything
// that can turn audio bytes into a Transcription satisfies it.
type Transcriber interface {
	// Transcribe produces a Transcription for the audio held in wav,
	// configured by opts. TranscribeFile hands this method the bytes
	// returned by its WAV conversion step and passes ctx through unchanged.
	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}
// TranscriptionResponseFormat names the output format that a caller asks a
// transcriber to produce.
type TranscriptionResponseFormat string

// Predefined TranscriptionResponseFormat values. The string behind each
// constant is the exact value carried in TranscriptionOptions.ResponseFormat.
const (
	TranscriptionResponseFormatJSON        = TranscriptionResponseFormat("json")
	TranscriptionResponseFormatVerboseJSON = TranscriptionResponseFormat("verbose_json")
	TranscriptionResponseFormatText        = TranscriptionResponseFormat("text")
	TranscriptionResponseFormatSRT         = TranscriptionResponseFormat("srt")
	TranscriptionResponseFormatVTT         = TranscriptionResponseFormat("vtt")
)
// TranscriptionTimestampGranularity names a level of timestamp detail that a
// caller can request from a transcriber.
type TranscriptionTimestampGranularity string

// Predefined TranscriptionTimestampGranularity values, as listed in
// TranscriptionOptions.TimestampGranularities.
const (
	TranscriptionTimestampGranularityWord    = TranscriptionTimestampGranularity("word")
	TranscriptionTimestampGranularitySegment = TranscriptionTimestampGranularity("segment")
)
// TranscriptionOptions bundles the settings a caller passes to
// Transcriber.Transcribe. How each field is honored is decided by the
// Transcriber implementation, which this declaration does not show.
type TranscriptionOptions struct {
	// Language and Prompt are text settings sent along with the audio.
	// NOTE(review): presumably a spoken-language hint and guiding text for
	// the model; the expected language-code format is not visible here —
	// confirm against the implementation.
	Language, Prompt string
	// Temperature is the sampling temperature.
	// NOTE(review): presumably a pointer so that nil ("not set") can be
	// told apart from an explicit 0 — confirm.
	Temperature *float64
	// ResponseFormat selects the requested output format.
	ResponseFormat TranscriptionResponseFormat
	// TimestampGranularities lists the timestamp detail levels requested.
	TimestampGranularities []TranscriptionTimestampGranularity
	// IncludeLogprobs asks for token log probabilities in the result.
	IncludeLogprobs bool
}
// Transcription is the normalized result returned by a Transcriber.
type Transcription struct {
	// Provider and Model identify what produced the result.
	Provider, Model string
	// Text is the transcribed text.
	Text string
	// Language is the language associated with the transcription.
	// NOTE(review): whether this is detected or echoed from the request is
	// not visible here — confirm.
	Language string
	// DurationSeconds is a duration expressed in seconds.
	// NOTE(review): presumably the length of the input audio — confirm.
	DurationSeconds float64
	// Segments holds the segment-level pieces of the transcript.
	Segments []TranscriptionSegment
	// Words holds word-level timestamps.
	Words []TranscriptionWord
	// Logprobs holds token-level log probability details.
	// NOTE(review): presumably populated only when
	// TranscriptionOptions.IncludeLogprobs is set — confirm.
	Logprobs []TranscriptionTokenLogprob
	// Usage reports token or duration usage for the request.
	Usage TranscriptionUsage
	// RawJSON carries JSON text.
	// NOTE(review): presumably the provider's unparsed response body —
	// confirm.
	RawJSON string
}
// TranscriptionSegment is one coarse, time-bounded slice of a transcription.
type TranscriptionSegment struct {
	// ID is the segment's integer identifier.
	ID int
	// Start and End bound the segment in time.
	// NOTE(review): presumably offsets in seconds from the start of the
	// audio — confirm.
	Start, End float64
	// Text is the text of this segment.
	Text string
	// Tokens lists integer token values for the segment.
	// NOTE(review): presumably model token IDs — confirm.
	Tokens []int
	// AvgLogprob, CompressionRatio and NoSpeechProb are per-segment
	// statistics.
	// NOTE(review): presumably nil when the provider does not report
	// them — confirm.
	AvgLogprob, CompressionRatio, NoSpeechProb *float64
	// Words holds the word-level timestamps attached to this segment.
	Words []TranscriptionWord
}
// TranscriptionWord is a single word together with its timestamps.
type TranscriptionWord struct {
	// Word is the text of the word.
	Word string
	// Start and End bound the word in time.
	// NOTE(review): presumably offsets in seconds from the start of the
	// audio — confirm.
	Start, End float64
	// Confidence is a confidence value for the word.
	// NOTE(review): presumably nil when no value is available; the scale
	// is not visible here — confirm.
	Confidence *float64
}
// TranscriptionTokenLogprob describes the log probability of one token.
type TranscriptionTokenLogprob struct {
	// Token is the token text.
	Token string
	// Bytes holds numeric values associated with the token.
	// NOTE(review): presumably the token's byte values, kept as float64 —
	// confirm why this is not []byte or []int.
	Bytes []float64
	// Logprob is the log probability of the token.
	Logprob float64
}
// TranscriptionUsage reports how much a transcription request consumed,
// expressed as token counts or as a duration.
type TranscriptionUsage struct {
	// Type labels the kind of usage being reported.
	// NOTE(review): presumably distinguishes token-based from
	// duration-based accounting — confirm the possible values.
	Type string
	// InputTokens, OutputTokens and TotalTokens are the overall token
	// counters.
	InputTokens, OutputTokens, TotalTokens int64
	// AudioTokens and TextTokens are token counters split by modality.
	// NOTE(review): presumably a breakdown of the input tokens — confirm.
	AudioTokens, TextTokens int64
	// Seconds is the usage expressed as a duration in seconds.
	Seconds float64
}
// TranscribeFile loads the audio file at filename as WAV bytes (converting it
// first when it is not already a .wav file) and hands those bytes to
// transcriber together with ctx and opts.
//
// It returns the zero Transcription and an error when transcriber is nil or
// when the audio cannot be loaded or converted; otherwise it returns exactly
// what transcriber.Transcribe returns.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
	var none Transcription

	// Only a nil interface value is caught here; an interface wrapping a
	// typed nil pointer compares non-nil and is passed through.
	if transcriber == nil {
		return none, fmt.Errorf("transcriber is nil")
	}

	audio, convErr := audioFileToWav(ctx, filename)
	if convErr != nil {
		// The conversion error already names the failing step, so it is
		// returned as-is.
		return none, convErr
	}

	return transcriber.Transcribe(ctx, audio, opts)
}
// audioFileToWav returns the contents of filename as WAV bytes.
//
// A file whose extension is ".wav" (compared case-insensitively) is read and
// returned untouched; its contents are not inspected. Any other file is
// converted by running the external "ffmpeg" executable with video streams
// dropped (-vn) and the output format forced to WAV, writing to a temporary
// file that is removed before the function returns.
//
// An empty filename, a failed read, a failure to create the temporary file,
// or a failed ffmpeg run each yield nil data and a non-nil error. When ffmpeg
// fails and ctx is already done, the error wraps ctx.Err() so callers can
// detect cancellation or timeout with errors.Is.
func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
	if filename == "" {
		return nil, fmt.Errorf("filename is empty")
	}

	// Already WAV by extension: no conversion, ffmpeg is not required.
	if strings.EqualFold(filepath.Ext(filename), ".wav") {
		data, err := os.ReadFile(filename)
		if err != nil {
			return nil, fmt.Errorf("read wav file: %w", err)
		}
		return data, nil
	}

	// Reserve a unique output path; ffmpeg overwrites the empty placeholder
	// because of -y.
	tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
	if err != nil {
		return nil, fmt.Errorf("create temp wav file: %w", err)
	}
	tempPath := tempFile.Name()
	// Register cleanup right away so every later return path removes the
	// file. Removal is best-effort.
	defer os.Remove(tempPath)
	// Only the path is needed from here on. A Close failure on the empty
	// placeholder would at worst surface as an ffmpeg error below, so it is
	// deliberately ignored.
	_ = tempFile.Close()

	cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
	if output, err := cmd.CombinedOutput(); err != nil {
		detail := strings.TrimSpace(string(output))
		if ctxErr := ctx.Err(); ctxErr != nil {
			// A process killed because ctx ended is reported by os/exec as
			// a plain exit error (e.g. "signal: killed"). Wrap the context
			// error instead so callers can match context.Canceled or
			// context.DeadlineExceeded.
			return nil, fmt.Errorf("ffmpeg convert failed: %w (%v, output: %s)", ctxErr, err, detail)
		}
		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, detail)
	}

	data, err := os.ReadFile(tempPath)
	if err != nil {
		return nil, fmt.Errorf("read converted wav file: %w", err)
	}
	return data, nil
}