- Introduce `openaiTranscriber` for integrating OpenAI's Whisper audio transcription capabilities. - Define `Transcriber` interface and associated types (`Transcription`, `TranscriptionOptions`, segments, and words). - Implement transcription logic supporting features like languages, prompts, temperature, and timestamp granularities. - Add `audioFileToWav` utility using `ffmpeg` for audio file conversion to WAV format. - Ensure response parsing for structured and verbose JSON outputs.
146 lines
4.1 KiB
Go
146 lines
4.1 KiB
Go
package llm
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// Transcriber abstracts a speech-to-text model implementation.
|
|
type Transcriber interface {
|
|
Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
|
|
}
|
|
|
|
// TranscriptionResponseFormat names the output format a caller asks a
// transcriber to produce. The underlying string is the format identifier.
type TranscriptionResponseFormat string

// Response formats known to this package.
const (
	// TranscriptionResponseFormatJSON selects the "json" format.
	TranscriptionResponseFormatJSON TranscriptionResponseFormat = "json"
	// TranscriptionResponseFormatVerboseJSON selects the "verbose_json" format.
	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
	// TranscriptionResponseFormatText selects the "text" format.
	TranscriptionResponseFormatText TranscriptionResponseFormat = "text"
	// TranscriptionResponseFormatSRT selects the "srt" format.
	TranscriptionResponseFormatSRT TranscriptionResponseFormat = "srt"
	// TranscriptionResponseFormatVTT selects the "vtt" format.
	TranscriptionResponseFormatVTT TranscriptionResponseFormat = "vtt"
)
|
|
|
|
// TranscriptionTimestampGranularity names a level of timestamp detail that
// can be requested for a transcription. The underlying string is the
// granularity identifier.
type TranscriptionTimestampGranularity string

// Timestamp granularities known to this package.
const (
	// TranscriptionTimestampGranularityWord requests "word" timestamps.
	TranscriptionTimestampGranularityWord TranscriptionTimestampGranularity = "word"
	// TranscriptionTimestampGranularitySegment requests "segment" timestamps.
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)
|
|
|
|
// TranscriptionOptions configures transcription behavior.
|
|
type TranscriptionOptions struct {
|
|
Language string
|
|
Prompt string
|
|
Temperature *float64
|
|
ResponseFormat TranscriptionResponseFormat
|
|
TimestampGranularities []TranscriptionTimestampGranularity
|
|
IncludeLogprobs bool
|
|
}
|
|
|
|
// Transcription captures a normalized transcription result.
|
|
type Transcription struct {
|
|
Provider string
|
|
Model string
|
|
Text string
|
|
Language string
|
|
DurationSeconds float64
|
|
Segments []TranscriptionSegment
|
|
Words []TranscriptionWord
|
|
Logprobs []TranscriptionTokenLogprob
|
|
Usage TranscriptionUsage
|
|
RawJSON string
|
|
}
|
|
|
|
// TranscriptionSegment provides a coarse time-sliced transcription segment.
|
|
type TranscriptionSegment struct {
|
|
ID int
|
|
Start float64
|
|
End float64
|
|
Text string
|
|
Tokens []int
|
|
AvgLogprob *float64
|
|
CompressionRatio *float64
|
|
NoSpeechProb *float64
|
|
Words []TranscriptionWord
|
|
}
|
|
|
|
// TranscriptionWord is a single word together with its timestamps.
type TranscriptionWord struct {
	// Word is the word text.
	Word string

	// Start and End bound the word in time.
	// NOTE(review): presumably offsets in seconds; the unit is not stated here.
	Start, End float64

	// Confidence is optional; presumably nil means no confidence value was
	// reported (confirm against the implementation).
	Confidence *float64
}
|
|
|
|
// TranscriptionTokenLogprob holds the log probability details of one token.
type TranscriptionTokenLogprob struct {
	// Token is the token text.
	Token string

	// Bytes holds numeric values associated with the token.
	// NOTE(review): presumably the token's byte values; the float64 element
	// type should be confirmed against the provider response.
	Bytes []float64

	// Logprob is the token's log probability.
	Logprob float64
}
|
|
|
|
// TranscriptionUsage reports what a transcription consumed, expressed as
// token counts or as a duration.
type TranscriptionUsage struct {
	// Type presumably tells which kind of usage is reported, token-based or
	// duration-based (confirm against the implementation).
	Type string

	// Token counters.
	InputTokens, OutputTokens, TotalTokens, AudioTokens, TextTokens int64

	// Seconds is the duration-based usage figure, in seconds.
	Seconds float64
}
|
|
|
|
// TranscribeFile converts an audio file to WAV and transcribes it.
|
|
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
|
|
if transcriber == nil {
|
|
return Transcription{}, fmt.Errorf("transcriber is nil")
|
|
}
|
|
|
|
wav, err := audioFileToWav(ctx, filename)
|
|
if err != nil {
|
|
return Transcription{}, err
|
|
}
|
|
|
|
return transcriber.Transcribe(ctx, wav, opts)
|
|
}
|
|
|
|
// audioFileToWav returns the audio stored at filename as WAV bytes.
//
// A file whose extension is ".wav" (compared case-insensitively) is read and
// returned unchanged; the extension is trusted and the contents are not
// inspected. Any other file is converted by running the external "ffmpeg"
// program, which writes into a temporary file that is removed before the
// function returns. ctx bounds the ffmpeg run only; the plain file reads are
// not interruptible.
//
// Errors are returned for an empty filename, an unreadable file, a failure to
// create the temporary file, and a failed conversion. When the conversion
// fails and ctx is already done, the returned error wraps ctx.Err(), so
// callers can test it with errors.Is against context.Canceled or
// context.DeadlineExceeded.
func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
	if filename == "" {
		return nil, fmt.Errorf("filename is empty")
	}

	// Already WAV by extension: hand the bytes back untouched.
	if strings.EqualFold(filepath.Ext(filename), ".wav") {
		data, err := os.ReadFile(filename)
		if err != nil {
			return nil, fmt.Errorf("read wav file: %w", err)
		}
		return data, nil
	}

	// ffmpeg writes to a real file rather than a pipe; the temp file only
	// reserves a unique path for it.
	tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
	if err != nil {
		return nil, fmt.Errorf("create temp wav file: %w", err)
	}
	tempPath := tempFile.Name()
	// Register cleanup right after creation so every later return path
	// removes the file. Removal is best-effort.
	defer os.Remove(tempPath)
	// The handle itself is not needed: ffmpeg overwrites the path (-y), so a
	// failure to close the empty placeholder does not affect the result.
	_ = tempFile.Close()

	cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
	if output, err := cmd.CombinedOutput(); err != nil {
		detail := strings.TrimSpace(string(output))
		// A process killed because ctx ended typically reports only an exit
		// status such as "signal: killed". Wrap the context error instead so
		// cancellation and timeouts stay detectable by callers.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, fmt.Errorf("ffmpeg convert failed: %w (ffmpeg: %v, output: %s)", ctxErr, err, detail)
		}
		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, detail)
	}

	data, err := os.ReadFile(tempPath)
	if err != nil {
		return nil, fmt.Errorf("read converted wav file: %w", err)
	}

	return data, nil
}
|