Files
go-llm/v2/transcriber.go
Steve Dudenhoeffer 9e288954f2
Some checks failed
CI / Lint (push) Failing after 5m0s
CI / Root Module (push) Failing after 5m3s
CI / V2 Module (push) Successful in 10m48s
Add transcription API to v2 module
Migrate speech-to-text transcription types and OpenAI transcriber
implementation from v1. Types are defined in provider/ to avoid
import cycles and re-exported via type aliases from the root package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 20:24:20 -05:00

101 lines
3.3 KiB
Go

package llm
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
)
// Transcriber abstracts a speech-to-text model implementation.
// It is an alias for provider.Transcriber, re-exported so callers can use the
// type from this package.
type Transcriber = provider.Transcriber
// TranscriptionResponseFormat controls the output format requested from a transcriber.
// It is an alias for provider.TranscriptionResponseFormat.
type TranscriptionResponseFormat = provider.TranscriptionResponseFormat
// Response format values, re-exported unchanged from the provider package.
const (
TranscriptionResponseFormatJSON = provider.TranscriptionResponseFormatJSON
TranscriptionResponseFormatVerboseJSON = provider.TranscriptionResponseFormatVerboseJSON
TranscriptionResponseFormatText = provider.TranscriptionResponseFormatText
TranscriptionResponseFormatSRT = provider.TranscriptionResponseFormatSRT
TranscriptionResponseFormatVTT = provider.TranscriptionResponseFormatVTT
)
// TranscriptionTimestampGranularity defines the requested timestamp detail.
// It is an alias for provider.TranscriptionTimestampGranularity.
type TranscriptionTimestampGranularity = provider.TranscriptionTimestampGranularity
// Timestamp granularity values, re-exported unchanged from the provider package.
const (
TranscriptionTimestampGranularityWord = provider.TranscriptionTimestampGranularityWord
TranscriptionTimestampGranularitySegment = provider.TranscriptionTimestampGranularitySegment
)
// TranscriptionOptions configures transcription behavior.
// It is an alias for provider.TranscriptionOptions.
type TranscriptionOptions = provider.TranscriptionOptions
// Transcription captures a normalized transcription result.
// It is an alias for provider.Transcription.
type Transcription = provider.Transcription
// TranscriptionSegment provides a coarse time-sliced transcription segment.
// It is an alias for provider.TranscriptionSegment.
type TranscriptionSegment = provider.TranscriptionSegment
// TranscriptionWord provides a word-level timestamp.
// It is an alias for provider.TranscriptionWord.
type TranscriptionWord = provider.TranscriptionWord
// TranscriptionTokenLogprob captures token-level log probability details.
// It is an alias for provider.TranscriptionTokenLogprob.
type TranscriptionTokenLogprob = provider.TranscriptionTokenLogprob
// TranscriptionUsage captures token or duration usage details.
// It is an alias for provider.TranscriptionUsage.
type TranscriptionUsage = provider.TranscriptionUsage
// TranscribeFile obtains the audio stored at filename as WAV bytes (files that
// are not already ".wav" are converted with ffmpeg) and hands those bytes,
// together with opts, to transcriber.
//
// It returns an error when transcriber is nil or when the WAV bytes cannot be
// produced; otherwise it returns exactly what transcriber.Transcribe returns.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
	if transcriber == nil {
		return Transcription{}, fmt.Errorf("transcriber is nil")
	}

	audio, convErr := audioFileToWav(ctx, filename)
	if convErr == nil {
		return transcriber.Transcribe(ctx, audio, opts)
	}

	// Conversion errors are passed through untouched.
	return Transcription{}, convErr
}
// audioFileToWav returns the contents of filename as WAV bytes.
//
// A file whose extension is ".wav" (compared case-insensitively) is read and
// returned unchanged; the extension is trusted and the data is not inspected.
// Any other file is converted by running the "ffmpeg" executable, which writes
// into a temporary file that is removed before this function returns.
//
// An error is returned when filename is empty, when the source or temporary
// file cannot be created, closed or read, or when ffmpeg fails. If ctx is
// already done when ffmpeg fails, the returned error wraps ctx.Err() so that
// callers can detect cancellation or timeout with errors.Is.
func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
	if filename == "" {
		return nil, fmt.Errorf("filename is empty")
	}

	// Fast path: the file already claims to be WAV, so no conversion is run.
	if strings.EqualFold(filepath.Ext(filename), ".wav") {
		data, err := os.ReadFile(filename)
		if err != nil {
			return nil, fmt.Errorf("read wav file: %w", err)
		}
		return data, nil
	}

	// Only the path is needed: ffmpeg opens the output file itself.
	tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
	if err != nil {
		return nil, fmt.Errorf("create temp wav file: %w", err)
	}
	tempPath := tempFile.Name()
	// Registered immediately so every return path below cleans up. Removal is
	// best-effort; a failure here must not mask the real result.
	defer func() { _ = os.Remove(tempPath) }()

	if err := tempFile.Close(); err != nil {
		return nil, fmt.Errorf("close temp wav file: %w", err)
	}

	// -y is required because the temporary output file already exists;
	// -vn drops any video stream; -f wav forces the WAV container.
	cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
	if output, err := cmd.CombinedOutput(); err != nil {
		trimmed := strings.TrimSpace(string(output))
		// A process killed because ctx ended reports only something like
		// "signal: killed"; surface the context error so it stays detectable.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, fmt.Errorf("ffmpeg convert canceled: %w (ffmpeg error: %v, output: %s)", ctxErr, err, trimmed)
		}
		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, trimmed)
	}

	data, err := os.ReadFile(tempPath)
	if err != nil {
		return nil, fmt.Errorf("read converted wav file: %w", err)
	}
	return data, nil
}