package llm

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
)

// Transcriber abstracts a speech-to-text model implementation.
//
// It is a type alias for provider.Transcriber, so values of either type are
// interchangeable.
type Transcriber = provider.Transcriber

// TranscriptionResponseFormat controls the output format requested from a transcriber.
//
// It is a type alias for provider.TranscriptionResponseFormat.
type TranscriptionResponseFormat = provider.TranscriptionResponseFormat

// Response format values, re-exported unchanged from the provider package so
// callers do not need to import it directly.
const (
	TranscriptionResponseFormatJSON        = provider.TranscriptionResponseFormatJSON
	TranscriptionResponseFormatVerboseJSON = provider.TranscriptionResponseFormatVerboseJSON
	TranscriptionResponseFormatText        = provider.TranscriptionResponseFormatText
	TranscriptionResponseFormatSRT         = provider.TranscriptionResponseFormatSRT
	TranscriptionResponseFormatVTT         = provider.TranscriptionResponseFormatVTT
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
//
// It is a type alias for provider.TranscriptionTimestampGranularity.
type TranscriptionTimestampGranularity = provider.TranscriptionTimestampGranularity

// Timestamp granularity values, re-exported unchanged from the provider
// package.
const (
	TranscriptionTimestampGranularityWord    = provider.TranscriptionTimestampGranularityWord
	TranscriptionTimestampGranularitySegment = provider.TranscriptionTimestampGranularitySegment
)

// TranscriptionOptions configures transcription behavior.
//
// It is a type alias for provider.TranscriptionOptions.
type TranscriptionOptions = provider.TranscriptionOptions

// Transcription captures a normalized transcription result.
//
// It is a type alias for provider.Transcription.
type Transcription = provider.Transcription

// TranscriptionSegment provides a coarse time-sliced transcription segment.
//
// It is a type alias for provider.TranscriptionSegment.
type TranscriptionSegment = provider.TranscriptionSegment

// TranscriptionWord provides a word-level timestamp.
//
// It is a type alias for provider.TranscriptionWord.
type TranscriptionWord = provider.TranscriptionWord

// TranscriptionTokenLogprob captures token-level log probability details.
//
// It is a type alias for provider.TranscriptionTokenLogprob.
type TranscriptionTokenLogprob = provider.TranscriptionTokenLogprob

// TranscriptionUsage captures token or duration usage details.
//
// It is a type alias for provider.TranscriptionUsage.
type TranscriptionUsage = provider.TranscriptionUsage

// TranscribeFile converts an audio file to WAV (via ffmpeg) and transcribes it.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) { if transcriber == nil { return Transcription{}, fmt.Errorf("transcriber is nil") } wav, err := audioFileToWav(ctx, filename) if err != nil { return Transcription{}, err } return transcriber.Transcribe(ctx, wav, opts) } func audioFileToWav(ctx context.Context, filename string) ([]byte, error) { if filename == "" { return nil, fmt.Errorf("filename is empty") } if strings.EqualFold(filepath.Ext(filename), ".wav") { data, err := os.ReadFile(filename) if err != nil { return nil, fmt.Errorf("read wav file: %w", err) } return data, nil } tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav") if err != nil { return nil, fmt.Errorf("create temp wav file: %w", err) } tempPath := tempFile.Name() _ = tempFile.Close() defer os.Remove(tempPath) cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath) if output, err := cmd.CombinedOutput(); err != nil { return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(output))) } data, err := os.ReadFile(tempPath) if err != nil { return nil, fmt.Errorf("read converted wav file: %w", err) } return data, nil }