package llm

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

// Transcriber abstracts a speech-to-text model implementation.
type Transcriber interface {
	// Transcribe turns the audio bytes in wav into a Transcription, configured
	// by opts. TranscribeFile in this file calls it with either the raw
	// contents of a ".wav" file or the WAV output of an ffmpeg conversion.
	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}

// TranscriptionResponseFormat controls the output format requested from a transcriber.
type TranscriptionResponseFormat string

// Known TranscriptionResponseFormat values. The type is a plain string, so
// other values can be constructed; how an unknown value is treated is up to
// the Transcriber implementation.
const (
	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
type TranscriptionTimestampGranularity string

// Known TranscriptionTimestampGranularity values.
const (
	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// TranscriptionOptions configures transcription behavior.
//
// TranscribeFile passes the options through to Transcriber.Transcribe
// unchanged, so each implementation decides how the fields are honored.
type TranscriptionOptions struct {
	// Language and Prompt are free-form strings; presumably a language hint and
	// a priming prompt for the model — confirm against the Transcriber in use.
	Language string
	Prompt   string

	// Temperature is a pointer so that it can be left unset (nil).
	Temperature *float64

	// ResponseFormat is the requested output format.
	ResponseFormat TranscriptionResponseFormat

	// TimestampGranularities lists the timestamp detail levels requested.
	TimestampGranularities []TranscriptionTimestampGranularity

	// IncludeLogprobs presumably asks for token log probabilities to be
	// returned (see Transcription.Logprobs) — confirm against the Transcriber.
	IncludeLogprobs bool
}

// Transcription captures a normalized transcription result.
type Transcription struct {
	// Provider and Model are plain strings; presumably they name the backend
	// and model that produced the result — set by the Transcriber, not here.
	Provider string
	Model    string

	// Text is the transcribed text and Language its language, as reported by
	// the Transcriber.
	Text     string
	Language string

	// DurationSeconds is a duration in seconds (per the field name);
	// presumably the length of the audio — TODO confirm.
	DurationSeconds float64

	// Segments, Words and Logprobs carry optional detail at decreasing
	// granularity; which of them are populated is up to the Transcriber.
	Segments []TranscriptionSegment
	Words    []TranscriptionWord
	Logprobs []TranscriptionTokenLogprob

	// Usage holds token or duration usage details.
	Usage TranscriptionUsage

	// RawJSON is presumably the provider's unparsed JSON response — TODO confirm.
	RawJSON string
}

// TranscriptionSegment provides a coarse time-sliced transcription segment.
type TranscriptionSegment struct {
	ID int

	// Start and End bound the segment; presumably offsets in seconds from the
	// beginning of the audio — TODO confirm units.
	Start float64
	End   float64

	Text   string
	Tokens []int

	// The pointer-typed statistics below may be nil; presumably nil means the
	// transcriber did not report the value.
	AvgLogprob       *float64
	CompressionRatio *float64
	NoSpeechProb     *float64

	// Words optionally holds word-level timestamps belonging to this segment.
	Words []TranscriptionWord
}

// TranscriptionWord provides a word-level timestamp.
type TranscriptionWord struct { Word string Start float64 End float64 Confidence *float64 } // TranscriptionTokenLogprob captures token-level log probability details. type TranscriptionTokenLogprob struct { Token string Bytes []float64 Logprob float64 } // TranscriptionUsage captures token or duration usage details. type TranscriptionUsage struct { Type string InputTokens int64 OutputTokens int64 TotalTokens int64 AudioTokens int64 TextTokens int64 Seconds float64 } // TranscribeFile converts an audio file to WAV and transcribes it. func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) { if transcriber == nil { return Transcription{}, fmt.Errorf("transcriber is nil") } wav, err := audioFileToWav(ctx, filename) if err != nil { return Transcription{}, err } return transcriber.Transcribe(ctx, wav, opts) } func audioFileToWav(ctx context.Context, filename string) ([]byte, error) { if filename == "" { return nil, fmt.Errorf("filename is empty") } if strings.EqualFold(filepath.Ext(filename), ".wav") { data, err := os.ReadFile(filename) if err != nil { return nil, fmt.Errorf("read wav file: %w", err) } return data, nil } tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav") if err != nil { return nil, fmt.Errorf("create temp wav file: %w", err) } tempPath := tempFile.Name() _ = tempFile.Close() defer os.Remove(tempPath) cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath) if output, err := cmd.CombinedOutput(); err != nil { return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(output))) } data, err := os.ReadFile(tempPath) if err != nil { return nil, fmt.Errorf("read converted wav file: %w", err) } return data, nil }