Add OpenAI-based transcriber implementation

- Introduce `openaiTranscriber` for integrating OpenAI's Whisper audio transcription capabilities.
- Define the `Transcriber` interface and associated types (`Transcription`, `TranscriptionOptions`, segments, and words).
- Implement transcription logic supporting features like languages, prompts, temperature, and timestamp granularities.
- Add the `audioFileToWav` utility, which uses `ffmpeg` to convert audio files to WAV format.
- Support response parsing for structured and verbose JSON outputs.
2026-01-25 01:46:29 -05:00
parent 9c1b4f7e9f
commit 8801ce5945
2 changed files with 364 additions and 0 deletions
--- a/transcriber.go
+++ b/transcriber.go
@@ -0,0 +1,145 @@
+package llm
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+)
+
+// Transcriber abstracts a speech-to-text model; Transcribe turns the audio in wav, guided by opts, into a Transcription.
+type Transcriber interface {
+	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
+}
+
+// TranscriptionResponseFormat names the output format requested from a transcriber (see TranscriptionOptions.ResponseFormat).
+type TranscriptionResponseFormat string
+
+const ( // Known formats. NOTE(review): values presumably mirror the provider's response_format strings — confirm.
+	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
+	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
+	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
+	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
+	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
+)
+
+// TranscriptionTimestampGranularity names a level of timestamp detail (see TranscriptionOptions.TimestampGranularities).
+type TranscriptionTimestampGranularity string
+
+const ( // Known granularities: per word and per segment.
+	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
+	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
+)
+
+// TranscriptionOptions configures transcription behavior; it is passed unchanged to Transcriber.Transcribe.
+type TranscriptionOptions struct {
+	Language               string                              // Language setting passed to the transcriber.
+	Prompt                 string                              // Prompt text passed to the transcriber.
+	Temperature            *float64                            // Optional; presumably nil leaves it unset — confirm.
+	ResponseFormat         TranscriptionResponseFormat         // Requested output format.
+	TimestampGranularities []TranscriptionTimestampGranularity // Requested timestamp detail levels.
+	IncludeLogprobs        bool                                // Presumably requests token log probabilities — confirm.
+}
+
+// Transcription captures a normalized transcription result as returned by Transcriber.Transcribe.
+type Transcription struct {
+	Provider        string                      // Provider name; presumably set by the implementation — confirm.
+	Model           string                      // Model name; presumably the model that produced the result — confirm.
+	Text            string                      // Transcribed text.
+	Language        string                      // Language of the result; NOTE(review): detected vs. requested is not shown here.
+	DurationSeconds float64                     // Duration in seconds; presumably of the audio — confirm.
+	Segments        []TranscriptionSegment      // Time-sliced segments.
+	Words           []TranscriptionWord         // Word-level timestamps.
+	Logprobs        []TranscriptionTokenLogprob // Token-level log probability details.
+	Usage           TranscriptionUsage          // Token or duration usage details.
+	RawJSON         string                      // NOTE(review): presumably the raw provider response body — confirm.
+}
+
+// TranscriptionSegment provides a coarse time-sliced transcription segment.
+type TranscriptionSegment struct {
+	ID               int                 // Segment identifier.
+	Start            float64             // Start time; presumably seconds — confirm.
+	End              float64             // End time; presumably seconds — confirm.
+	Text             string              // Text of the segment.
+	Tokens           []int               // Presumably token IDs for Text — confirm.
+	AvgLogprob       *float64            // Optional; presumably nil when not reported — confirm.
+	CompressionRatio *float64            // Optional; presumably nil when not reported — confirm.
+	NoSpeechProb     *float64            // Optional; presumably nil when not reported — confirm.
+	Words            []TranscriptionWord // Presumably the words that fall within this segment — confirm.
+}
+
+// TranscriptionWord provides a word-level timestamp.
+type TranscriptionWord struct {
+	Word       string   // The word text.
+	Start      float64  // Start time; presumably seconds — confirm.
+	End        float64  // End time; presumably seconds — confirm.
+	Confidence *float64 // Optional; presumably nil when not reported — confirm.
+}
+
+// TranscriptionTokenLogprob captures token-level log probability details.
+type TranscriptionTokenLogprob struct {
+	Token   string    // Token text.
+	Bytes   []float64 // Presumably the token's byte values — confirm against the provider API.
+	Logprob float64   // Log probability of the token.
+}
+
+// TranscriptionUsage captures token or duration usage details.
+type TranscriptionUsage struct {
+	Type         string  // Usage kind; NOTE(review): presumably distinguishes token from duration usage — confirm.
+	InputTokens  int64   // Input token count.
+	OutputTokens int64   // Output token count.
+	TotalTokens  int64   // Total token count.
+	AudioTokens  int64   // Presumably the audio share of the token count — confirm.
+	TextTokens   int64   // Presumably the text share of the token count — confirm.
+	Seconds      float64 // Usage in seconds; presumably set for duration-based usage — confirm.
+}
+
+// TranscribeFile converts an audio file to WAV and transcribes it.
+func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
+	if transcriber == nil {
+		return Transcription{}, fmt.Errorf("transcriber is nil")
+	}
+
+	wav, err := audioFileToWav(ctx, filename)
+	if err != nil {
+		return Transcription{}, err
+	}
+
+	return transcriber.Transcribe(ctx, wav, opts)
+}
+
+// audioFileToWav returns the contents of filename as WAV bytes.
+//
+// A file whose extension is ".wav" (compared case-insensitively) is read and
+// returned unchanged. Any other file is converted by running the "ffmpeg"
+// command, bound to ctx, which writes into a temporary file that is removed
+// when the function returns. An empty filename, a failed read, or a failed
+// ffmpeg run yields a nil slice and a descriptive error.
+func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
+	if filename == "" {
+		return nil, fmt.Errorf("filename is empty")
+	}
+
+	// Already WAV by extension: no conversion, just read the bytes.
+	if ext := filepath.Ext(filename); strings.EqualFold(ext, ".wav") {
+		raw, readErr := os.ReadFile(filename)
+		if readErr != nil {
+			return nil, fmt.Errorf("read wav file: %w", readErr)
+		}
+		return raw, nil
+	}
+
+	// Reserve a unique output path; ffmpeg overwrites it because of -y.
+	scratch, err := os.CreateTemp("", "go-llm-audio-*.wav")
+	if err != nil {
+		return nil, fmt.Errorf("create temp wav file: %w", err)
+	}
+	outPath := scratch.Name()
+	_ = scratch.Close()
+	defer os.Remove(outPath)
+
+	ffmpegArgs := []string{
+		"-hide_banner",
+		"-loglevel", "error",
+		"-y",
+		"-i", filename,
+		"-vn",
+		"-f", "wav",
+		outPath,
+	}
+	combined, err := exec.CommandContext(ctx, "ffmpeg", ffmpegArgs...).CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(combined)))
+	}
+
+	converted, err := os.ReadFile(outPath)
+	if err != nil {
+		return nil, fmt.Errorf("read converted wav file: %w", err)
+	}
+
+	return converted, nil
+}