Add transcription API to v2 module

Migrate speech-to-text transcription types and OpenAI transcriber implementation from v1. Types are defined in provider/ to avoid import cycles and re-exported via type aliases from the root package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 20:24:20 -05:00
parent 9d6d2c61c3
commit 9e288954f2
3 changed files with 420 additions and 0 deletions
--- a/v2/provider/transcription.go
+++ b/v2/provider/transcription.go
@@ -0,0 +1,90 @@
+package provider
+
+import "context"
+
+// Transcriber abstracts a speech-to-text model implementation behind a single Transcribe call.
+type Transcriber interface {
+	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error) // wav: audio bytes, presumably WAV-encoded (per the parameter name) — TODO confirm with implementations
+}
+
+// TranscriptionResponseFormat is a string-typed identifier for the output format requested from a transcriber.
+type TranscriptionResponseFormat string
+
+const (
+	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"         // presumably a compact JSON payload — confirm against the provider
+	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json" // presumably JSON with extra detail (segments/timestamps) — confirm
+	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"         // presumably plain text output — confirm
+	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"          // presumably SubRip subtitle output — confirm
+	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"          // presumably WebVTT subtitle output — confirm
+)
+
+// TranscriptionTimestampGranularity is a string-typed identifier for the level of timestamp detail requested.
+type TranscriptionTimestampGranularity string
+
+const (
+	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"    // word-level timestamp detail
+	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment" // segment-level timestamp detail
+)
+
+// TranscriptionOptions configures transcription behavior; it is passed by value to Transcriber.Transcribe.
+type TranscriptionOptions struct {
+	Language               string                              // presumably a spoken-language hint (e.g. "en") — confirm expected format with the implementation
+	Prompt                 string                              // presumably free-text context forwarded to the model — confirm
+	Temperature            *float64                            // pointer, so nil is distinguishable from an explicit 0 (presumably "use provider default" — confirm)
+	ResponseFormat         TranscriptionResponseFormat         // one of the TranscriptionResponseFormat constants; the zero value is ""
+	TimestampGranularities []TranscriptionTimestampGranularity // zero or more TranscriptionTimestampGranularity values
+	IncludeLogprobs        bool                                // presumably requests token log probabilities (see Transcription.Logprobs) — confirm
+}
+
+// Transcription captures a normalized transcription result, as returned by Transcriber.Transcribe.
+type Transcription struct {
+	Provider        string                      // presumably the provider identifier, set by the implementation — confirm
+	Model           string                      // presumably the model identifier used for the request — confirm
+	Text            string                      // transcribed text
+	Language        string                      // language of the result; whether detected or echoed from the options is not visible here
+	DurationSeconds float64                     // duration in seconds (per the field name); presumably of the input audio — confirm
+	Segments        []TranscriptionSegment      // segment-level entries, if any
+	Words           []TranscriptionWord         // word-level entries, if any
+	Logprobs        []TranscriptionTokenLogprob // token-level log probability entries, if any
+	Usage           TranscriptionUsage          // usage accounting; see TranscriptionUsage
+	RawJSON         string                      // presumably the provider's unparsed JSON response — confirm
+}
+
+// TranscriptionSegment provides a coarse time-sliced transcription segment with optional per-segment statistics.
+type TranscriptionSegment struct {
+	ID               int                 // segment identifier; numbering scheme is not visible here
+	Start            float64             // presumably the start offset in seconds — confirm units
+	End              float64             // presumably the end offset in seconds — confirm units
+	Text             string              // text of this segment
+	Tokens           []int               // presumably model token IDs for the segment — confirm
+	AvgLogprob       *float64            // optional (nil-able); presumably the mean token log probability — confirm
+	CompressionRatio *float64            // optional (nil-able); exact definition is not visible here
+	NoSpeechProb     *float64            // optional (nil-able); presumably the probability that the segment holds no speech — confirm
+	Words            []TranscriptionWord // word-level entries for this segment, if any
+}
+
+// TranscriptionWord provides a word-level timestamp and an optional confidence value.
+type TranscriptionWord struct {
+	Word       string   // the word text
+	Start      float64  // presumably the start offset in seconds — confirm units
+	End        float64  // presumably the end offset in seconds — confirm units
+	Confidence *float64 // optional (nil-able); scale is not visible here
+}
+
+// TranscriptionTokenLogprob captures token-level log probability details.
+type TranscriptionTokenLogprob struct {
+	Token   string    // token text
+	Bytes   []float64 // NOTE(review): byte values carried as float64, presumably mirroring the upstream API/SDK shape — confirm
+	Logprob float64   // log probability of the token
+}
+
+// TranscriptionUsage captures token or duration usage details; which fields are populated presumably depends on Type.
+type TranscriptionUsage struct {
+	Type         string  // presumably discriminates token-based from duration-based usage — confirm the possible values
+	InputTokens  int64   // input token count
+	OutputTokens int64   // output token count
+	TotalTokens  int64   // total token count; presumably InputTokens + OutputTokens — confirm
+	AudioTokens  int64   // presumably the audio portion of the input tokens — confirm
+	TextTokens   int64   // presumably the text portion of the input tokens — confirm
+	Seconds      float64 // duration-based usage, in seconds (per the field name)
+}