Files
go-llm/v2/provider/transcription.go
Steve Dudenhoeffer 9e288954f2
Some checks failed
CI / Lint (push) Failing after 5m0s
CI / Root Module (push) Failing after 5m3s
CI / V2 Module (push) Successful in 10m48s
Add transcription API to v2 module
Migrate speech-to-text transcription types and OpenAI transcriber
implementation from v1. Types are defined in provider/ to avoid
import cycles and re-exported via type aliases from the root package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 20:24:20 -05:00

91 lines
2.7 KiB
Go

package provider
import "context"
// Transcriber is the speech-to-text abstraction of this package. Any type
// with a matching Transcribe method satisfies it.
type Transcriber interface {
	// Transcribe receives a context, the audio payload as raw bytes and the
	// per-call options, and yields a Transcription or an error.
	//
	// NOTE(review): the payload parameter is named wav, so it is presumably
	// expected to hold WAV-encoded audio — confirm against the implementations.
	Transcribe(
		ctx context.Context,
		wav []byte,
		opts TranscriptionOptions,
	) (Transcription, error)
}
// TranscriptionResponseFormat is a string-backed type naming the output
// format a caller requests from a transcriber.
type TranscriptionResponseFormat string

// Response format values declared by this package. Each constant is typed as
// TranscriptionResponseFormat and carries the literal string shown.
const (
	TranscriptionResponseFormatJSON        = TranscriptionResponseFormat("json")
	TranscriptionResponseFormatVerboseJSON = TranscriptionResponseFormat("verbose_json")
	TranscriptionResponseFormatText        = TranscriptionResponseFormat("text")
	TranscriptionResponseFormatSRT         = TranscriptionResponseFormat("srt")
	TranscriptionResponseFormatVTT         = TranscriptionResponseFormat("vtt")
)
// TranscriptionTimestampGranularity is a string-backed type naming the level
// of timestamp detail a caller requests.
type TranscriptionTimestampGranularity string

// Timestamp granularity values declared by this package. Each constant is
// typed as TranscriptionTimestampGranularity and carries the literal shown.
const (
	TranscriptionTimestampGranularityWord    = TranscriptionTimestampGranularity("word")
	TranscriptionTimestampGranularitySegment = TranscriptionTimestampGranularity("segment")
)
// TranscriptionOptions is the options value passed to Transcriber.Transcribe
// to configure a single transcription call.
type TranscriptionOptions struct {
	// NOTE(review): presumably a language hint for the audio and a text prompt
	// forwarded to the model — confirm against the transcriber implementation.
	Language, Prompt string

	// Temperature is a pointer, so nil is distinguishable from an explicit 0.
	// NOTE(review): presumably nil means "use the provider default" — confirm.
	Temperature *float64

	// ResponseFormat is the requested output format.
	ResponseFormat TranscriptionResponseFormat

	// TimestampGranularities lists the requested timestamp detail levels.
	TimestampGranularities []TranscriptionTimestampGranularity

	// NOTE(review): presumably asks for token log probabilities to be
	// returned in Transcription.Logprobs — confirm.
	IncludeLogprobs bool
}
// Transcription is the normalized result value returned by
// Transcriber.Transcribe.
type Transcription struct {
	// NOTE(review): presumably the provider and model that produced the
	// result — confirm against the code that fills this struct.
	Provider, Model string

	// Text is the transcribed text.
	Text string
	// NOTE(review): presumably the language reported for the audio — confirm.
	Language string
	// DurationSeconds is a duration expressed in seconds.
	// NOTE(review): presumably the length of the input audio — confirm.
	DurationSeconds float64

	// Segments, Words and Logprobs hold the finer-grained result details;
	// see the respective element types.
	Segments []TranscriptionSegment
	Words    []TranscriptionWord
	Logprobs []TranscriptionTokenLogprob

	// Usage holds token or duration usage details.
	Usage TranscriptionUsage
	// NOTE(review): presumably the unmodified provider response body — confirm.
	RawJSON string
}
// TranscriptionSegment is one coarse, time-sliced piece of a transcription.
type TranscriptionSegment struct {
	ID int
	// NOTE(review): Start and End are presumably offsets in seconds from the
	// beginning of the audio — the unit is not stated here; confirm.
	Start, End float64
	// Text is the text of this segment.
	Text string
	// NOTE(review): presumably the model token IDs for Text — confirm.
	Tokens []int
	// Optional per-segment statistics; each is a pointer, so nil is
	// distinguishable from an explicit zero.
	AvgLogprob, CompressionRatio, NoSpeechProb *float64
	// Words holds word-level entries attached to this segment.
	Words []TranscriptionWord
}
// TranscriptionWord is a single word together with its timestamps.
type TranscriptionWord struct {
	// Word is the word text.
	Word string
	// NOTE(review): Start and End are presumably offsets in seconds from the
	// beginning of the audio — the unit is not stated here; confirm.
	Start, End float64
	// Confidence is optional; it is a pointer, so nil is distinguishable from
	// an explicit zero.
	Confidence *float64
}
// TranscriptionTokenLogprob holds the log probability details of one token.
type TranscriptionTokenLogprob struct {
	// Token is the token text.
	Token string

	// NOTE(review): declared as []float64 rather than []byte — presumably it
	// mirrors the type used by the upstream response; confirm before changing.
	Bytes []float64

	// Logprob is the log probability of the token.
	Logprob float64
}
// TranscriptionUsage holds usage details for a transcription, expressed as
// token counts and/or a duration.
type TranscriptionUsage struct {
	// NOTE(review): presumably discriminates token-based from duration-based
	// usage — the accepted values are not visible here; confirm.
	Type string

	// Token counters.
	InputTokens, OutputTokens, TotalTokens int64
	// NOTE(review): presumably a breakdown of the token count by modality —
	// confirm.
	AudioTokens, TextTokens int64

	// Seconds is a duration expressed in seconds.
	Seconds float64
}