Migrate speech-to-text transcription types and OpenAI transcriber implementation from v1. Types are defined in provider/ to avoid import cycles and re-exported via type aliases from the root package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
91 lines
2.7 KiB
Go
91 lines
2.7 KiB
Go
package provider
|
|
|
|
import "context"
|
|
|
|
// Transcriber abstracts a speech-to-text model implementation.
|
|
type Transcriber interface {
|
|
Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
|
|
}
|
|
|
|
// TranscriptionResponseFormat controls the output format requested from a transcriber.
type TranscriptionResponseFormat string

// TranscriptionResponseFormat values declared by this package. Each constant's
// underlying string is the lower-case format name shown on its line.
const (
	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)
|
|
|
|
// TranscriptionTimestampGranularity defines the requested timestamp detail.
type TranscriptionTimestampGranularity string

// TranscriptionTimestampGranularity values declared by this package.
const (
	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)
|
|
|
|
// TranscriptionOptions configures transcription behavior.
|
|
type TranscriptionOptions struct {
|
|
Language string
|
|
Prompt string
|
|
Temperature *float64
|
|
ResponseFormat TranscriptionResponseFormat
|
|
TimestampGranularities []TranscriptionTimestampGranularity
|
|
IncludeLogprobs bool
|
|
}
|
|
|
|
// Transcription captures a normalized transcription result.
|
|
type Transcription struct {
|
|
Provider string
|
|
Model string
|
|
Text string
|
|
Language string
|
|
DurationSeconds float64
|
|
Segments []TranscriptionSegment
|
|
Words []TranscriptionWord
|
|
Logprobs []TranscriptionTokenLogprob
|
|
Usage TranscriptionUsage
|
|
RawJSON string
|
|
}
|
|
|
|
// TranscriptionSegment provides a coarse time-sliced transcription segment.
|
|
type TranscriptionSegment struct {
|
|
ID int
|
|
Start float64
|
|
End float64
|
|
Text string
|
|
Tokens []int
|
|
AvgLogprob *float64
|
|
CompressionRatio *float64
|
|
NoSpeechProb *float64
|
|
Words []TranscriptionWord
|
|
}
|
|
|
|
// TranscriptionWord provides a word-level timestamp.
type TranscriptionWord struct {
	// Word is the text of the word.
	Word string

	// Start is the word's start time.
	// NOTE(review): unit is presumably seconds from the start of the audio —
	// confirm.
	Start float64

	// End is the word's end time, in the same unit as Start.
	End float64

	// Confidence is optional; nil means no confidence value is present.
	Confidence *float64
}
|
|
|
|
// TranscriptionTokenLogprob captures token-level log probability details.
type TranscriptionTokenLogprob struct {
	// Token is the token text.
	Token string

	// Bytes holds numeric values associated with the token.
	// NOTE(review): presumably the token's byte values, stored as float64 —
	// confirm against the code that fills this field.
	Bytes []float64

	// Logprob is the log probability recorded for the token.
	Logprob float64
}
|
|
|
|
// TranscriptionUsage captures token or duration usage details.
type TranscriptionUsage struct {
	// Type labels the kind of usage being reported.
	// NOTE(review): presumably distinguishes token-based from duration-based
	// usage; the concrete values are set by implementations — confirm.
	Type string

	// InputTokens is the reported input token count.
	InputTokens int64

	// OutputTokens is the reported output token count.
	OutputTokens int64

	// TotalTokens is the reported total token count.
	TotalTokens int64

	// AudioTokens is the reported audio token count.
	AudioTokens int64

	// TextTokens is the reported text token count.
	TextTokens int64

	// Seconds is the reported usage duration in seconds.
	Seconds float64
}
|