package provider

import "context"

// Transcriber abstracts a speech-to-text model implementation.
type Transcriber interface {
	// Transcribe runs transcription over the audio bytes in wav using opts and
	// returns the resulting Transcription, or an error.
	// NOTE(review): the parameter name suggests WAV-encoded audio; confirm the
	// accepted encodings with each implementation.
	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}

// TranscriptionResponseFormat controls the output format requested from a transcriber.
type TranscriptionResponseFormat string

// Known TranscriptionResponseFormat values.
const (
	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
type TranscriptionTimestampGranularity string

// Known TranscriptionTimestampGranularity values.
const (
	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// TranscriptionOptions configures transcription behavior.
type TranscriptionOptions struct {
	// Language and Prompt are free-form strings handed to the transcriber.
	// NOTE(review): presumably a language hint and a decoding prompt; the
	// accepted formats are not defined in this file.
	Language string
	Prompt   string

	// Temperature is optional: a nil pointer is distinguishable from an
	// explicit 0.
	Temperature *float64

	// ResponseFormat selects the requested output format.
	ResponseFormat TranscriptionResponseFormat

	// TimestampGranularities lists the timestamp detail levels requested.
	TimestampGranularities []TranscriptionTimestampGranularity

	// IncludeLogprobs asks for token log probabilities.
	// NOTE(review): presumably surfaced via Transcription.Logprobs; confirm.
	IncludeLogprobs bool
}

// Transcription captures a normalized transcription result.
type Transcription struct {
	// Provider and Model name the backend and model associated with the result.
	Provider string
	Model    string

	// Text is the transcribed text; Language is the language associated with it.
	Text     string
	Language string

	// DurationSeconds is a duration expressed in seconds.
	// NOTE(review): presumably the length of the input audio; confirm.
	DurationSeconds float64

	// Segments, Words and Logprobs carry finer-grained detail; each may be empty.
	Segments []TranscriptionSegment
	Words    []TranscriptionWord
	Logprobs []TranscriptionTokenLogprob

	// Usage carries token or duration usage details.
	Usage TranscriptionUsage

	// RawJSON is a JSON document kept as a string.
	// NOTE(review): presumably the provider's unmodified response; confirm.
	RawJSON string
}

// TranscriptionSegment provides a coarse time-sliced transcription segment.
type TranscriptionSegment struct {
	// ID identifies the segment.
	ID int

	// Start and End bound the segment in time.
	// NOTE(review): presumably seconds from the start of the audio; confirm.
	Start float64
	End   float64

	// Text is the segment's text; Tokens holds integer tokens for it
	// (presumably token IDs; confirm).
	Text   string
	Tokens []int

	// Optional per-segment metrics; each is nil when not set.
	AvgLogprob       *float64
	CompressionRatio *float64
	NoSpeechProb     *float64

	// Words holds word-level entries for the segment; it may be empty.
	Words []TranscriptionWord
}

// TranscriptionWord provides a word-level timestamp.
type TranscriptionWord struct {
	// Word is the text of the word.
	Word string

	// Start and End bound the word in time.
	// NOTE(review): presumably seconds from the start of the audio; confirm.
	Start float64
	End   float64

	// Confidence is optional; it is nil when not set.
	Confidence *float64
}

// TranscriptionTokenLogprob captures token-level log probability details.
type TranscriptionTokenLogprob struct {
	// Token is the token text.
	Token string

	// Bytes is declared as []float64 rather than []byte.
	// NOTE(review): presumably mirrors a provider payload that encodes byte
	// values as JSON numbers; confirm before changing the element type.
	Bytes []float64

	// Logprob is the token's log probability.
	Logprob float64
}

// TranscriptionUsage captures token or duration usage details.
type TranscriptionUsage struct {
	// Type labels the kind of usage recorded.
	// NOTE(review): presumably distinguishes token-based from duration-based
	// usage; confirm the valid values.
	Type string

	// Token counts.
	InputTokens  int64
	OutputTokens int64
	TotalTokens  int64
	AudioTokens  int64
	TextTokens   int64

	// Seconds is the duration-based usage, in seconds.
	Seconds float64
}