// go-llm/v2/provider/transcription.go

package provider

import "context"

// Transcriber abstracts a speech-to-text model implementation.
//
// Transcribe receives the audio payload as an in-memory byte slice together
// with per-call options, and returns a normalized Transcription or an error.
// The context is the first parameter, following the usual Go convention for
// cancellable calls.
//
// NOTE(review): the parameter name suggests wav holds WAV-encoded audio;
// nothing in this file enforces that, so confirm against the implementations.
type Transcriber interface {
	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}

// TranscriptionResponseFormat controls the output format requested from a transcriber.
//
// It is a string type, so its zero value is the empty string, which matches
// none of the constants declared below.
type TranscriptionResponseFormat string

// Response formats that can be assigned to TranscriptionOptions.ResponseFormat.
//
// NOTE(review): the string values are presumably forwarded to the provider
// unchanged; confirm against the implementations before renaming any of them.
const (
	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
//
// It is a string type, so its zero value is the empty string, which matches
// neither of the constants declared below.
type TranscriptionTimestampGranularity string

// Timestamp granularities that can be listed in
// TranscriptionOptions.TimestampGranularities.
//
// NOTE(review): presumably "word" and "segment" correspond to the Words and
// Segments fields of Transcription; confirm against the implementations.
const (
	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// TranscriptionOptions configures transcription behavior.
//
// It is passed by value to Transcriber.Transcribe. Temperature is a pointer,
// so an unset value (nil) is distinguishable from an explicit 0.
// ResponseFormat and TimestampGranularities take the typed constants declared
// in this file.
//
// NOTE(review): how a transcriber treats zero values (empty Language or
// Prompt, nil Temperature, empty ResponseFormat, nil TimestampGranularities)
// is not visible here; presumably they mean "use the provider default".
// Confirm against the implementations.
type TranscriptionOptions struct {
	Language               string
	Prompt                 string
	Temperature            *float64
	ResponseFormat         TranscriptionResponseFormat
	TimestampGranularities []TranscriptionTimestampGranularity
	IncludeLogprobs        bool
}

// Transcription captures a normalized transcription result.
//
// Text holds the transcript itself. Segments, Words and Logprobs carry
// segment-, word- and token-level detail respectively (see their element
// types), and Usage carries the accounting figures.
//
// NOTE(review): Provider and Model presumably identify what produced the
// result, and RawJSON presumably holds the provider's unparsed response body.
// Which fields are populated likely depends on the provider and on the
// requested TranscriptionOptions. Confirm against the implementations.
type Transcription struct {
	Provider        string
	Model           string
	Text            string
	Language        string
	DurationSeconds float64
	Segments        []TranscriptionSegment
	Words           []TranscriptionWord
	Logprobs        []TranscriptionTokenLogprob
	Usage           TranscriptionUsage
	RawJSON         string
}

// TranscriptionSegment provides a coarse time-sliced transcription segment.
//
// AvgLogprob, CompressionRatio and NoSpeechProb are pointers, so nil is
// distinguishable from a reported value of 0. Words holds word-level entries
// nested inside the segment.
//
// NOTE(review): Start and End are presumably offsets in seconds from the
// beginning of the audio (Transcription.DurationSeconds is in seconds), but
// the unit is not stated in this file. Tokens presumably holds provider token
// IDs. Confirm both against the implementations.
type TranscriptionSegment struct {
	ID               int
	Start            float64
	End              float64
	Text             string
	Tokens           []int
	AvgLogprob       *float64
	CompressionRatio *float64
	NoSpeechProb     *float64
	Words            []TranscriptionWord
}

// TranscriptionWord provides a word-level timestamp.
//
// Confidence is a pointer, so nil is distinguishable from a reported value
// of 0.
//
// NOTE(review): Start and End are presumably offsets in seconds from the
// beginning of the audio, and the scale of Confidence is not stated in this
// file. Confirm against the implementations.
type TranscriptionWord struct {
	Word       string
	Start      float64
	End        float64
	Confidence *float64
}

// TranscriptionTokenLogprob captures token-level log probability details.
//
// NOTE(review): Bytes is declared as []float64 rather than []byte or []int.
// Presumably this mirrors a provider payload that encodes the token's byte
// values as JSON numbers; confirm before converting the elements to bytes.
type TranscriptionTokenLogprob struct {
	Token   string
	Bytes   []float64
	Logprob float64
}

// TranscriptionUsage captures token or duration usage details.
//
// The struct has both token counters (InputTokens, OutputTokens, TotalTokens,
// AudioTokens, TextTokens) and a duration figure (Seconds). All fields are
// plain values, so an unreported figure cannot be told apart from zero.
//
// NOTE(review): Type presumably indicates whether the token counters or
// Seconds are the meaningful figures, and AudioTokens/TextTokens presumably
// break down InputTokens. Neither is established in this file; confirm
// against the implementations.
type TranscriptionUsage struct {
	Type         string
	InputTokens  int64
	OutputTokens int64
	TotalTokens  int64
	AudioTokens  int64
	TextTokens   int64
	Seconds      float64
}