Add transcription API to v2 module

Migrate speech-to-text transcription types and OpenAI transcriber implementation from v1. Types are defined in provider/ to avoid import cycles and re-exported via type aliases from the root package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 20:24:20 -05:00
parent 9d6d2c61c3
commit 9e288954f2
3 changed files with 420 additions and 0 deletions
--- a/v2/openai/transcriber.go
+++ b/v2/openai/transcriber.go
@@ -0,0 +1,230 @@
+package openai
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/openai/openai-go"
+	"github.com/openai/openai-go/option"
+
+	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
+)
+
+// Transcriber implements the provider.Transcriber interface using OpenAI's audio models.
+//
+// Construct it with NewTranscriber or NewTranscriberWithBaseURL. Transcribe
+// builds a new OpenAI client from these fields on every call.
+type Transcriber struct {
+	// key is the API key handed to the OpenAI client for each request.
+	key     string
+	// model is the audio model name sent with each request and echoed back
+	// in the returned provider.Transcription.
+	model   string
+	// baseURL, when non-empty, replaces the client's default API endpoint.
+	baseURL string
+}
+
+// Compile-time check that *Transcriber satisfies provider.Transcriber.
+var _ provider.Transcriber = (*Transcriber)(nil)
+
+// NewTranscriber creates a transcriber backed by OpenAI's audio models.
+//
+// key is the OpenAI API key. model names the audio model to use; leading and
+// trailing whitespace is removed, and if nothing remains "whisper-1" is used
+// by default.
+func NewTranscriber(key string, model string) *Transcriber {
+	// Store the trimmed name rather than only testing it: a padded value such
+	// as " gpt-4o-transcribe" would otherwise miss the "gpt-4o" prefix check
+	// in Transcribe and be sent to the API with stray whitespace.
+	model = strings.TrimSpace(model)
+	if model == "" {
+		model = "whisper-1"
+	}
+	return &Transcriber{
+		key:   key,
+		model: model,
+	}
+}
+
+// NewTranscriberWithBaseURL creates a transcriber with a custom API base URL.
+//
+// key and model are handled exactly as in NewTranscriber; baseURL is stored
+// as given and, when non-empty, is used instead of the client's default
+// endpoint.
+func NewTranscriberWithBaseURL(key, model, baseURL string) *Transcriber {
+	transcriber := NewTranscriber(key, model)
+	transcriber.baseURL = baseURL
+	return transcriber
+}
+
+// Transcribe performs speech-to-text transcription of WAV audio data.
+//
+// wav is uploaded under the file name "audio.wav" with content type
+// "audio/wav". When opts.ResponseFormat is empty, models whose name starts
+// with "gpt-4o" default to json and every other model to verbose_json. Only
+// those two formats are accepted, and timestamp granularities additionally
+// require verbose_json. A new OpenAI client is built on every call from the
+// stored key and, if set, the custom base URL.
+//
+// An error is returned for empty input, for an unsupported combination of
+// options, or when the API request fails (the SDK error is wrapped with %w).
+func (t *Transcriber) Transcribe(ctx context.Context, wav []byte, opts provider.TranscriptionOptions) (provider.Transcription, error) {
+	// Zero value returned alongside every error.
+	var none provider.Transcription
+
+	if len(wav) == 0 {
+		return none, fmt.Errorf("wav data is empty")
+	}
+
+	// Resolve the response format: an explicit choice wins, otherwise the
+	// default depends on the model family.
+	format := opts.ResponseFormat
+	switch {
+	case format != "":
+		// Caller chose a format explicitly; keep it.
+	case strings.HasPrefix(t.model, "gpt-4o"):
+		format = provider.TranscriptionResponseFormatJSON
+	default:
+		format = provider.TranscriptionResponseFormatVerboseJSON
+	}
+
+	// Validate the format, then the timestamp request against it.
+	switch format {
+	case provider.TranscriptionResponseFormatVerboseJSON:
+		// Supports every option handled below.
+	case provider.TranscriptionResponseFormatJSON:
+		if len(opts.TimestampGranularities) > 0 {
+			return none, fmt.Errorf("timestamp granularities require response_format=verbose_json")
+		}
+	default:
+		return none, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
+	}
+
+	req := openai.AudioTranscriptionNewParams{
+		File:           openai.File(bytes.NewReader(wav), "audio.wav", "audio/wav"),
+		Model:          openai.AudioModel(t.model),
+		ResponseFormat: openai.AudioResponseFormat(format),
+	}
+
+	// Optional settings are only sent when the caller supplied them.
+	if lang := opts.Language; lang != "" {
+		req.Language = openai.String(lang)
+	}
+	if prompt := opts.Prompt; prompt != "" {
+		req.Prompt = openai.String(prompt)
+	}
+	if temp := opts.Temperature; temp != nil {
+		req.Temperature = openai.Float(*temp)
+	}
+	if opts.IncludeLogprobs {
+		req.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
+	}
+	for _, g := range opts.TimestampGranularities {
+		req.TimestampGranularities = append(req.TimestampGranularities, string(g))
+	}
+
+	reqOpts := []option.RequestOption{option.WithAPIKey(t.key)}
+	if t.baseURL != "" {
+		reqOpts = append(reqOpts, option.WithBaseURL(t.baseURL))
+	}
+
+	client := openai.NewClient(reqOpts...)
+	resp, err := client.Audio.Transcriptions.New(ctx, req)
+	if err != nil {
+		return none, fmt.Errorf("openai transcription failed: %w", err)
+	}
+
+	return transcriptionToResult(t.model, resp), nil
+}
+
+// verboseTranscription mirrors the JSON fields that transcriptionToResult
+// decodes from the raw response body. Fields missing from the payload keep
+// their zero values.
+type verboseTranscription struct {
+	Text     string           `json:"text"`
+	Language string           `json:"language"`
+	Duration float64          `json:"duration"`
+	Segments []verboseSegment `json:"segments"`
+	Words    []verboseWord    `json:"words"`
+}
+
+// verboseSegment is one entry of the "segments" array in the decoded response.
+// AvgLogprob, CompressionRatio and NoSpeechProb are pointers so that a value
+// that is absent or null in the JSON stays nil instead of reading as 0.
+type verboseSegment struct {
+	ID               int           `json:"id"`
+	Start            float64       `json:"start"`
+	End              float64       `json:"end"`
+	Text             string        `json:"text"`
+	Tokens           []int         `json:"tokens"`
+	AvgLogprob       *float64      `json:"avg_logprob"`
+	CompressionRatio *float64      `json:"compression_ratio"`
+	NoSpeechProb     *float64      `json:"no_speech_prob"`
+	Words            []verboseWord `json:"words"`
+}
+
+// verboseWord is one entry of a "words" array in the decoded response, either
+// at the top level or nested inside a segment.
+type verboseWord struct {
+	Word  string  `json:"word"`
+	Start float64 `json:"start"`
+	End   float64 `json:"end"`
+}
+
+// transcriptionToResult converts an OpenAI SDK transcription response into a
+// provider.Transcription.
+//
+// Provider is always "openai" and Model is the given model name. A nil resp
+// yields a result with only those two fields set. Text, log probabilities and
+// usage come from the typed SDK response; language, duration, segments and
+// words are decoded from the response's raw JSON. If the raw JSON is empty or
+// cannot be decoded, the result built from the typed fields is returned as is.
+func transcriptionToResult(model string, resp *openai.Transcription) provider.Transcription {
+	out := provider.Transcription{Provider: "openai", Model: model}
+	if resp == nil {
+		return out
+	}
+
+	out.Text, out.RawJSON = resp.Text, resp.RawJSON()
+
+	for i := range resp.Logprobs {
+		lp := &resp.Logprobs[i]
+		out.Logprobs = append(out.Logprobs, provider.TranscriptionTokenLogprob{
+			Token:   lp.Token,
+			Bytes:   lp.Bytes,
+			Logprob: lp.Logprob,
+		})
+	}
+
+	// Only record usage when the response carried a recognized usage type.
+	usage := usageToTranscriptionUsage(resp.Usage)
+	if usage.Type != "" {
+		out.Usage = usage
+	}
+
+	// The remaining fields exist only in the raw payload; without a decodable
+	// payload there is nothing more to add.
+	var payload verboseTranscription
+	if out.RawJSON == "" || json.Unmarshal([]byte(out.RawJSON), &payload) != nil {
+		return out
+	}
+
+	// Prefer the decoded text, but never replace existing text with nothing.
+	if payload.Text != "" {
+		out.Text = payload.Text
+	}
+	out.Language = payload.Language
+	out.DurationSeconds = payload.Duration
+
+	for _, s := range payload.Segments {
+		out.Segments = append(out.Segments, provider.TranscriptionSegment{
+			ID:    s.ID,
+			Start: s.Start,
+			End:   s.End,
+			Text:  s.Text,
+			// Copy so the result does not share memory with the decoded payload.
+			Tokens:           append([]int(nil), s.Tokens...),
+			AvgLogprob:       s.AvgLogprob,
+			CompressionRatio: s.CompressionRatio,
+			NoSpeechProb:     s.NoSpeechProb,
+			Words:            verboseWordsToProvider(s.Words),
+		})
+	}
+	out.Words = verboseWordsToProvider(payload.Words)
+
+	return out
+}
+
+// verboseWordsToProvider converts decoded word entries into
+// provider.TranscriptionWord values, returning nil when words is empty.
+func verboseWordsToProvider(words []verboseWord) []provider.TranscriptionWord {
+	var converted []provider.TranscriptionWord
+	for _, w := range words {
+		converted = append(converted, provider.TranscriptionWord{
+			Word:  w.Word,
+			Start: w.Start,
+			End:   w.End,
+		})
+	}
+	return converted
+}
+
+// usageToTranscriptionUsage converts the SDK usage union into a
+// provider.TranscriptionUsage.
+//
+// A "tokens" usage fills the token counters, a "duration" usage fills Seconds,
+// and any other type produces the zero value (whose Type is empty).
+func usageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) provider.TranscriptionUsage {
+	var out provider.TranscriptionUsage
+
+	if usage.Type == "tokens" {
+		tok := usage.AsTokens()
+		out.Type = usage.Type
+		out.InputTokens = tok.InputTokens
+		out.OutputTokens = tok.OutputTokens
+		out.TotalTokens = tok.TotalTokens
+		out.AudioTokens = tok.InputTokenDetails.AudioTokens
+		out.TextTokens = tok.InputTokenDetails.TextTokens
+		return out
+	}
+
+	if usage.Type == "duration" {
+		out.Type = usage.Type
+		out.Seconds = usage.AsDuration().Seconds
+		return out
+	}
+
+	return out
+}
--- a/v2/provider/transcription.go
+++ b/v2/provider/transcription.go
@@ -0,0 +1,90 @@
+package provider
+
+import "context"
+
+// Transcriber abstracts a speech-to-text model implementation.
+type Transcriber interface {
+	// Transcribe converts the audio bytes in wav to a Transcription using the
+	// settings in opts. The OpenAI implementation added with this interface
+	// treats wav as WAV audio data and rejects an empty slice.
+	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
+}
+
+// TranscriptionResponseFormat controls the output format requested from a transcriber.
+//
+// Not every transcriber accepts every format: the OpenAI transcriber added
+// with these types only accepts the JSON and VerboseJSON values and returns an
+// error for the others.
+type TranscriptionResponseFormat string
+
+// Supported response format identifiers.
+const (
+	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
+	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
+	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
+	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
+	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
+)
+
+// TranscriptionTimestampGranularity defines the requested timestamp detail.
+type TranscriptionTimestampGranularity string
+
+// Timestamp granularities that can be requested through
+// TranscriptionOptions.TimestampGranularities.
+const (
+	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
+	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
+)
+
+// TranscriptionOptions configures transcription behavior.
+//
+// The field notes describe how the OpenAI transcriber added with this type
+// interprets each option; other implementations may differ.
+type TranscriptionOptions struct {
+	// Language is forwarded to the provider only when non-empty.
+	Language               string
+	// Prompt is forwarded to the provider only when non-empty.
+	Prompt                 string
+	// Temperature is forwarded only when non-nil, so nil means "not set"
+	// rather than 0.
+	Temperature            *float64
+	// ResponseFormat selects the output format. When empty, the transcriber
+	// picks a default based on its model.
+	ResponseFormat         TranscriptionResponseFormat
+	// TimestampGranularities lists the timestamp levels to request. The
+	// OpenAI transcriber rejects a non-empty list unless the response format
+	// is verbose_json.
+	TimestampGranularities []TranscriptionTimestampGranularity
+	// IncludeLogprobs asks the provider to return token log probabilities.
+	IncludeLogprobs        bool
+}
+
+// Transcription captures a normalized transcription result.
+//
+// Fields a provider does not report are left at their zero value.
+type Transcription struct {
+	// Provider identifies the backend that produced the result ("openai" for
+	// the OpenAI transcriber).
+	Provider        string
+	// Model is the model name the transcriber was configured with.
+	Model           string
+	// Text is the transcribed text.
+	Text            string
+	// Language is the language reported in the response, if any.
+	Language        string
+	// DurationSeconds is the duration reported in the response, if any.
+	DurationSeconds float64
+	// Segments holds per-segment details when the response includes them.
+	Segments        []TranscriptionSegment
+	// Words holds top-level word timestamps when the response includes them.
+	Words           []TranscriptionWord
+	// Logprobs holds token log probabilities when the response includes them.
+	Logprobs        []TranscriptionTokenLogprob
+	// Usage is the reported usage; its Type is empty when none was recognized.
+	Usage           TranscriptionUsage
+	// RawJSON is the raw JSON of the provider response, as exposed by the
+	// provider SDK.
+	RawJSON         string
+}
+
+// TranscriptionSegment provides a coarse time-sliced transcription segment.
+type TranscriptionSegment struct {
+	// ID is the segment identifier taken from the provider response.
+	ID               int
+	// Start and End bound the segment in time.
+	// NOTE(review): presumably seconds from the start of the audio - confirm.
+	Start            float64
+	End              float64
+	// Text is the transcribed text of this segment.
+	Text             string
+	// Tokens lists the token IDs reported for this segment.
+	Tokens           []int
+	// AvgLogprob, CompressionRatio and NoSpeechProb are optional statistics;
+	// each is nil when the provider response did not include it.
+	AvgLogprob       *float64
+	CompressionRatio *float64
+	NoSpeechProb     *float64
+	// Words holds word timestamps nested under this segment, if any.
+	Words            []TranscriptionWord
+}
+
+// TranscriptionWord provides a word-level timestamp.
+type TranscriptionWord struct {
+	// Word is the transcribed word.
+	Word       string
+	// Start and End bound the word in time.
+	// NOTE(review): presumably seconds from the start of the audio - confirm.
+	Start      float64
+	End        float64
+	// Confidence is optional and nil when unknown. The OpenAI transcriber
+	// added with this type never sets it.
+	Confidence *float64
+}
+
+// TranscriptionTokenLogprob captures token-level log probability details.
+type TranscriptionTokenLogprob struct {
+	// Token is the token text.
+	Token   string
+	// Bytes is copied unchanged from the provider SDK's log probability entry.
+	// NOTE(review): presumably the token's byte values encoded as numbers - confirm.
+	Bytes   []float64
+	// Logprob is the log probability reported for the token.
+	Logprob float64
+}
+
+// TranscriptionUsage captures token or duration usage details.
+//
+// Type says which fields are meaningful. The OpenAI transcriber sets it to
+// "tokens" (token counters filled), "duration" (Seconds filled), or leaves the
+// whole value zero when the response reports neither.
+type TranscriptionUsage struct {
+	Type         string
+	// Token counters, filled for "tokens" usage. AudioTokens and TextTokens
+	// come from the input token details of the response.
+	InputTokens  int64
+	OutputTokens int64
+	TotalTokens  int64
+	AudioTokens  int64
+	TextTokens   int64
+	// Seconds is filled for "duration" usage.
+	Seconds      float64
+}
--- a/v2/transcriber.go
+++ b/v2/transcriber.go
@@ -0,0 +1,100 @@
+package llm
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
+)
+
+// The declarations below re-export the transcription types and constants of
+// the provider package. The types are aliases, so values are interchangeable
+// between the two packages without conversion.
+
+// Transcriber abstracts a speech-to-text model implementation.
+type Transcriber = provider.Transcriber
+
+// TranscriptionResponseFormat controls the output format requested from a transcriber.
+type TranscriptionResponseFormat = provider.TranscriptionResponseFormat
+
+// Response format identifiers, re-exported from the provider package.
+const (
+	TranscriptionResponseFormatJSON        = provider.TranscriptionResponseFormatJSON
+	TranscriptionResponseFormatVerboseJSON = provider.TranscriptionResponseFormatVerboseJSON
+	TranscriptionResponseFormatText        = provider.TranscriptionResponseFormatText
+	TranscriptionResponseFormatSRT         = provider.TranscriptionResponseFormatSRT
+	TranscriptionResponseFormatVTT         = provider.TranscriptionResponseFormatVTT
+)
+
+// TranscriptionTimestampGranularity defines the requested timestamp detail.
+type TranscriptionTimestampGranularity = provider.TranscriptionTimestampGranularity
+
+// Timestamp granularities, re-exported from the provider package.
+const (
+	TranscriptionTimestampGranularityWord    = provider.TranscriptionTimestampGranularityWord
+	TranscriptionTimestampGranularitySegment = provider.TranscriptionTimestampGranularitySegment
+)
+
+// TranscriptionOptions configures transcription behavior.
+type TranscriptionOptions = provider.TranscriptionOptions
+
+// Transcription captures a normalized transcription result.
+type Transcription = provider.Transcription
+
+// TranscriptionSegment provides a coarse time-sliced transcription segment.
+type TranscriptionSegment = provider.TranscriptionSegment
+
+// TranscriptionWord provides a word-level timestamp.
+type TranscriptionWord = provider.TranscriptionWord
+
+// TranscriptionTokenLogprob captures token-level log probability details.
+type TranscriptionTokenLogprob = provider.TranscriptionTokenLogprob
+
+// TranscriptionUsage captures token or duration usage details.
+type TranscriptionUsage = provider.TranscriptionUsage
+
+// TranscribeFile converts an audio file to WAV (via ffmpeg) and transcribes it.
+//
+// The file named by filename is loaded through audioFileToWav and the
+// resulting bytes are passed to transcriber together with opts. An error is
+// returned when transcriber is nil, when the audio cannot be loaded or
+// converted, or when the transcriber itself fails.
+func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
+	if transcriber == nil {
+		return Transcription{}, fmt.Errorf("transcriber is nil")
+	}
+
+	audio, loadErr := audioFileToWav(ctx, filename)
+	if loadErr != nil {
+		return Transcription{}, loadErr
+	}
+
+	return transcriber.Transcribe(ctx, audio, opts)
+}
+
+// audioFileToWav returns the contents of filename as WAV bytes.
+//
+// A file whose extension is ".wav" (compared case-insensitively) is read and
+// returned unchanged. Any other file is converted by running the "ffmpeg"
+// executable, which writes to a temporary file that is removed before the
+// function returns. ctx is passed to the ffmpeg command.
+//
+// An error is returned for an empty filename, when a file cannot be read or
+// created, or when ffmpeg fails (its combined output is included in the
+// message).
+func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
+	if filename == "" {
+		return nil, fmt.Errorf("filename is empty")
+	}
+
+	// Already WAV by extension: hand back the bytes without invoking ffmpeg.
+	if ext := filepath.Ext(filename); strings.EqualFold(ext, ".wav") {
+		raw, readErr := os.ReadFile(filename)
+		if readErr != nil {
+			return nil, fmt.Errorf("read wav file: %w", readErr)
+		}
+		return raw, nil
+	}
+
+	// Reserve a unique output path. The handle is closed right away because
+	// ffmpeg opens the path itself ("-y" lets it overwrite the empty file).
+	scratch, err := os.CreateTemp("", "go-llm-audio-*.wav")
+	if err != nil {
+		return nil, fmt.Errorf("create temp wav file: %w", err)
+	}
+	scratchPath := scratch.Name()
+	_ = scratch.Close()
+	defer os.Remove(scratchPath)
+
+	ffmpegArgs := []string{
+		"-hide_banner", "-loglevel", "error", "-y",
+		"-i", filename,
+		"-vn", "-f", "wav",
+		scratchPath,
+	}
+	combined, err := exec.CommandContext(ctx, "ffmpeg", ffmpegArgs...).CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(combined)))
+	}
+
+	converted, err := os.ReadFile(scratchPath)
+	if err != nil {
+		return nil, fmt.Errorf("read converted wav file: %w", err)
+	}
+
+	return converted, nil
+}