Add transcription API to v2 module

Migrate speech-to-text transcription types and the OpenAI transcriber implementation from v1. Types are defined in provider/ to avoid import cycles and are re-exported via type aliases from the root package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 20:24:20 -05:00
parent 9d6d2c61c3
commit 9e288954f2
3 changed files with 420 additions and 0 deletions
@@ -0,0 +1,230 @@
 package openai
 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"strings"
 	"github.com/openai/openai-go"
 	"github.com/openai/openai-go/option"
 	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
 )
 // Transcriber implements the provider.Transcriber interface using OpenAI's audio models.
 // Its fields are unexported; NewTranscriber and NewTranscriberWithBaseURL populate them.
 type Transcriber struct {
 	// key is the API key handed to the OpenAI client on each Transcribe call.
 	key     string
 	// model is the audio model name sent with each request.
 	model   string
 	// baseURL optionally overrides the API base URL; when empty no override is applied.
 	baseURL string
 }
 // Compile-time check that *Transcriber satisfies provider.Transcriber.
 var _ provider.Transcriber = (*Transcriber)(nil)
 // NewTranscriber creates a transcriber backed by OpenAI's audio models.
 //
 // key is the API key used for requests. model is the audio model name; leading
 // and trailing whitespace is removed, and if nothing remains "whisper-1" is used.
 // The returned transcriber has no base URL override; use
 // NewTranscriberWithBaseURL to set one.
 func NewTranscriber(key string, model string) *Transcriber {
 	// Store the trimmed name: it is sent to the API verbatim and is also used
 	// for prefix-based model detection, so stray whitespace would break both.
 	model = strings.TrimSpace(model)
 	if model == "" {
 		model = "whisper-1"
 	}
 	return &Transcriber{
 		key:   key,
 		model: model,
 	}
 }
 // NewTranscriberWithBaseURL builds a transcriber exactly as NewTranscriber does
 // (including its default model handling) and then records baseURL as the API
 // base URL to use for requests. An empty baseURL leaves the default in place.
 func NewTranscriberWithBaseURL(key, model, baseURL string) *Transcriber {
 	transcriber := NewTranscriber(key, model)
 	transcriber.baseURL = baseURL
 	return transcriber
 }
 // Transcribe uploads WAV audio to OpenAI's transcription endpoint and returns
 // the normalized result.
 //
 // When opts.ResponseFormat is empty, "json" is chosen for models whose name
 // starts with "gpt-4o" and "verbose_json" for every other model. Only those two
 // formats are accepted, and timestamp granularities are accepted only together
 // with "verbose_json". Language, Prompt, and Temperature are forwarded only when
 // set. A new API client is built on every call from the stored key and the
 // optional base URL.
 //
 // It returns an error when wav is empty, when the options fail the checks
 // above, or when the API request fails (the underlying error is wrapped).
 func (t *Transcriber) Transcribe(ctx context.Context, wav []byte, opts provider.TranscriptionOptions) (provider.Transcription, error) {
 	var none provider.Transcription
 	if len(wav) == 0 {
 		return none, fmt.Errorf("wav data is empty")
 	}

 	// Resolve the response format: the caller's choice wins, otherwise pick a
 	// default based on the model family.
 	responseFormat := opts.ResponseFormat
 	switch {
 	case responseFormat != "":
 		// Explicitly requested by the caller; validated below.
 	case strings.HasPrefix(t.model, "gpt-4o"):
 		responseFormat = provider.TranscriptionResponseFormatJSON
 	default:
 		responseFormat = provider.TranscriptionResponseFormatVerboseJSON
 	}

 	verbose := responseFormat == provider.TranscriptionResponseFormatVerboseJSON
 	if !verbose && responseFormat != provider.TranscriptionResponseFormatJSON {
 		return none, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
 	}
 	if !verbose && len(opts.TimestampGranularities) > 0 {
 		return none, fmt.Errorf("timestamp granularities require response_format=verbose_json")
 	}

 	// Assemble the request, attaching optional fields only when they were set.
 	req := openai.AudioTranscriptionNewParams{
 		File:           openai.File(bytes.NewReader(wav), "audio.wav", "audio/wav"),
 		Model:          openai.AudioModel(t.model),
 		ResponseFormat: openai.AudioResponseFormat(responseFormat),
 	}
 	if lang := opts.Language; lang != "" {
 		req.Language = openai.String(lang)
 	}
 	if prompt := opts.Prompt; prompt != "" {
 		req.Prompt = openai.String(prompt)
 	}
 	if temp := opts.Temperature; temp != nil {
 		req.Temperature = openai.Float(*temp)
 	}
 	if opts.IncludeLogprobs {
 		req.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
 	}
 	for _, g := range opts.TimestampGranularities {
 		req.TimestampGranularities = append(req.TimestampGranularities, string(g))
 	}

 	requestOpts := []option.RequestOption{option.WithAPIKey(t.key)}
 	if t.baseURL != "" {
 		requestOpts = append(requestOpts, option.WithBaseURL(t.baseURL))
 	}
 	client := openai.NewClient(requestOpts...)

 	apiResp, err := client.Audio.Transcriptions.New(ctx, req)
 	if err != nil {
 		return none, fmt.Errorf("openai transcription failed: %w", err)
 	}
 	return transcriptionToResult(t.model, apiResp), nil
 }
 // verboseTranscription is the decoding target for the raw JSON body of a
 // transcription response. transcriptionToResult unmarshals into it to pick up
 // fields (language, duration, segments, words) beyond the plain text.
 // Keys absent from the response leave the corresponding field at its zero value.
 type verboseTranscription struct {
 	Text     string           `json:"text"`
 	Language string           `json:"language"`
 	Duration float64          `json:"duration"`
 	Segments []verboseSegment `json:"segments"`
 	Words    []verboseWord    `json:"words"`
 }
 // verboseSegment is one entry of the "segments" array in the raw response JSON.
 // The statistic fields are pointers so that a missing or null value decodes as
 // nil rather than as 0.
 type verboseSegment struct {
 	ID               int           `json:"id"`
 	Start            float64       `json:"start"`
 	End              float64       `json:"end"`
 	Text             string        `json:"text"`
 	Tokens           []int         `json:"tokens"`
 	AvgLogprob       *float64      `json:"avg_logprob"`
 	CompressionRatio *float64      `json:"compression_ratio"`
 	NoSpeechProb     *float64      `json:"no_speech_prob"`
 	Words            []verboseWord `json:"words"`
 }
 // verboseWord is one word-level timestamp entry in the raw response JSON; it is
 // used both for the top-level "words" array and for "words" nested in a segment.
 type verboseWord struct {
 	Word  string  `json:"word"`
 	Start float64 `json:"start"`
 	End   float64 `json:"end"`
 }
 // transcriptionToResult converts an SDK transcription response into the
 // provider-neutral provider.Transcription.
 //
 // Provider ("openai") and Model are always set. A nil resp yields a result with
 // only those two fields. Otherwise text, raw JSON, token logprobs, and usage
 // are copied from the SDK value; the raw JSON is then decoded a second time to
 // pick up language, duration, segments, and words. If the raw JSON is empty or
 // fails to decode, the result built so far is returned unchanged (best effort,
 // no error is reported).
 func transcriptionToResult(model string, resp *openai.Transcription) provider.Transcription {
 	out := provider.Transcription{Provider: "openai", Model: model}
 	if resp == nil {
 		return out
 	}

 	out.Text = resp.Text
 	out.RawJSON = resp.RawJSON()
 	for _, lp := range resp.Logprobs {
 		entry := provider.TranscriptionTokenLogprob{
 			Token:   lp.Token,
 			Bytes:   lp.Bytes,
 			Logprob: lp.Logprob,
 		}
 		out.Logprobs = append(out.Logprobs, entry)
 	}
 	// Only record usage when the response carried a recognized usage type.
 	usage := usageToTranscriptionUsage(resp.Usage)
 	if usage.Type != "" {
 		out.Usage = usage
 	}

 	if out.RawJSON == "" {
 		return out
 	}
 	var payload verboseTranscription
 	if json.Unmarshal([]byte(out.RawJSON), &payload) != nil {
 		return out
 	}

 	// convertWords maps decoded word entries to the provider type. It returns
 	// nil for an empty input so that absent words stay a nil slice.
 	convertWords := func(in []verboseWord) []provider.TranscriptionWord {
 		var words []provider.TranscriptionWord
 		for _, w := range in {
 			words = append(words, provider.TranscriptionWord{
 				Word:  w.Word,
 				Start: w.Start,
 				End:   w.End,
 			})
 		}
 		return words
 	}

 	// Prefer the text from the raw payload when it is present.
 	if payload.Text != "" {
 		out.Text = payload.Text
 	}
 	out.Language = payload.Language
 	out.DurationSeconds = payload.Duration
 	for _, s := range payload.Segments {
 		out.Segments = append(out.Segments, provider.TranscriptionSegment{
 			ID:    s.ID,
 			Start: s.Start,
 			End:   s.End,
 			Text:  s.Text,
 			// Copy the token IDs so the result does not alias the decoded payload.
 			Tokens:           append([]int(nil), s.Tokens...),
 			AvgLogprob:       s.AvgLogprob,
 			CompressionRatio: s.CompressionRatio,
 			NoSpeechProb:     s.NoSpeechProb,
 			Words:            convertWords(s.Words),
 		})
 	}
 	out.Words = convertWords(payload.Words)
 	return out
 }
 // usageToTranscriptionUsage maps the SDK usage union onto
 // provider.TranscriptionUsage. A "tokens" usage fills the token counters, a
 // "duration" usage fills Seconds, and any other type (including an empty one)
 // yields the zero value, whose Type is empty.
 func usageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) provider.TranscriptionUsage {
 	var out provider.TranscriptionUsage
 	if usage.Type == "tokens" {
 		byTokens := usage.AsTokens()
 		out.Type = usage.Type
 		out.InputTokens = byTokens.InputTokens
 		out.OutputTokens = byTokens.OutputTokens
 		out.TotalTokens = byTokens.TotalTokens
 		out.AudioTokens = byTokens.InputTokenDetails.AudioTokens
 		out.TextTokens = byTokens.InputTokenDetails.TextTokens
 		return out
 	}
 	if usage.Type == "duration" {
 		byDuration := usage.AsDuration()
 		out.Type = usage.Type
 		out.Seconds = byDuration.Seconds
 		return out
 	}
 	return out
 }
@@ -0,0 +1,90 @@
 package provider
 import "context"
 // Transcriber abstracts a speech-to-text model implementation.
 type Transcriber interface {
 	// Transcribe converts the audio bytes in wav to text according to opts.
 	// It returns the normalized Transcription, or an error if transcription
 	// could not be performed.
 	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
 }
 // TranscriptionResponseFormat controls the output format requested from a transcriber.
 // It is a string type; each constant below holds the literal format name.
 type TranscriptionResponseFormat string
 // Response format names. Which of these a given Transcriber accepts is up to
 // that implementation.
 const (
 	// TranscriptionResponseFormatJSON is the "json" format.
 	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
 	// TranscriptionResponseFormatVerboseJSON is the "verbose_json" format.
 	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
 	// TranscriptionResponseFormatText is the "text" format.
 	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
 	// TranscriptionResponseFormatSRT is the "srt" format.
 	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
 	// TranscriptionResponseFormatVTT is the "vtt" format.
 	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
 )
 // TranscriptionTimestampGranularity defines the requested timestamp detail.
 // It is a string type; each constant below holds the literal granularity name.
 type TranscriptionTimestampGranularity string
 // Timestamp granularity names.
 const (
 	// TranscriptionTimestampGranularityWord is the "word" granularity.
 	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
 	// TranscriptionTimestampGranularitySegment is the "segment" granularity.
 	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
 )
 // TranscriptionOptions configures transcription behavior.
 // The zero value is usable: every field left at its zero value means "not set".
 type TranscriptionOptions struct {
 	// Language is an optional language hint; leave empty to omit it.
 	// NOTE(review): expected format (e.g. ISO code) depends on the provider - confirm.
 	Language               string
 	// Prompt is optional text passed to the model alongside the audio; leave empty to omit it.
 	Prompt                 string
 	// Temperature is the optional sampling temperature; nil means unset.
 	Temperature            *float64
 	// ResponseFormat is the requested output format; when empty the
 	// implementation picks its own default.
 	ResponseFormat         TranscriptionResponseFormat
 	// TimestampGranularities lists the timestamp detail levels to request; empty requests none.
 	TimestampGranularities []TranscriptionTimestampGranularity
 	// IncludeLogprobs asks for token-level log probabilities in the result.
 	IncludeLogprobs        bool
 }
 // Transcription captures a normalized transcription result.
 // Fields a provider does not report are left at their zero value.
 type Transcription struct {
 	// Provider names the backend that produced the result (for example "openai").
 	Provider        string
 	// Model is the model name the transcriber was configured with.
 	Model           string
 	// Text is the full transcribed text.
 	Text            string
 	// Language is the language reported for the audio, if any.
 	Language        string
 	// DurationSeconds is the reported audio duration in seconds, if any.
 	DurationSeconds float64
 	// Segments holds segment-level detail, if reported.
 	Segments        []TranscriptionSegment
 	// Words holds top-level word timestamps, if reported.
 	Words           []TranscriptionWord
 	// Logprobs holds token-level log probabilities, if reported.
 	Logprobs        []TranscriptionTokenLogprob
 	// Usage holds token or duration usage; its Type is empty when no usage was reported.
 	Usage           TranscriptionUsage
 	// RawJSON is the raw response body as returned by the provider, if available.
 	RawJSON         string
 }
 // TranscriptionSegment provides a coarse time-sliced transcription segment.
 type TranscriptionSegment struct {
 	// ID is the segment identifier as reported by the provider.
 	ID               int
 	// Start and End bound the segment in the audio.
 	// NOTE(review): presumably offsets in seconds - confirm against the provider.
 	Start            float64
 	End              float64
 	// Text is the transcribed text of this segment.
 	Text             string
 	// Tokens holds the token IDs reported for this segment.
 	Tokens           []int
 	// AvgLogprob, CompressionRatio, and NoSpeechProb are optional per-segment
 	// statistics; each is nil when the provider did not report it.
 	AvgLogprob       *float64
 	CompressionRatio *float64
 	NoSpeechProb     *float64
 	// Words holds word timestamps nested in this segment, if reported.
 	Words            []TranscriptionWord
 }
 // TranscriptionWord provides a word-level timestamp.
 type TranscriptionWord struct {
 	// Word is the transcribed word text.
 	Word       string
 	// Start and End bound the word in the audio.
 	// NOTE(review): presumably offsets in seconds - confirm against the provider.
 	Start      float64
 	End        float64
 	// Confidence is an optional per-word confidence; nil when not reported.
 	Confidence *float64
 }
 // TranscriptionTokenLogprob captures token-level log probability details.
 type TranscriptionTokenLogprob struct {
 	// Token is the token text.
 	Token   string
 	// Bytes holds the token's bytes as reported by the provider.
 	// NOTE(review): float64 presumably mirrors the upstream SDK's element type - confirm.
 	Bytes   []float64
 	// Logprob is the log probability of the token.
 	Logprob float64
 }
 // TranscriptionUsage captures token or duration usage details.
 // Type tells which group of fields is meaningful; the other group stays zero.
 type TranscriptionUsage struct {
 	// Type identifies the usage kind (the OpenAI transcriber sets "tokens" or
 	// "duration"); empty means no usage was reported.
 	Type         string
 	// InputTokens, OutputTokens, and TotalTokens are token counts for token-based usage.
 	InputTokens  int64
 	OutputTokens int64
 	TotalTokens  int64
 	// AudioTokens and TextTokens break the input tokens down by kind.
 	AudioTokens  int64
 	TextTokens   int64
 	// Seconds is the usage amount in seconds for duration-based usage.
 	Seconds      float64
 }
@@ -0,0 +1,100 @@
 package llm
 import (
 	"context"
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
 )
 // Transcriber abstracts a speech-to-text model implementation.
 // It is an alias for provider.Transcriber, so the two names denote the same type.
 type Transcriber = provider.Transcriber
 // TranscriptionResponseFormat controls the output format requested from a transcriber.
 // It is an alias for provider.TranscriptionResponseFormat.
 type TranscriptionResponseFormat = provider.TranscriptionResponseFormat
 // Response format values, re-exported unchanged from the provider package.
 const (
 	TranscriptionResponseFormatJSON        = provider.TranscriptionResponseFormatJSON
 	TranscriptionResponseFormatVerboseJSON = provider.TranscriptionResponseFormatVerboseJSON
 	TranscriptionResponseFormatText        = provider.TranscriptionResponseFormatText
 	TranscriptionResponseFormatSRT         = provider.TranscriptionResponseFormatSRT
 	TranscriptionResponseFormatVTT         = provider.TranscriptionResponseFormatVTT
 )
 // TranscriptionTimestampGranularity defines the requested timestamp detail.
 // It is an alias for provider.TranscriptionTimestampGranularity.
 type TranscriptionTimestampGranularity = provider.TranscriptionTimestampGranularity
 // Timestamp granularity values, re-exported unchanged from the provider package.
 const (
 	TranscriptionTimestampGranularityWord    = provider.TranscriptionTimestampGranularityWord
 	TranscriptionTimestampGranularitySegment = provider.TranscriptionTimestampGranularitySegment
 )
 // TranscriptionOptions configures transcription behavior.
 // It is an alias for provider.TranscriptionOptions.
 type TranscriptionOptions = provider.TranscriptionOptions
 // Transcription captures a normalized transcription result.
 // It is an alias for provider.Transcription.
 type Transcription = provider.Transcription
 // TranscriptionSegment provides a coarse time-sliced transcription segment.
 // It is an alias for provider.TranscriptionSegment.
 type TranscriptionSegment = provider.TranscriptionSegment
 // TranscriptionWord provides a word-level timestamp.
 // It is an alias for provider.TranscriptionWord.
 type TranscriptionWord = provider.TranscriptionWord
 // TranscriptionTokenLogprob captures token-level log probability details.
 // It is an alias for provider.TranscriptionTokenLogprob.
 type TranscriptionTokenLogprob = provider.TranscriptionTokenLogprob
 // TranscriptionUsage captures token or duration usage details.
 // It is an alias for provider.TranscriptionUsage.
 type TranscriptionUsage = provider.TranscriptionUsage
 // TranscribeFile loads the audio file at filename as WAV bytes and passes them
 // to transcriber together with opts.
 //
 // Files with a ".wav" extension are read directly; any other file is first
 // converted to WAV with ffmpeg. The nil check on transcriber happens before the
 // file is touched. Errors from loading/conversion and from the transcriber are
 // returned as-is.
 func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
 	var empty Transcription
 	if transcriber == nil {
 		return empty, fmt.Errorf("transcriber is nil")
 	}
 	audio, loadErr := audioFileToWav(ctx, filename)
 	if loadErr != nil {
 		return empty, loadErr
 	}
 	return transcriber.Transcribe(ctx, audio, opts)
 }
 // audioFileToWav returns WAV bytes for the audio file at filename.
 //
 // A file whose extension is ".wav" (compared case-insensitively) is read and
 // returned as-is, without inspecting its contents. Any other file is converted
 // by running the "ffmpeg" executable, which writes to a temporary file that is
 // removed before the function returns. ctx is attached to the ffmpeg command,
 // so cancelling it stops the conversion.
 //
 // An empty filename, a failed read, a failed temp-file creation, or a failing
 // ffmpeg run each produce an error; ffmpeg's combined output is included in the
 // conversion error message.
 func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
 	if filename == "" {
 		return nil, fmt.Errorf("filename is empty")
 	}

 	isWav := strings.EqualFold(filepath.Ext(filename), ".wav")
 	if isWav {
 		raw, readErr := os.ReadFile(filename)
 		if readErr != nil {
 			return nil, fmt.Errorf("read wav file: %w", readErr)
 		}
 		return raw, nil
 	}

 	// Reserve a unique output path for ffmpeg to write to.
 	scratch, err := os.CreateTemp("", "go-llm-audio-*.wav")
 	if err != nil {
 		return nil, fmt.Errorf("create temp wav file: %w", err)
 	}
 	outPath := scratch.Name()
 	// The handle was only needed to reserve the name; ffmpeg overwrites the file
 	// (-y), so a close error here is deliberately ignored.
 	_ = scratch.Close()
 	defer os.Remove(outPath)

 	args := []string{"-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", outPath}
 	combined, runErr := exec.CommandContext(ctx, "ffmpeg", args...).CombinedOutput()
 	if runErr != nil {
 		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", runErr, strings.TrimSpace(string(combined)))
 	}

 	converted, err := os.ReadFile(outPath)
 	if err != nil {
 		return nil, fmt.Errorf("read converted wav file: %w", err)
 	}
 	return converted, nil
 }