Add OpenAI-based transcriber implementation
- Introduce `openaiTranscriber`, backed by OpenAI's Whisper audio transcription API.
- Define the `Transcriber` interface and associated types (`Transcription`, `TranscriptionOptions`, segments, and words).
- Implement transcription logic supporting language, prompt, temperature, and timestamp-granularity options.
- Add an `audioFileToWav` utility that shells out to `ffmpeg` to convert audio files to WAV.
- Parse both `json` and `verbose_json` responses into the normalized result types.
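For context, a minimal usage sketch of the new API (the import path, environment variable, and file name are illustrative assumptions, not part of this commit):

package main

import (
    "context"
    "fmt"
    "log"
    "os"

    "example.com/yourmodule/llm" // placeholder import path for this package
)

func main() {
    // An empty model name falls back to whisper-1 (see OpenAITranscriber).
    t := llm.OpenAITranscriber(os.Getenv("OPENAI_API_KEY"), "")

    // Word-level timestamps require verbose_json, which is the default
    // response format for whisper-1.
    result, err := llm.TranscribeFile(context.Background(), "meeting.mp3", t, llm.TranscriptionOptions{
        TimestampGranularities: []llm.TranscriptionTimestampGranularity{
            llm.TranscriptionTimestampGranularityWord,
        },
    })
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%.1fs of audio: %s\n", result.DurationSeconds, result.Text)
}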
openai_transcriber.go (new file, 219 lines)
@@ -0,0 +1,219 @@
package llm

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "strings"

    "github.com/openai/openai-go"
    "github.com/openai/openai-go/option"
)

type openaiTranscriber struct {
    key     string
    model   string
    baseUrl string
}

var _ Transcriber = openaiTranscriber{}

// OpenAITranscriber creates a transcriber backed by OpenAI's audio models.
// If model is empty, whisper-1 is used by default.
func OpenAITranscriber(key string, model string) Transcriber {
    if strings.TrimSpace(model) == "" {
        model = "whisper-1"
    }
    return openaiTranscriber{
        key:   key,
        model: model,
    }
}

func (o openaiTranscriber) Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error) {
    if len(wav) == 0 {
        return Transcription{}, fmt.Errorf("wav data is empty")
    }

    format := opts.ResponseFormat
    if format == "" {
        if strings.HasPrefix(o.model, "gpt-4o") {
            format = TranscriptionResponseFormatJSON
        } else {
            format = TranscriptionResponseFormatVerboseJSON
        }
    }

    if format != TranscriptionResponseFormatJSON && format != TranscriptionResponseFormatVerboseJSON {
        return Transcription{}, fmt.Errorf("openai transcriber requires response_format json or verbose_json for structured output")
    }

    if len(opts.TimestampGranularities) > 0 && format != TranscriptionResponseFormatVerboseJSON {
        return Transcription{}, fmt.Errorf("timestamp granularities require response_format=verbose_json")
    }

    params := openai.AudioTranscriptionNewParams{
        File:  bytes.NewReader(wav),
        Model: openai.AudioModel(o.model),
    }

    if opts.Language != "" {
        params.Language = openai.String(opts.Language)
    }
    if opts.Prompt != "" {
        params.Prompt = openai.String(opts.Prompt)
    }
    if opts.Temperature != nil {
        params.Temperature = openai.Float(*opts.Temperature)
    }

    params.ResponseFormat = openai.AudioResponseFormat(format)

    if opts.IncludeLogprobs {
        params.Include = []openai.TranscriptionInclude{openai.TranscriptionIncludeLogprobs}
    }

    if len(opts.TimestampGranularities) > 0 {
        for _, granularity := range opts.TimestampGranularities {
            params.TimestampGranularities = append(params.TimestampGranularities, string(granularity))
        }
    }

    clientOptions := []option.RequestOption{
        option.WithAPIKey(o.key),
    }
    if o.baseUrl != "" {
        clientOptions = append(clientOptions, option.WithBaseURL(o.baseUrl))
    }

    client := openai.NewClient(clientOptions...)
    resp, err := client.Audio.Transcriptions.New(ctx, params)
    if err != nil {
        return Transcription{}, fmt.Errorf("openai transcription failed: %w", err)
    }

    return openaiTranscriptionToResult(o.model, resp), nil
}

type openaiVerboseTranscription struct {
    Text     string                 `json:"text"`
    Language string                 `json:"language"`
    Duration float64                `json:"duration"`
    Segments []openaiVerboseSegment `json:"segments"`
    Words    []openaiVerboseWord    `json:"words"`
}

type openaiVerboseSegment struct {
    ID               int                 `json:"id"`
    Start            float64             `json:"start"`
    End              float64             `json:"end"`
    Text             string              `json:"text"`
    Tokens           []int               `json:"tokens"`
    AvgLogprob       *float64            `json:"avg_logprob"`
    CompressionRatio *float64            `json:"compression_ratio"`
    NoSpeechProb     *float64            `json:"no_speech_prob"`
    Words            []openaiVerboseWord `json:"words"`
}

type openaiVerboseWord struct {
    Word  string  `json:"word"`
    Start float64 `json:"start"`
    End   float64 `json:"end"`
}

func openaiTranscriptionToResult(model string, resp *openai.Transcription) Transcription {
    result := Transcription{
        Provider: "openai",
        Model:    model,
    }
    if resp == nil {
        return result
    }

    result.Text = resp.Text
    result.RawJSON = resp.RawJSON()

    for _, logprob := range resp.Logprobs {
        result.Logprobs = append(result.Logprobs, TranscriptionTokenLogprob{
            Token:   logprob.Token,
            Bytes:   logprob.Bytes,
            Logprob: logprob.Logprob,
        })
    }

    if usage := openaiUsageToTranscriptionUsage(resp.Usage); usage.Type != "" {
        result.Usage = usage
    }

    if result.RawJSON == "" {
        return result
    }

    var verbose openaiVerboseTranscription
    if err := json.Unmarshal([]byte(result.RawJSON), &verbose); err != nil {
        return result
    }

    if verbose.Text != "" {
        result.Text = verbose.Text
    }
    result.Language = verbose.Language
    result.DurationSeconds = verbose.Duration

    for _, seg := range verbose.Segments {
        segment := TranscriptionSegment{
            ID:               seg.ID,
            Start:            seg.Start,
            End:              seg.End,
            Text:             seg.Text,
            Tokens:           append([]int(nil), seg.Tokens...),
            AvgLogprob:       seg.AvgLogprob,
            CompressionRatio: seg.CompressionRatio,
            NoSpeechProb:     seg.NoSpeechProb,
        }

        for _, word := range seg.Words {
            segment.Words = append(segment.Words, TranscriptionWord{
                Word:  word.Word,
                Start: word.Start,
                End:   word.End,
            })
        }

        result.Segments = append(result.Segments, segment)
    }

    for _, word := range verbose.Words {
        result.Words = append(result.Words, TranscriptionWord{
            Word:  word.Word,
            Start: word.Start,
            End:   word.End,
        })
    }

    return result
}

func openaiUsageToTranscriptionUsage(usage openai.TranscriptionUsageUnion) TranscriptionUsage {
    switch usage.Type {
    case "tokens":
        tokens := usage.AsTokens()
        return TranscriptionUsage{
            Type:         usage.Type,
            InputTokens:  tokens.InputTokens,
            OutputTokens: tokens.OutputTokens,
            TotalTokens:  tokens.TotalTokens,
            AudioTokens:  tokens.InputTokenDetails.AudioTokens,
            TextTokens:   tokens.InputTokenDetails.TextTokens,
        }
    case "duration":
        duration := usage.AsDuration()
        return TranscriptionUsage{
            Type:    usage.Type,
            Seconds: duration.Seconds,
        }
    default:
        return TranscriptionUsage{}
    }
}
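A note on the branch at the top of Transcribe: the model-prefix check defaults gpt-4o transcription models to plain `json`, presumably because they do not support `verbose_json`, while whisper-1 does and so gets segments and words by default. The nil-safety of the converter is cheap to pin down in-package; a minimal test sketch (the test file and function name are assumptions, not part of this commit):

package llm

import "testing"

// Hypothetical openai_transcriber_test.go pinning the nil-response path
// of openaiTranscriptionToResult.
func TestOpenaiTranscriptionToResultNilResponse(t *testing.T) {
    got := openaiTranscriptionToResult("whisper-1", nil)
    if got.Provider != "openai" || got.Model != "whisper-1" {
        t.Fatalf("unexpected provider/model: %+v", got)
    }
    if got.Text != "" || len(got.Segments) != 0 {
        t.Fatalf("expected empty transcription, got: %+v", got)
    }
}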
transcriber.go (new file, 145 lines)
@@ -0,0 +1,145 @@
package llm

import (
    "context"
    "fmt"
    "os"
    "os/exec"
    "path/filepath"
    "strings"
)

// Transcriber abstracts a speech-to-text model implementation.
type Transcriber interface {
    Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}

// TranscriptionResponseFormat controls the output format requested from a transcriber.
type TranscriptionResponseFormat string

const (
    TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
    TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
    TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
    TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
    TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
type TranscriptionTimestampGranularity string

const (
    TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
    TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// TranscriptionOptions configures transcription behavior.
type TranscriptionOptions struct {
    Language               string
    Prompt                 string
    Temperature            *float64
    ResponseFormat         TranscriptionResponseFormat
    TimestampGranularities []TranscriptionTimestampGranularity
    IncludeLogprobs        bool
}

// Transcription captures a normalized transcription result.
type Transcription struct {
    Provider        string
    Model           string
    Text            string
    Language        string
    DurationSeconds float64
    Segments        []TranscriptionSegment
    Words           []TranscriptionWord
    Logprobs        []TranscriptionTokenLogprob
    Usage           TranscriptionUsage
    RawJSON         string
}

// TranscriptionSegment provides a coarse time-sliced transcription segment.
type TranscriptionSegment struct {
    ID               int
    Start            float64
    End              float64
    Text             string
    Tokens           []int
    AvgLogprob       *float64
    CompressionRatio *float64
    NoSpeechProb     *float64
    Words            []TranscriptionWord
}

// TranscriptionWord provides a word-level timestamp.
type TranscriptionWord struct {
    Word       string
    Start      float64
    End        float64
    Confidence *float64
}

// TranscriptionTokenLogprob captures token-level log probability details.
type TranscriptionTokenLogprob struct {
    Token   string
    Bytes   []float64
    Logprob float64
}

// TranscriptionUsage captures token or duration usage details.
type TranscriptionUsage struct {
    Type         string
    InputTokens  int64
    OutputTokens int64
    TotalTokens  int64
    AudioTokens  int64
    TextTokens   int64
    Seconds      float64
}

// TranscribeFile converts an audio file to WAV and transcribes it.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
    if transcriber == nil {
        return Transcription{}, fmt.Errorf("transcriber is nil")
    }

    wav, err := audioFileToWav(ctx, filename)
    if err != nil {
        return Transcription{}, err
    }

    return transcriber.Transcribe(ctx, wav, opts)
}

func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
    if filename == "" {
        return nil, fmt.Errorf("filename is empty")
    }

    if strings.EqualFold(filepath.Ext(filename), ".wav") {
        data, err := os.ReadFile(filename)
        if err != nil {
            return nil, fmt.Errorf("read wav file: %w", err)
        }
        return data, nil
    }

    tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
    if err != nil {
        return nil, fmt.Errorf("create temp wav file: %w", err)
    }
    tempPath := tempFile.Name()
    _ = tempFile.Close()
    defer os.Remove(tempPath)

    cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
    if output, err := cmd.CombinedOutput(); err != nil {
        return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, strings.TrimSpace(string(output)))
    }

    data, err := os.ReadFile(tempPath)
    if err != nil {
        return nil, fmt.Errorf("read converted wav file: %w", err)
    }

    return data, nil
}
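Note that audioFileToWav passes anything that is not already a .wav file through ffmpeg, so ffmpeg must be available on PATH at runtime. And since Transcriber is a single-method interface, non-OpenAI backends or test fakes slot in without touching TranscribeFile; an illustrative stub (not part of this commit):

package llm

import "context"

// stubTranscriber is a hypothetical fake implementation of Transcriber,
// handy for exercising TranscribeFile without network access.
type stubTranscriber struct{ text string }

var _ Transcriber = stubTranscriber{}

// Transcribe ignores the audio and returns the canned text.
func (s stubTranscriber) Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error) {
    return Transcription{Provider: "stub", Text: s.text}, nil
}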