go-llm/transcriber.go

package llm

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

// Transcriber abstracts a speech-to-text model implementation.
//
// TranscribeFile in this file is one caller: it loads an audio file into
// memory and passes the bytes as the wav argument.
type Transcriber interface {
	// Transcribe turns the audio bytes in wav into a Transcription, configured
	// by opts. How ctx and the individual options are honored is up to the
	// implementation.
	Transcribe(ctx context.Context, wav []byte, opts TranscriptionOptions) (Transcription, error)
}

// TranscriptionResponseFormat controls the output format requested from a transcriber.
//
// It is a plain string type, so values other than the constants below can be
// constructed; nothing in this file validates them.
type TranscriptionResponseFormat string

// Known response formats.
//
// NOTE(review): the string values look like the response_format values used by
// OpenAI-style transcription APIs ("srt" and "vtt" presumably being subtitle
// formats) — confirm against the provider implementations.
const (
	TranscriptionResponseFormatJSON        TranscriptionResponseFormat = "json"
	TranscriptionResponseFormatVerboseJSON TranscriptionResponseFormat = "verbose_json"
	TranscriptionResponseFormatText        TranscriptionResponseFormat = "text"
	TranscriptionResponseFormatSRT         TranscriptionResponseFormat = "srt"
	TranscriptionResponseFormatVTT         TranscriptionResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
//
// It is a plain string type; nothing in this file validates its value.
type TranscriptionTimestampGranularity string

// Known timestamp granularities. Presumably "word" asks for per-word
// timestamps (see TranscriptionWord) and "segment" for per-segment timestamps
// (see TranscriptionSegment) — confirm against the provider implementations.
const (
	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// TranscriptionOptions configures transcription behavior.
//
// The struct is passed unchanged to Transcriber.Transcribe; this file does not
// interpret or validate any field, so the exact meaning of each option is
// defined by the Transcriber implementation.
type TranscriptionOptions struct {
	// Language is a hint for the spoken language.
	// NOTE(review): expected format (e.g. ISO-639-1 code) is not visible here — confirm.
	Language string
	// Prompt is optional text handed to the transcriber alongside the audio.
	Prompt string
	// Temperature is a pointer, presumably so that "unset" (nil) can be told
	// apart from an explicit 0 — confirm with implementations.
	Temperature *float64
	// ResponseFormat selects the requested output format.
	ResponseFormat TranscriptionResponseFormat
	// TimestampGranularities lists the timestamp detail levels to request.
	TimestampGranularities []TranscriptionTimestampGranularity
	// IncludeLogprobs requests token log probabilities (see Transcription.Logprobs).
	IncludeLogprobs bool
}

// Transcription captures a normalized transcription result.
//
// Values are produced by Transcriber implementations; which fields are filled
// in depends on the implementation and is not determined by this file.
type Transcription struct {
	// Provider and Model identify what produced the result.
	Provider string
	Model    string
	// Text is the transcribed text.
	Text string
	// Language is the language associated with the transcription.
	// NOTE(review): whether this is detected or echoed from the request is not
	// visible here — confirm.
	Language string
	// DurationSeconds is a duration in seconds, presumably of the input audio.
	DurationSeconds float64
	// Segments, Words, and Logprobs carry optional finer-grained detail.
	Segments []TranscriptionSegment
	Words    []TranscriptionWord
	Logprobs []TranscriptionTokenLogprob
	// Usage reports token or duration usage for the request.
	Usage TranscriptionUsage
	// RawJSON is presumably the provider's unmodified JSON response — confirm.
	RawJSON string
}

// TranscriptionSegment provides a coarse time-sliced transcription segment.
//
// NOTE(review): Start and End are presumably offsets in seconds from the start
// of the audio (cf. Transcription.DurationSeconds) — the unit is not stated in
// this file; confirm.
type TranscriptionSegment struct {
	// ID identifies the segment.
	ID int
	// Start and End bound the segment in time.
	Start float64
	End   float64
	// Text is the text transcribed for this segment.
	Text string
	// Tokens holds integer token values for the segment, presumably model
	// token IDs — confirm.
	Tokens []int
	// AvgLogprob, CompressionRatio, and NoSpeechProb are optional statistics;
	// being pointers, they can be nil (presumably when the provider omits them).
	AvgLogprob       *float64
	CompressionRatio *float64
	NoSpeechProb     *float64
	// Words holds word-level entries belonging to this segment.
	Words []TranscriptionWord
}

// TranscriptionWord provides a word-level timestamp.
//
// NOTE(review): Start and End are presumably offsets in seconds from the start
// of the audio — the unit is not stated in this file; confirm.
type TranscriptionWord struct {
	// Word is the transcribed word.
	Word string
	// Start and End bound the word in time.
	Start float64
	End   float64
	// Confidence is optional; being a pointer, it can be nil (presumably when
	// the provider reports no confidence for the word).
	Confidence *float64
}

// TranscriptionTokenLogprob captures token-level log probability details.
type TranscriptionTokenLogprob struct {
	// Token is the token text.
	Token string
	// Bytes is presumably the token's byte values.
	// NOTE(review): the element type is float64 rather than byte/int —
	// possibly mirroring how JSON numbers decode; confirm this is intentional.
	Bytes []float64
	// Logprob is the log probability reported for the token.
	Logprob float64
}

// TranscriptionUsage captures token or duration usage details.
type TranscriptionUsage struct {
	// Type presumably distinguishes token-based from duration-based usage;
	// the set of valid values is not defined in this file — confirm.
	Type string
	// Token counters. NOTE(review): how AudioTokens and TextTokens relate to
	// InputTokens/TotalTokens is provider-defined and not visible here.
	InputTokens  int64
	OutputTokens int64
	TotalTokens  int64
	AudioTokens  int64
	TextTokens   int64
	// Seconds is the usage expressed as a duration in seconds.
	Seconds float64
}

// TranscribeFile loads the audio file at filename as WAV bytes (via
// audioFileToWav) and passes them, together with ctx and opts, to transcriber.
//
// It returns an error without touching the file when transcriber is nil, and
// returns the loading error unchanged when the audio cannot be obtained.
// Otherwise the result and error come straight from transcriber.Transcribe.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
	var none Transcription

	if transcriber == nil {
		return none, fmt.Errorf("transcriber is nil")
	}

	audio, loadErr := audioFileToWav(ctx, filename)
	if loadErr != nil {
		return none, loadErr
	}

	result, transcribeErr := transcriber.Transcribe(ctx, audio, opts)
	return result, transcribeErr
}

// audioFileToWav returns the contents of filename as WAV bytes.
//
// A file whose extension is ".wav" (compared case-insensitively) is read and
// returned as-is; its contents are not validated or re-encoded. Any other file
// is converted by running the external "ffmpeg" program, which writes to a
// temporary file that is removed again before this function returns.
//
// An error is returned when filename is empty, when the file cannot be read,
// when the temporary file cannot be prepared, or when ffmpeg fails. If ctx is
// already done when ffmpeg fails, the returned error wraps ctx.Err() so that
// callers can detect cancellation or timeout with errors.Is.
func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
	if filename == "" {
		return nil, fmt.Errorf("filename is empty")
	}

	// Already WAV by extension: no conversion, no ffmpeg dependency.
	if strings.EqualFold(filepath.Ext(filename), ".wav") {
		data, err := os.ReadFile(filename)
		if err != nil {
			return nil, fmt.Errorf("read wav file: %w", err)
		}
		return data, nil
	}

	// Reserve a unique output path for ffmpeg.
	// NOTE(review): presumably a real file is used instead of a pipe so ffmpeg
	// can seek back and finalize the WAV header — confirm.
	tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
	if err != nil {
		return nil, fmt.Errorf("create temp wav file: %w", err)
	}
	tempPath := tempFile.Name()
	// Best-effort cleanup: a failed removal must not mask the real result.
	defer os.Remove(tempPath)

	// Only the path is needed; release the handle before ffmpeg overwrites the file.
	if err := tempFile.Close(); err != nil {
		return nil, fmt.Errorf("close temp wav file: %w", err)
	}

	// -y overwrites the placeholder created above, -vn drops any video stream,
	// and -f wav forces WAV output.
	cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
	if output, err := cmd.CombinedOutput(); err != nil {
		detail := strings.TrimSpace(string(output))
		// When the context kills ffmpeg, err is only the process exit status
		// (e.g. "signal: killed"); surface the context error for errors.Is.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, fmt.Errorf("ffmpeg convert interrupted: %w (cause: %v, output: %s)", ctxErr, err, detail)
		}
		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, detail)
	}

	data, err := os.ReadFile(tempPath)
	if err != nil {
		return nil, fmt.Errorf("read converted wav file: %w", err)
	}

	return data, nil
}