// go-llm/v2/transcriber.go

package llm

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/go-llm/v2/provider"
)

// Transcriber abstracts a speech-to-text model implementation.
//
// It is a type alias for provider.Transcriber, so a value of either type can
// be used wherever the other is expected. TranscribeFile in this file calls
// its Transcribe method with a context, the audio bytes, and the options.
type Transcriber = provider.Transcriber

// TranscriptionResponseFormat controls the output format requested from a transcriber.
//
// It is a type alias for provider.TranscriptionResponseFormat.
type TranscriptionResponseFormat = provider.TranscriptionResponseFormat

// Response format constants re-exported from the provider package. Each one
// has the same value as the provider constant of the same name.
const (
	TranscriptionResponseFormatJSON        = provider.TranscriptionResponseFormatJSON
	TranscriptionResponseFormatVerboseJSON = provider.TranscriptionResponseFormatVerboseJSON
	TranscriptionResponseFormatText        = provider.TranscriptionResponseFormatText
	TranscriptionResponseFormatSRT         = provider.TranscriptionResponseFormatSRT
	TranscriptionResponseFormatVTT         = provider.TranscriptionResponseFormatVTT
)

// TranscriptionTimestampGranularity defines the requested timestamp detail.
//
// It is a type alias for provider.TranscriptionTimestampGranularity.
type TranscriptionTimestampGranularity = provider.TranscriptionTimestampGranularity

// Timestamp granularity constants re-exported from the provider package. Each
// one has the same value as the provider constant of the same name.
const (
	TranscriptionTimestampGranularityWord    = provider.TranscriptionTimestampGranularityWord
	TranscriptionTimestampGranularitySegment = provider.TranscriptionTimestampGranularitySegment
)

// TranscriptionOptions configures transcription behavior.
//
// It is a type alias for provider.TranscriptionOptions. TranscribeFile passes
// the value it receives to the transcriber unchanged.
type TranscriptionOptions = provider.TranscriptionOptions

// Transcription captures a normalized transcription result.
//
// It is a type alias for provider.Transcription and is the result type
// returned by TranscribeFile.
type Transcription = provider.Transcription

// TranscriptionSegment provides a coarse time-sliced transcription segment.
//
// It is a type alias for provider.TranscriptionSegment.
type TranscriptionSegment = provider.TranscriptionSegment

// TranscriptionWord provides a word-level timestamp.
//
// It is a type alias for provider.TranscriptionWord.
type TranscriptionWord = provider.TranscriptionWord

// TranscriptionTokenLogprob captures token-level log probability details.
//
// It is a type alias for provider.TranscriptionTokenLogprob.
type TranscriptionTokenLogprob = provider.TranscriptionTokenLogprob

// TranscriptionUsage captures token or duration usage details.
//
// It is a type alias for provider.TranscriptionUsage.
type TranscriptionUsage = provider.TranscriptionUsage

// TranscribeFile loads the audio stored at filename and hands it to transcriber.
//
// The file is first turned into WAV bytes by audioFileToWav: a file whose
// extension is ".wav" (in any letter case) is read as-is, and anything else is
// converted with the ffmpeg executable. The resulting bytes are passed to
// transcriber.Transcribe together with ctx and opts, and whatever that call
// returns is returned unchanged.
//
// An error and an empty Transcription are returned when transcriber is nil or
// when the audio cannot be loaded or converted.
func TranscribeFile(ctx context.Context, filename string, transcriber Transcriber, opts TranscriptionOptions) (Transcription, error) {
	empty := Transcription{}

	// Reject a missing transcriber before doing any file or ffmpeg work.
	if transcriber == nil {
		return empty, fmt.Errorf("transcriber is nil")
	}

	audio, loadErr := audioFileToWav(ctx, filename)
	if loadErr != nil {
		return empty, loadErr
	}

	return transcriber.Transcribe(ctx, audio, opts)
}

// audioFileToWav returns the contents of the audio file at filename as WAV bytes.
//
// A file whose extension is ".wav" (compared case-insensitively) is read and
// returned as-is; its contents are not inspected. Any other file is converted
// by running the "ffmpeg" executable, which writes to a temporary file that is
// removed before the function returns.
//
// An error is returned when filename is empty, when a file cannot be read,
// when the temporary file cannot be created, or when ffmpeg fails. If ctx is
// already done when ffmpeg fails, the returned error wraps ctx.Err() so that
// callers can detect cancellation or timeout with errors.Is.
func audioFileToWav(ctx context.Context, filename string) ([]byte, error) {
	if filename == "" {
		return nil, fmt.Errorf("filename is empty")
	}

	// Files that already carry a .wav extension skip the ffmpeg round trip.
	if strings.EqualFold(filepath.Ext(filename), ".wav") {
		data, err := os.ReadFile(filename)
		if err != nil {
			return nil, fmt.Errorf("read wav file: %w", err)
		}
		return data, nil
	}

	// Reserve a unique temporary path for ffmpeg to write to. Only the name is
	// needed, so the handle is closed right away and cleanup is registered
	// before anything else can fail.
	tempFile, err := os.CreateTemp("", "go-llm-audio-*.wav")
	if err != nil {
		return nil, fmt.Errorf("create temp wav file: %w", err)
	}
	tempPath := tempFile.Name()
	defer os.Remove(tempPath)
	// Best-effort close: nothing was written through this handle, and ffmpeg
	// overwrites the file by path below.
	_ = tempFile.Close()

	// -y lets ffmpeg overwrite the temp file that already exists, -vn drops
	// any video stream, and -f wav forces WAV output.
	cmd := exec.CommandContext(ctx, "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", filename, "-vn", "-f", "wav", tempPath)
	if output, err := cmd.CombinedOutput(); err != nil {
		detail := strings.TrimSpace(string(output))
		// When the context ended, the process error is typically an opaque
		// "signal: killed" (or a lookup failure); surface the context error
		// as the wrapped cause instead so errors.Is(err, context.Canceled)
		// and errors.Is(err, context.DeadlineExceeded) work for callers.
		if ctxErr := ctx.Err(); ctxErr != nil && err != ctxErr {
			return nil, fmt.Errorf("ffmpeg convert failed: %w: %v (output: %s)", ctxErr, err, detail)
		}
		return nil, fmt.Errorf("ffmpeg convert failed: %w (output: %s)", err, detail)
	}

	data, err := os.ReadFile(tempPath)
	if err != nil {
		return nil, fmt.Errorf("read converted wav file: %w", err)
	}

	return data, nil
}