Introduced a new tool for extracting YouTube video transcripts and leveraging them to answer questions. Updated `SearchAndRead` to support reading YouTube transcripts and regular pages distinctly. Included relevant dependencies for handling subtitles and video downloads.
105 lines
2.3 KiB
Go
105 lines
2.3 KiB
Go
package agents
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"github.com/asticode/go-astisub"
|
|
"github.com/lrstanley/go-ytdlp"
|
|
"io"
|
|
"log/slog"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
)
|
|
|
|
func init() {
|
|
ytdlp.MustInstall(context.Background(), nil)
|
|
}
|
|
|
|
func (a Agent) ReadYouTubeTranscript(ctx context.Context, u *url.URL, questions []string) (Knowledge, error) {
|
|
dlp := ytdlp.New()
|
|
|
|
tmpDir, err := os.MkdirTemp("", "mort-ytdlp-")
|
|
if err != nil {
|
|
return Knowledge{}, fmt.Errorf("error creating temp dir: %w", err)
|
|
}
|
|
|
|
slog.Info("created temp dir", "path", tmpDir)
|
|
defer func(path string) {
|
|
err := os.RemoveAll(path)
|
|
if err != nil {
|
|
slog.Error("error removing temp file", "error", err)
|
|
}
|
|
}(tmpDir)
|
|
|
|
subFile := filepath.Join(tmpDir, "subs")
|
|
dlp.
|
|
SkipDownload().
|
|
WriteAutoSubs().
|
|
Output(subFile)
|
|
|
|
res, err := dlp.Run(ctx, u.String())
|
|
if err != nil {
|
|
return Knowledge{}, fmt.Errorf("error running yt-dlp: %w", err)
|
|
}
|
|
|
|
if res == nil {
|
|
return Knowledge{}, fmt.Errorf("yt-dlp returned nil")
|
|
}
|
|
|
|
if res.ExitCode != 0 {
|
|
return Knowledge{}, fmt.Errorf("yt-dlp exited with code %d", res.ExitCode)
|
|
}
|
|
|
|
// the transcript for this video now _should_ be at tmpDir/subs.en.vtt, however if it's not then just fine any
|
|
// vtt file in the directory
|
|
vttFile := filepath.Join(tmpDir, "subs.en.vtt")
|
|
|
|
_, err = os.Stat(vttFile)
|
|
if os.IsNotExist(err) {
|
|
vttFile = ""
|
|
files, err := os.ReadDir(tmpDir)
|
|
if err != nil {
|
|
return Knowledge{}, fmt.Errorf("error reading directory: %w", err)
|
|
}
|
|
|
|
for _, file := range files {
|
|
if filepath.Ext(file.Name()) == ".vtt" {
|
|
vttFile = filepath.Join(tmpDir, file.Name())
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if vttFile == "" {
|
|
return Knowledge{}, fmt.Errorf("no vtt file found")
|
|
}
|
|
|
|
fp, err := os.Open(vttFile)
|
|
defer func(cl io.Closer) {
|
|
err := cl.Close()
|
|
if err != nil {
|
|
slog.Error("error closing file", "error", err)
|
|
}
|
|
}(fp)
|
|
if err != nil {
|
|
return Knowledge{}, fmt.Errorf("error opening vtt file: %w", err)
|
|
}
|
|
|
|
subs, err := astisub.ReadFromWebVTT(fp)
|
|
if err != nil {
|
|
return Knowledge{}, fmt.Errorf("error reading vtt file: %w", err)
|
|
}
|
|
|
|
if len(subs.Items) == 0 {
|
|
return Knowledge{}, fmt.Errorf("no subtitles found")
|
|
}
|
|
|
|
var ts string
|
|
for _, item := range subs.Items {
|
|
ts += item.String() + "\n"
|
|
}
|
|
|
|
return a.ExtractKnowledge(ctx, ts, u.String(), questions)
|
|
}
|