answer/pkg/agents/read_page.go

73 lines
1.3 KiB
Go

package agents
import (
"context"
"fmt"
"gitea.stevedudenhoeffer.com/steve/answer/pkg/cache"
"gitea.stevedudenhoeffer.com/steve/answer/pkg/extractor"
"net/url"
)
// ReadPage extracts the article found at u and hands its body, together with
// u's string form and the given questions, to ExtractKnowledge.
//
// It returns an error when extraction fails or when the extracted body is
// empty; otherwise it returns whatever ExtractKnowledge returns.
func (a Agent) ReadPage(ctx context.Context, u *url.URL, questions []string) (Knowledge, error) {
	page, err := extractArticle(ctx, u)

	// Guard clauses: bail out before asking anything about an unusable page.
	switch {
	case err != nil:
		return Knowledge{}, err
	case page.Body == "":
		return Knowledge{}, fmt.Errorf("could not extract body from page")
	}

	return a.ExtractKnowledge(ctx, page.Body, u.String(), questions)
}
// article is the package-internal view of an extracted page. extractArticle
// fills it by copying the same-named fields from the extractor's result.
type article struct {
	// All three are plain strings; an empty Body is treated by ReadPage as a
	// failed extraction.
	URL, Title, Body string
}
// extractArticle runs the goose and playwright extractors (combined through
// extractor.MultiExtractor) against u and copies the result's URL, Title and
// Body into an article.
//
// Any panic raised while extracting is recovered and reported through err with
// a "panic: " prefix, so callers only ever see an ordinary error. On failure
// the returned article is the zero value.
func extractArticle(ctx context.Context, u *url.URL) (res article, err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}

		// Keep the original error in the chain when the panic value is one;
		// otherwise just format whatever was panicked with.
		switch cause := recovered.(type) {
		case error:
			err = fmt.Errorf("panic: %w", cause)
		default:
			err = fmt.Errorf("panic: %v", cause)
		}
	}()

	// Both extractors are wrapped in a CacheExtractor backed by cache.Nop
	// (presumably a cache that stores nothing; confirm in pkg/cache).
	goose := extractor.CacheExtractor{
		Cache:     cache.Nop{},
		Tag:       "goose",
		Extractor: extractor.GooseExtractor{},
	}
	playwright := extractor.CacheExtractor{
		Cache:     cache.Nop{},
		Tag:       "playwright",
		Extractor: extractor.PlaywrightExtractor{},
	}

	// NOTE(review): presumably MultiExtractor tries these in argument order;
	// verify against its implementation.
	chain := extractor.MultiExtractor(goose, playwright)

	extracted, err := chain.Extract(ctx, u.String())
	if err != nil {
		return article{}, err
	}

	return article{
		URL:   extracted.URL,
		Title: extracted.Title,
		Body:  extracted.Body,
	}, nil
}