73 lines
1.3 KiB
Go
73 lines
1.3 KiB
Go
package agents
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"gitea.stevedudenhoeffer.com/steve/answer/pkg/cache"
|
|
"gitea.stevedudenhoeffer.com/steve/answer/pkg/extractor"
|
|
"net/url"
|
|
)
|
|
|
|
func (a Agent) ReadPage(ctx context.Context, u *url.URL, questions []string) (Knowledge, error) {
|
|
ar, err := extractArticle(ctx, u)
|
|
if err != nil {
|
|
return Knowledge{}, err
|
|
}
|
|
|
|
if ar.Body == "" {
|
|
return Knowledge{}, fmt.Errorf("could not extract body from page")
|
|
}
|
|
|
|
return a.ExtractKnowledge(ctx, ar.Body, u.String(), questions)
|
|
|
|
}
|
|
|
|
// article is the package-private record of an extracted page: its URL, its
// title, and its body text.
type article struct {
	URL, Title, Body string
}
|
|
|
|
func extractArticle(ctx context.Context, u *url.URL) (res article, err error) {
|
|
defer func() {
|
|
e := recover()
|
|
|
|
if e != nil {
|
|
if e, ok := e.(error); ok {
|
|
err = fmt.Errorf("panic: %w", e)
|
|
} else {
|
|
err = fmt.Errorf("panic: %v", e)
|
|
}
|
|
}
|
|
}()
|
|
|
|
extractors := extractor.MultiExtractor(
|
|
extractor.CacheExtractor{
|
|
Cache: cache.Nop{},
|
|
Tag: "goose",
|
|
Extractor: extractor.GooseExtractor{},
|
|
},
|
|
extractor.CacheExtractor{
|
|
Cache: cache.Nop{},
|
|
Tag: "playwright",
|
|
Extractor: extractor.PlaywrightExtractor{},
|
|
},
|
|
)
|
|
|
|
a, err := extractors.Extract(ctx, u.String())
|
|
|
|
if err != nil {
|
|
return article{
|
|
URL: "",
|
|
Title: "",
|
|
Body: "",
|
|
}, err
|
|
}
|
|
|
|
return article{
|
|
URL: a.URL,
|
|
Title: a.Title,
|
|
Body: a.Body,
|
|
}, nil
|
|
}
|