sync of changes

This commit is contained in:
2024-11-09 19:50:14 -05:00
parent cc7b03c614
commit a83d5f9822
9 changed files with 491 additions and 95 deletions

View File

@@ -0,0 +1,87 @@
package extractor
import (
"answer/pkg/cache"
"context"
"errors"
)
// ErrFailedToExtract is the sentinel error for an extractor that could not
// produce an article. multiExtractor.Extract skips errors that match it (via
// errors.Is) and returns it when no extractor succeeded and no other error was
// collected.
var ErrFailedToExtract = errors.New("failed to extract")
// Article is the result returned by an Extractor.
type Article struct {
// URL is the address the extractors in this package were asked to extract.
URL string
// Title is the title reported for the page.
Title string
// Body is the extracted text of the page.
Body string
}
// Extractor is an interface of systems that can extract the contents of a URL
// and return them as an Article.
type Extractor interface {
// Extract returns the Article found at url, or an error when extraction fails.
Extract(ctx context.Context, url string) (Article, error)
}
// multiExtractor chains several extractors behind the Extractor interface.
type multiExtractor struct {
// extractors are tried by Extract in slice order.
extractors []Extractor
}
// Compile-time check that multiExtractor implements Extractor.
var _ Extractor = multiExtractor{}
// Extract tries each configured extractor in order and returns the first
// successful result.
//
// An extractor that reports ErrFailedToExtract is skipped silently. Any other
// error is remembered; if no extractor succeeds, the remembered errors are
// returned joined together, or ErrFailedToExtract when there were none.
//
// The context is checked before every attempt so that a cancelled or expired
// context stops the chain instead of invoking the remaining extractors. In
// that case the context error is returned joined with the errors collected so
// far, so errors.Is(err, context.Canceled) and friends keep working.
func (m multiExtractor) Extract(ctx context.Context, url string) (Article, error) {
	var errs []error

	for _, e := range m.extractors {
		// Stop early: the remaining extractors may be expensive and the caller
		// no longer wants the result.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return Article{}, errors.Join(append(errs, ctxErr)...)
		}

		article, err := e.Extract(ctx, url)
		switch {
		case err == nil:
			return article, nil
		case errors.Is(err, ErrFailedToExtract):
			// This extractor had nothing to offer; fall through to the next one.
		default:
			errs = append(errs, err)
		}
	}

	if len(errs) > 0 {
		return Article{}, errors.Join(errs...)
	}
	return Article{}, ErrFailedToExtract
}
// MultiExtractor returns an Extractor that tries each of the given extractors
// in the order they were passed and returns the first successful result.
//
// The arguments are copied, so a caller that passes a slice with e... can
// modify that slice afterwards without affecting the returned Extractor. Nil
// entries are dropped because calling Extract on them would panic.
func MultiExtractor(e ...Extractor) Extractor {
	extractors := make([]Extractor, 0, len(e))
	for _, ex := range e {
		if ex != nil {
			extractors = append(extractors, ex)
		}
	}
	return multiExtractor{extractors: extractors}
}
// CacheExtractor wraps another Extractor and keeps its results in a cache so
// that a URL that was already extracted can be answered without extracting it
// again.
type CacheExtractor struct {
// Cache is the store Extract reads from and writes to through its GetJSON and
// SetJSON methods.
Cache cache.Cache
// Tag is the prefix of the cache key. Extract uses "defaultextractor:" when
// it is empty.
Tag string
// Extractor is the wrapped extractor, called when the cache lookup fails.
Extractor Extractor
}
// Compile-time check that CacheExtractor implements Extractor.
var _ Extractor = CacheExtractor{}
// Extract returns the cached article for url when the cache has one, and
// otherwise delegates to the wrapped Extractor and stores what it returned.
//
// The cache key is "<Tag>:<url>". An empty Tag is replaced by
// "defaultextractor:", which together with the separator yields a double
// colon in the key; this is kept as is so existing cache entries stay valid.
//
// Any error from the cache lookup is treated as a miss. An error from the
// wrapped extractor, or from storing the fresh article, is returned together
// with a zero Article.
func (c CacheExtractor) Extract(ctx context.Context, url string) (Article, error) {
	prefix := "defaultextractor:"
	if c.Tag != "" {
		prefix = c.Tag
	}
	cacheKey := prefix + ":" + url

	var cached Article
	if lookupErr := c.Cache.GetJSON(cacheKey, &cached); lookupErr == nil {
		return cached, nil
	}

	fresh, err := c.Extractor.Extract(ctx, url)
	if err != nil {
		return Article{}, err
	}

	if storeErr := c.Cache.SetJSON(cacheKey, fresh); storeErr != nil {
		return Article{}, storeErr
	}
	return fresh, nil
}

25
pkg/extractor/goose.go Normal file
View File

@@ -0,0 +1,25 @@
package extractor
import (
"context"
goose "github.com/advancedlogic/GoOse"
)
// GooseExtractor is an extractor backed by the GoOse library. It has no
// configuration; the zero value is ready to use.
type GooseExtractor struct{}
// Extract fetches url with GoOse and maps the result onto an Article: Body is
// GoOse's cleaned text and Title is the title GoOse reports.
//
// GoOse's ExtractFromURL takes no context, so the fetch itself cannot be
// interrupted. The context is therefore checked once before the fetch starts,
// and its error is returned if it is already cancelled or expired.
//
// On error the returned Article carries only the URL.
func (GooseExtractor) Extract(ctx context.Context, url string) (Article, error) {
	res := Article{URL: url}

	// Avoid a network round trip the caller no longer wants.
	if err := ctx.Err(); err != nil {
		return res, err
	}

	extracted, err := goose.New().ExtractFromURL(url)
	if err != nil {
		return res, err
	}

	res.Title = extracted.Title
	res.Body = extracted.CleanedText
	return res, nil
}

View File

@@ -0,0 +1,81 @@
package extractor
import (
"context"
"fmt"
"github.com/playwright-community/playwright-go"
"os"
)
// PlaywrightExtractor is an extractor that loads the page in a Chromium
// browser driven by Playwright and runs Readability.js on it. It has no
// configuration; the zero value is ready to use.
type PlaywrightExtractor struct{}
// Compile-time check that PlaywrightExtractor implements Extractor.
var _ Extractor = PlaywrightExtractor{}
// getReadabilityJS loads the Readability script from the file
// "readability.js" and returns its contents as a string. The path is relative,
// so it is resolved against the process working directory.
//
// A read error is returned unchanged together with an empty string.
func getReadabilityJS() (string, error) {
	script, err := os.ReadFile("readability.js")
	if err == nil {
		return string(script), nil
	}
	return "", err
}
// Extract loads url in a Chromium instance started through Playwright, injects
// Readability.js into the page and returns the text content Readability
// extracts as the Article body, together with the page title.
//
// The Playwright calls used here take no context, so the context is only
// checked once up front: if it is already cancelled or expired its error is
// returned and no browser is started.
//
// Every failing step returns an error that wraps the underlying one (usable
// with errors.Is / errors.As) and names the step. On error the returned
// Article carries only the URL.
func (p PlaywrightExtractor) Extract(ctx context.Context, url string) (Article, error) {
	article := Article{URL: url}

	if err := ctx.Err(); err != nil {
		return article, err
	}

	// Load the script before anything else: it is cheap, and a missing file
	// should not cost a browser launch and a page load first.
	readabilityJS, err := getReadabilityJS()
	if err != nil {
		return article, fmt.Errorf("loading readability.js: %w", err)
	}

	pw, err := playwright.Run()
	if err != nil {
		return article, fmt.Errorf("starting playwright: %w", err)
	}
	defer pw.Stop()

	browser, err := pw.Chromium.Launch()
	if err != nil {
		return article, fmt.Errorf("launching chromium: %w", err)
	}
	defer browser.Close()

	page, err := browser.NewPage()
	if err != nil {
		return article, fmt.Errorf("opening page: %w", err)
	}
	defer page.Close()

	if _, err = page.Goto(url); err != nil {
		return article, fmt.Errorf("navigating to %q: %w", url, err)
	}

	// Inject Readability.js
	_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
		Content: &readabilityJS,
	})
	if err != nil {
		return article, fmt.Errorf("injecting readability.js: %w", err)
	}

	// Run Readability and get the article content
	content, err := page.Evaluate(`() => {
		let article = new Readability(document).parse();
		return article ? article.textContent : null;
	}`)
	if err != nil {
		return article, fmt.Errorf("running readability: %w", err)
	}

	switch text := content.(type) {
	case string:
		article.Body = text
	case nil:
		// The script returns null when Readability could not parse an article.
		// NOTE(review): consider returning ErrFailedToExtract here so that a
		// MultiExtractor falls through to the next extractor.
		return article, fmt.Errorf("readability found no article content at %q", url)
	default:
		return article, fmt.Errorf("failed to convert content to string: got %T", content)
	}

	// The title is best-effort: failing to read it does not invalidate the
	// extracted body, so the error is deliberately not propagated.
	if title, titleErr := page.Title(); titleErr == nil {
		article.Title = title
	}
	return article, nil
}