sync of changes
This commit is contained in:
87
pkg/extractor/extractor.go
Normal file
87
pkg/extractor/extractor.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"answer/pkg/cache"
|
||||
"context"
|
||||
"errors"
|
||||
)
|
||||
|
||||
// ErrFailedToExtract reports that no content could be extracted from a URL.
// The multi-extractor in this package skips extractors that fail with it and
// returns it when no extractor succeeded and no other error was recorded.
var ErrFailedToExtract = errors.New("failed to extract")
|
||||
|
||||
// Article is the result of extracting a web page.
type Article struct {
	// URL is the address the article was requested from.
	URL string
	// Title is the title reported for the page.
	Title string
	// Body is the extracted text content of the page.
	Body string
}
|
||||
|
||||
// Extractor is an interface of systems that can extract the contents of
|
||||
type Extractor interface {
|
||||
Extract(ctx context.Context, url string) (Article, error)
|
||||
}
|
||||
|
||||
type multiExtractor struct {
|
||||
extractors []Extractor
|
||||
}
|
||||
|
||||
var _ Extractor = multiExtractor{}
|
||||
|
||||
// Extract will try to extract the contents of a URL using all the extractors, and return the first successful result.
|
||||
func (m multiExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
||||
var errs []error
|
||||
for _, e := range m.extractors {
|
||||
article, err := e.Extract(ctx, url)
|
||||
if err == nil {
|
||||
return article, nil
|
||||
}
|
||||
|
||||
if errors.Is(err, ErrFailedToExtract) {
|
||||
continue
|
||||
}
|
||||
|
||||
errs = append(errs, err)
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return Article{}, errors.Join(errs...)
|
||||
}
|
||||
return Article{}, ErrFailedToExtract
|
||||
}
|
||||
|
||||
func MultiExtractor(e ...Extractor) Extractor {
|
||||
return multiExtractor{extractors: e}
|
||||
}
|
||||
|
||||
type CacheExtractor struct {
|
||||
Cache cache.Cache
|
||||
Tag string
|
||||
Extractor Extractor
|
||||
}
|
||||
|
||||
var _ Extractor = CacheExtractor{}
|
||||
|
||||
func (c CacheExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
||||
tag := c.Tag
|
||||
if tag == "" {
|
||||
tag = "defaultextractor:"
|
||||
}
|
||||
key := tag + ":" + url
|
||||
|
||||
var article Article
|
||||
|
||||
err := c.Cache.GetJSON(key, &article)
|
||||
if err == nil {
|
||||
return article, nil
|
||||
}
|
||||
|
||||
article, err = c.Extractor.Extract(ctx, url)
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
err = c.Cache.SetJSON(key, article)
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
return article, nil
|
||||
}
|
25
pkg/extractor/goose.go
Normal file
25
pkg/extractor/goose.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
goose "github.com/advancedlogic/GoOse"
|
||||
)
|
||||
|
||||
type GooseExtractor struct {
|
||||
}
|
||||
|
||||
func (GooseExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
||||
var res = Article{
|
||||
URL: url,
|
||||
}
|
||||
g := goose.New()
|
||||
|
||||
article, err := g.ExtractFromURL(url)
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
res.Body = article.CleanedText
|
||||
res.Title = article.Title
|
||||
return res, nil
|
||||
}
|
81
pkg/extractor/playwright.go
Normal file
81
pkg/extractor/playwright.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"os"
|
||||
)
|
||||
|
||||
type PlaywrightExtractor struct {
|
||||
}
|
||||
|
||||
var _ Extractor = PlaywrightExtractor{}
|
||||
|
||||
// getReadabilityJS returns the contents of the file "readability.js", resolved
// relative to the process's current working directory. If the file cannot be
// read, the error is returned with an empty string.
func getReadabilityJS() (string, error) {
	raw, readErr := os.ReadFile("readability.js")
	if readErr == nil {
		return string(raw), nil
	}
	return "", readErr
}
|
||||
|
||||
func (p PlaywrightExtractor) Extract(_ context.Context, url string) (Article, error) {
|
||||
var article = Article{
|
||||
URL: url,
|
||||
}
|
||||
pw, err := playwright.Run()
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
defer pw.Stop()
|
||||
|
||||
browser, err := pw.Chromium.Launch()
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
defer browser.Close()
|
||||
|
||||
page, err := browser.NewPage()
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
defer page.Close()
|
||||
|
||||
_, err = page.Goto(url)
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
|
||||
// Inject Readability.js
|
||||
readabilityJS, err := getReadabilityJS()
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
|
||||
_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
|
||||
Content: &readabilityJS,
|
||||
})
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
|
||||
// Run Readability and get the article content
|
||||
content, err := page.Evaluate(`() => {
|
||||
let article = new Readability(document).parse();
|
||||
return article ? article.textContent : null;
|
||||
}`)
|
||||
if err != nil {
|
||||
return article, err
|
||||
}
|
||||
|
||||
text, ok := content.(string)
|
||||
if !ok {
|
||||
return article, fmt.Errorf("failed to convert content to string")
|
||||
}
|
||||
|
||||
article.Body = text
|
||||
article.Title, _ = page.Title()
|
||||
|
||||
return article, nil
|
||||
}
|
Reference in New Issue
Block a user