answer/pkg/extractor/playwright.go

82 lines
1.5 KiB
Go

package extractor
import (
"context"
"fmt"
"github.com/playwright-community/playwright-go"
"os"
)
// PlaywrightExtractor extracts article content from a web page by loading it
// in a Playwright-driven Chromium browser and running Readability.js on it.
// It carries no state, so the zero value is ready to use.
type PlaywrightExtractor struct{}
// Compile-time check that PlaywrightExtractor (by value) satisfies Extractor.
var _ Extractor = PlaywrightExtractor{}
// getReadabilityJS returns the contents of the file "readability.js" as a
// string. The path is relative, so it is resolved against the process's
// current working directory. On a read failure it returns an empty string and
// the error from os.ReadFile unchanged.
func getReadabilityJS() (string, error) {
	const scriptPath = "readability.js"

	raw, readErr := os.ReadFile(scriptPath)
	if readErr != nil {
		return "", readErr
	}

	script := string(raw)
	return script, nil
}
// Extract loads url in a Chromium instance driven by Playwright, injects
// Readability.js into the page, and returns the readable text of the page as
// the Article body together with the page title.
//
// The returned Article always has URL set, including when an error is
// returned. ctx is checked before the browser is started and again before
// navigation; a Playwright call that is already in flight is not interrupted
// by cancellation.
//
// An error is returned when ctx is done, when the Readability script cannot
// be loaded, when any Playwright step fails, or when Readability finds no
// article content on the page.
func (p PlaywrightExtractor) Extract(ctx context.Context, url string) (Article, error) {
	article := Article{URL: url}

	// Don't pay for a browser launch if the caller has already given up.
	if err := ctx.Err(); err != nil {
		return article, err
	}

	// Load the script first: a missing readability.js should fail fast
	// rather than after a browser has been launched and the page fetched.
	readabilityJS, err := getReadabilityJS()
	if err != nil {
		return article, fmt.Errorf("loading Readability script: %w", err)
	}

	pw, err := playwright.Run()
	if err != nil {
		return article, fmt.Errorf("starting playwright: %w", err)
	}
	defer pw.Stop()

	browser, err := pw.Chromium.Launch()
	if err != nil {
		return article, fmt.Errorf("launching chromium: %w", err)
	}
	defer browser.Close()

	page, err := browser.NewPage()
	if err != nil {
		return article, fmt.Errorf("opening page: %w", err)
	}
	defer page.Close()

	// Browser start-up can be slow; re-check before the network round trip.
	if err := ctx.Err(); err != nil {
		return article, err
	}

	if _, err = page.Goto(url); err != nil {
		return article, fmt.Errorf("navigating to %q: %w", url, err)
	}

	// Inject Readability.js so the Readability constructor is available to
	// the script evaluated below.
	_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
		Content: &readabilityJS,
	})
	if err != nil {
		return article, fmt.Errorf("injecting Readability script: %w", err)
	}

	// Readability mutates the document it parses, so give it a clone. This
	// keeps the live page intact for the title lookup that follows.
	content, err := page.Evaluate(`() => {
		const article = new Readability(document.cloneNode(true)).parse();
		return article ? article.textContent : null;
	}`)
	if err != nil {
		return article, fmt.Errorf("running Readability: %w", err)
	}

	text, ok := content.(string)
	if !ok {
		// The script returns null when Readability could not identify an
		// article; report that distinctly from an unexpected result type.
		if content == nil {
			return article, fmt.Errorf("no readable article content found at %q", url)
		}
		return article, fmt.Errorf("converting content to string: unexpected type %T", content)
	}
	article.Body = text

	// The title is best-effort: a failed lookup leaves it empty rather than
	// discarding an otherwise successful extraction.
	if title, titleErr := page.Title(); titleErr == nil {
		article.Title = title
	}

	return article, nil
}