82 lines
1.5 KiB
Go
82 lines
1.5 KiB
Go
package extractor
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"github.com/playwright-community/playwright-go"
|
|
"os"
|
|
)
|
|
|
|
// PlaywrightExtractor extracts articles by rendering pages in a browser
// driven through Playwright. It carries no state, so the zero value is ready
// to use.
type PlaywrightExtractor struct{}
|
|
|
|
// Compile-time check that a PlaywrightExtractor value satisfies Extractor.
var _ Extractor = PlaywrightExtractor{}
|
|
|
|
// getReadabilityJS returns the contents of the file "readability.js" as a
// string. The path is relative, so it is resolved against the process's
// current working directory. Any error from reading the file is returned
// unchanged together with an empty string.
func getReadabilityJS() (string, error) {
	raw, readErr := os.ReadFile("readability.js")
	if readErr == nil {
		return string(raw), nil
	}
	return "", readErr
}
|
|
|
|
func (p PlaywrightExtractor) Extract(_ context.Context, url string) (Article, error) {
|
|
var article = Article{
|
|
URL: url,
|
|
}
|
|
pw, err := playwright.Run()
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
defer pw.Stop()
|
|
|
|
browser, err := pw.Chromium.Launch()
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
defer browser.Close()
|
|
|
|
page, err := browser.NewPage()
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
defer page.Close()
|
|
|
|
_, err = page.Goto(url)
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
|
|
// Inject Readability.js
|
|
readabilityJS, err := getReadabilityJS()
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
|
|
_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
|
|
Content: &readabilityJS,
|
|
})
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
|
|
// Run Readability and get the article content
|
|
content, err := page.Evaluate(`() => {
|
|
let article = new Readability(document).parse();
|
|
return article ? article.textContent : null;
|
|
}`)
|
|
if err != nil {
|
|
return article, err
|
|
}
|
|
|
|
text, ok := content.(string)
|
|
if !ok {
|
|
return article, fmt.Errorf("failed to convert content to string")
|
|
}
|
|
|
|
article.Body = text
|
|
article.Title, _ = page.Title()
|
|
|
|
return article, nil
|
|
}
|