package extractor

import (
	"context"
	"fmt"
	"os"

	"github.com/playwright-community/playwright-go"
)

// PlaywrightExtractor extracts article text by loading a page in a headless
// Chromium instance driven by Playwright and running Readability.js inside it.
// It carries no state; the zero value is ready to use.
type PlaywrightExtractor struct{}

// Compile-time check that PlaywrightExtractor satisfies Extractor.
var _ Extractor = PlaywrightExtractor{}

// getReadabilityJS returns the contents of "readability.js" as a string.
// The path is relative, so it is resolved against the process's current
// working directory.
func getReadabilityJS() (string, error) {
	data, err := os.ReadFile("readability.js")
	if err != nil {
		return "", fmt.Errorf("loading Readability script: %w", err)
	}
	return string(data), nil
}

// Extract navigates a fresh Chromium page to url, injects Readability.js and
// returns the parsed article text together with the page title.
//
// A new Playwright driver and browser are started for every call and torn
// down before returning. ctx is checked before any browser work starts and
// again after navigation; the Playwright calls themselves are not
// interruptible through ctx.
//
// On failure the returned Article has only URL set and the error describes
// the step that failed. A page that Readability cannot parse yields an error
// rather than an empty body.
func (p PlaywrightExtractor) Extract(ctx context.Context, url string) (Article, error) {
	// Evaluated in the page; yields the article text, or null when
	// Readability cannot find an article.
	const extractJS = `() => {
		let article = new Readability(document).parse();
		return article ? article.textContent : null;
	}`

	article := Article{URL: url}

	// The nil guard keeps callers that passed a nil context working, as the
	// context used to be ignored entirely.
	cancelled := func() error {
		if ctx == nil {
			return nil
		}
		return ctx.Err()
	}

	if err := cancelled(); err != nil {
		return article, err
	}

	// Load the script first: it is cheap, and failing here avoids starting a
	// browser that could never finish the job.
	readabilityJS, err := getReadabilityJS()
	if err != nil {
		return article, err
	}

	pw, err := playwright.Run()
	if err != nil {
		return article, fmt.Errorf("starting playwright: %w", err)
	}
	// Teardown errors are ignored below: by then the result (or the primary
	// error) is already determined and there is nothing useful to do.
	defer pw.Stop()

	browser, err := pw.Chromium.Launch()
	if err != nil {
		return article, fmt.Errorf("launching chromium: %w", err)
	}
	defer browser.Close()

	page, err := browser.NewPage()
	if err != nil {
		return article, fmt.Errorf("opening page: %w", err)
	}
	defer page.Close()

	if _, err = page.Goto(url); err != nil {
		return article, fmt.Errorf("navigating to %q: %w", url, err)
	}

	// Navigation is the slow step; stop here if the caller gave up meanwhile.
	if err := cancelled(); err != nil {
		return article, err
	}

	// Inject Readability.js
	_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
		Content: &readabilityJS,
	})
	if err != nil {
		return article, fmt.Errorf("injecting Readability script: %w", err)
	}

	// Run Readability and get the article content
	content, err := page.Evaluate(extractJS)
	if err != nil {
		return article, fmt.Errorf("running Readability: %w", err)
	}
	if content == nil {
		// The script returned null: Readability found nothing to extract.
		return article, fmt.Errorf("no readable article content found at %q", url)
	}
	text, ok := content.(string)
	if !ok {
		return article, fmt.Errorf("unexpected Readability result type %T, want string", content)
	}
	article.Body = text

	// The title is best-effort: a failure to read it must not discard a body
	// that was extracted successfully, so Title is simply left empty.
	if title, titleErr := page.Title(); titleErr == nil {
		article.Title = title
	}

	return article, nil
}