82 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			82 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package extractor
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"fmt"
 | |
| 	"github.com/playwright-community/playwright-go"
 | |
| 	"os"
 | |
| )
 | |
| 
 | |
| type PlaywrightExtractor struct {
 | |
| }
 | |
| 
 | |
| var _ Extractor = PlaywrightExtractor{}
 | |
| 
 | |
// getReadabilityJS returns the contents of the file "readability.js". The
// path is relative, so it is resolved against the process's current working
// directory.
//
// The file is read from disk on every call. On failure it returns an empty
// string and the os.ReadFile error wrapped with context; the underlying cause
// stays reachable through errors.Is and errors.As.
func getReadabilityJS() (string, error) {
	data, err := os.ReadFile("readability.js")
	if err != nil {
		return "", fmt.Errorf("reading readability script: %w", err)
	}
	return string(data), nil
}
 | |
| 
 | |
| func (p PlaywrightExtractor) Extract(_ context.Context, url string) (Article, error) {
 | |
| 	var article = Article{
 | |
| 		URL: url,
 | |
| 	}
 | |
| 	pw, err := playwright.Run()
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 	defer pw.Stop()
 | |
| 
 | |
| 	browser, err := pw.Chromium.Launch()
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 	defer browser.Close()
 | |
| 
 | |
| 	page, err := browser.NewPage()
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 	defer page.Close()
 | |
| 
 | |
| 	_, err = page.Goto(url)
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 
 | |
| 	// Inject Readability.js
 | |
| 	readabilityJS, err := getReadabilityJS()
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 
 | |
| 	_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
 | |
| 		Content: &readabilityJS,
 | |
| 	})
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 
 | |
| 	// Run Readability and get the article content
 | |
| 	content, err := page.Evaluate(`() => {
 | |
|         let article = new Readability(document).parse();
 | |
|         return article ? article.textContent : null;
 | |
|     }`)
 | |
| 	if err != nil {
 | |
| 		return article, err
 | |
| 	}
 | |
| 
 | |
| 	text, ok := content.(string)
 | |
| 	if !ok {
 | |
| 		return article, fmt.Errorf("failed to convert content to string")
 | |
| 	}
 | |
| 
 | |
| 	article.Body = text
 | |
| 	article.Title, _ = page.Title()
 | |
| 
 | |
| 	return article, nil
 | |
| }
 |