2024-12-07 03:53:46 -05:00
|
|
|
package extractor
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
2024-12-26 22:20:07 -05:00
|
|
|
"io"
|
2024-12-07 03:53:46 -05:00
|
|
|
"log/slog"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/playwright-community/playwright-go"
|
|
|
|
)
|
|
|
|
|
|
|
|
type playWrightBrowser struct {
|
2024-12-17 23:16:13 -05:00
|
|
|
pw *playwright.Playwright
|
2024-12-07 03:53:46 -05:00
|
|
|
browser playwright.Browser
|
|
|
|
ctx playwright.BrowserContext
|
|
|
|
userAgent string
|
|
|
|
timeout time.Duration
|
|
|
|
cookieJar CookieJar
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compile-time assertion that a playWrightBrowser value satisfies the Browser interface.
var _ Browser = playWrightBrowser{}
|
|
|
|
|
|
|
|
// PlayWrightBrowserSelection names the browser engine NewPlayWrightBrowser should launch.
// The accepted values are the PlayWrightBrowserSelection* constants.
type PlayWrightBrowserSelection string
|
|
|
|
|
|
|
|
// Sentinel errors reported by the Playwright browser. Compare with errors.Is.
var (
	// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
	// requested browser is not one of the known selections.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")

	// ErrPageNotFound is returned when a page responds with HTTP status 404.
	ErrPageNotFound = errors.New("page not found")

	// ErrInvalidStatusCode is wrapped, together with the status code, when a
	// page responds with a status other than 200 or 404.
	ErrInvalidStatusCode = errors.New("invalid status code")
)
|
|
|
|
|
|
|
|
const (
|
|
|
|
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
|
|
|
|
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
|
|
|
|
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
|
|
|
)
|
|
|
|
|
|
|
|
type PlayWrightBrowserOptions struct {
|
2024-12-23 03:18:50 -05:00
|
|
|
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
2024-12-09 13:51:00 -05:00
|
|
|
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
2024-12-07 03:53:46 -05:00
|
|
|
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
|
|
|
|
|
|
|
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
|
|
|
// browser into the cookie jar for each request.
|
|
|
|
CookieJar
|
2024-12-23 03:18:50 -05:00
|
|
|
|
|
|
|
ShowBrowser bool // If false, browser will be headless
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
|
|
|
return playwright.OptionalCookie{
|
|
|
|
Name: cookie.Name,
|
|
|
|
Value: cookie.Value,
|
2024-12-23 03:18:50 -05:00
|
|
|
Domain: playwright.String(cookie.Host),
|
2024-12-07 03:53:46 -05:00
|
|
|
Path: playwright.String(cookie.Path),
|
|
|
|
Expires: playwright.Float(float64(cookie.Expires.Unix())),
|
|
|
|
HttpOnly: playwright.Bool(cookie.HttpOnly),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
|
|
|
return Cookie{
|
|
|
|
Name: cookie.Name,
|
|
|
|
Value: cookie.Value,
|
2024-12-23 03:18:50 -05:00
|
|
|
Host: cookie.Domain,
|
2024-12-07 03:53:46 -05:00
|
|
|
Path: cookie.Path,
|
|
|
|
Expires: time.Unix(int64(cookie.Expires), 0),
|
|
|
|
HttpOnly: cookie.HttpOnly,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
|
|
|
var thirtySeconds = 30 * time.Second
|
|
|
|
opt := PlayWrightBrowserOptions{
|
2024-12-23 03:18:50 -05:00
|
|
|
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
2024-12-09 13:51:00 -05:00
|
|
|
Browser: PlayWrightBrowserSelectionFirefox,
|
2024-12-07 03:53:46 -05:00
|
|
|
Timeout: &thirtySeconds,
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, o := range opts {
|
|
|
|
if o.UserAgent != "" {
|
|
|
|
opt.UserAgent = o.UserAgent
|
|
|
|
}
|
|
|
|
if o.Browser != "" {
|
|
|
|
opt.Browser = o.Browser
|
|
|
|
}
|
|
|
|
if o.Timeout != nil {
|
|
|
|
opt.Timeout = o.Timeout
|
|
|
|
}
|
|
|
|
if o.CookieJar != nil {
|
|
|
|
opt.CookieJar = o.CookieJar
|
|
|
|
}
|
2024-12-23 03:18:50 -05:00
|
|
|
opt.ShowBrowser = o.ShowBrowser
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
pw, err := playwright.Run()
|
2024-12-23 03:18:50 -05:00
|
|
|
|
2024-12-07 03:53:46 -05:00
|
|
|
if err != nil {
|
2024-12-23 03:18:50 -05:00
|
|
|
err = playwright.Install()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
pw, err = playwright.Run()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
var bt playwright.BrowserType
|
|
|
|
|
|
|
|
switch opt.Browser {
|
|
|
|
case PlayWrightBrowserSelectionChromium:
|
|
|
|
bt = pw.Chromium
|
|
|
|
|
|
|
|
case PlayWrightBrowserSelectionFirefox:
|
|
|
|
bt = pw.Firefox
|
|
|
|
|
|
|
|
case PlayWrightBrowserSelectionWebKit:
|
|
|
|
bt = pw.WebKit
|
|
|
|
|
|
|
|
default:
|
|
|
|
return nil, ErrInvalidBrowserSelection
|
|
|
|
}
|
|
|
|
|
|
|
|
browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
|
2024-12-23 03:18:50 -05:00
|
|
|
Headless: playwright.Bool(!opt.ShowBrowser),
|
2024-12-07 03:53:46 -05:00
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
|
|
|
UserAgent: playwright.String(opt.UserAgent),
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if opt.CookieJar != nil {
|
|
|
|
cookies, err := opt.CookieJar.GetAll()
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
|
|
|
|
|
|
|
for i, cookie := range cookies {
|
|
|
|
pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
|
|
|
|
}
|
|
|
|
|
|
|
|
err = c.AddCookies(pwCookies)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return playWrightBrowser{
|
2024-12-17 23:16:13 -05:00
|
|
|
pw: pw,
|
2024-12-07 03:53:46 -05:00
|
|
|
browser: browser,
|
|
|
|
userAgent: opt.UserAgent,
|
|
|
|
timeout: *opt.Timeout,
|
|
|
|
cookieJar: opt.CookieJar,
|
|
|
|
ctx: c,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
2024-12-09 13:51:00 -05:00
|
|
|
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
|
|
|
|
if b.cookieJar != nil {
|
|
|
|
cookies, err := page.Context().Cookies(page.URL())
|
|
|
|
|
|
|
|
for _, cookie := range cookies {
|
|
|
|
// TODO: add support for deleting cookies from the jar which are deleted in the browser
|
|
|
|
err = b.cookieJar.Set(playwrightCookieToCookie(cookie))
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error setting cookie in cookie jar: %w", err)
|
|
|
|
}
|
|
|
|
}
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
|
|
|
|
2024-12-09 13:51:00 -05:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2024-12-23 03:18:50 -05:00
|
|
|
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
|
2024-12-07 03:53:46 -05:00
|
|
|
page, err := b.ctx.NewPage()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2024-12-23 03:18:50 -05:00
|
|
|
pwOpts := playwright.PageGotoOptions{
|
2024-12-07 03:53:46 -05:00
|
|
|
WaitUntil: playwright.WaitUntilStateLoad,
|
|
|
|
}
|
|
|
|
|
|
|
|
if b.timeout > 0 {
|
|
|
|
var ms = float64(b.timeout.Milliseconds())
|
2024-12-23 03:18:50 -05:00
|
|
|
pwOpts.Timeout = &ms
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.Referer != "" {
|
|
|
|
pwOpts.Referer = playwright.String(opts.Referer)
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
2024-12-23 03:18:50 -05:00
|
|
|
|
|
|
|
resp, err := page.Goto(target, pwOpts)
|
2024-12-07 03:53:46 -05:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2024-12-17 23:16:13 -05:00
|
|
|
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
2024-12-07 03:53:46 -05:00
|
|
|
|
|
|
|
if resp.Status() != 200 {
|
2024-12-23 03:18:50 -05:00
|
|
|
time.Sleep(999 * time.Hour * 24)
|
|
|
|
time.Sleep(25 * time.Second)
|
|
|
|
|
|
|
|
_ = page.Close()
|
|
|
|
|
|
|
|
if resp.Status() == 404 {
|
|
|
|
return nil, ErrPageNotFound
|
|
|
|
}
|
2024-12-09 13:51:00 -05:00
|
|
|
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
|
|
|
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
|
|
|
|
2024-12-09 13:51:00 -05:00
|
|
|
return page, nil
|
|
|
|
}
|
|
|
|
|
2024-12-23 03:18:50 -05:00
|
|
|
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
|
2024-12-07 03:53:46 -05:00
|
|
|
|
2024-12-23 03:18:50 -05:00
|
|
|
page, err := b.openPage(ctx, url, opts)
|
2024-12-07 03:53:46 -05:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2024-12-09 13:51:00 -05:00
|
|
|
err = b.updateCookies(ctx, page)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2024-12-07 03:53:46 -05:00
|
|
|
|
2024-12-17 23:16:13 -05:00
|
|
|
return newDocument(b.pw, b.browser, page)
|
2024-12-07 03:53:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
func (b playWrightBrowser) Close() error {
|
|
|
|
return errors.Join(
|
|
|
|
b.ctx.Close(),
|
|
|
|
b.browser.Close(),
|
|
|
|
)
|
|
|
|
}
|
2024-12-26 22:20:07 -05:00
|
|
|
|
|
|
|
// deferClose closes cl and is meant to be used in defer statements, where the
// close error cannot be returned. A nil closer is ignored. A close error is
// logged as a warning instead of being silently dropped.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	if err := cl.Close(); err != nil {
		slog.Warn("error closing resource", "error", err)
	}
}
|
|
|
|
|
|
|
|
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
|
|
|
browser, err := NewPlayWrightBrowser(PlayWrightBrowserOptions{
|
|
|
|
Timeout: &timeout,
|
|
|
|
})
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error creating browser: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
defer deferClose(browser)
|
|
|
|
|
|
|
|
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error opening page: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
defer deferClose(doc)
|
|
|
|
|
|
|
|
return doc.Screenshot()
|
|
|
|
}
|