This commit introduces optional viewport dimensions and dark mode support to the PlayWrightBrowserOptions struct and its usage. It ensures more control over browser display settings and improves flexibility when configuring browser contexts. Additionally, visibility checking logic in SetHidden was refined to avoid redundant operations.
311 lines
7.0 KiB
Go
311 lines
7.0 KiB
Go
package extractor
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"github.com/playwright-community/playwright-go"
|
|
)
|
|
|
|
type playWrightBrowser struct {
|
|
pw *playwright.Playwright
|
|
browser playwright.Browser
|
|
ctx playwright.BrowserContext
|
|
userAgent string
|
|
timeout time.Duration
|
|
cookieJar CookieJar
|
|
}
|
|
|
|
// Compile-time check that playWrightBrowser (as a value) satisfies Browser.
var _ Browser = playWrightBrowser{}
|
|
|
|
// PlayWrightBrowserSelection names the browser engine that
// NewPlayWrightBrowser launches.
type PlayWrightBrowserSelection string
|
|
|
|
// Sentinel errors returned by the Playwright browser. Compare with errors.Is.
var (
	// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
	// requested browser is not one of the known selections.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")
	// ErrPageNotFound is returned when a navigation answers with HTTP 404.
	ErrPageNotFound = errors.New("page not found")
	// ErrInvalidStatusCode is wrapped (together with the status code) when a
	// navigation answers with a status other than 200 or 404.
	ErrInvalidStatusCode = errors.New("invalid status code")
)
|
|
|
|
const (
|
|
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
|
|
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
|
|
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
|
)
|
|
|
|
// Size describes the viewport dimensions requested for the browser context.
type Size struct {
	// Width of the viewport.
	Width int
	// Height of the viewport.
	Height int
}
|
|
type PlayWrightBrowserOptions struct {
|
|
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
|
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
|
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
|
|
|
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
|
// browser into the cookie jar for each request.
|
|
CookieJar
|
|
|
|
ShowBrowser bool // If false, browser will be headless
|
|
|
|
Dimensions Size
|
|
DarkMode bool
|
|
}
|
|
|
|
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
|
return playwright.OptionalCookie{
|
|
Name: cookie.Name,
|
|
Value: cookie.Value,
|
|
Domain: playwright.String(cookie.Host),
|
|
Path: playwright.String(cookie.Path),
|
|
Expires: playwright.Float(float64(cookie.Expires.Unix())),
|
|
HttpOnly: playwright.Bool(cookie.HttpOnly),
|
|
}
|
|
}
|
|
|
|
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
|
return Cookie{
|
|
Name: cookie.Name,
|
|
Value: cookie.Value,
|
|
Host: cookie.Domain,
|
|
Path: cookie.Path,
|
|
Expires: time.Unix(int64(cookie.Expires), 0),
|
|
HttpOnly: cookie.HttpOnly,
|
|
}
|
|
}
|
|
|
|
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
|
var thirtySeconds = 30 * time.Second
|
|
opt := PlayWrightBrowserOptions{
|
|
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
|
Browser: PlayWrightBrowserSelectionFirefox,
|
|
Timeout: &thirtySeconds,
|
|
DarkMode: false,
|
|
}
|
|
|
|
for _, o := range opts {
|
|
if o.UserAgent != "" {
|
|
opt.UserAgent = o.UserAgent
|
|
}
|
|
if o.Browser != "" {
|
|
opt.Browser = o.Browser
|
|
}
|
|
if o.Timeout != nil {
|
|
opt.Timeout = o.Timeout
|
|
}
|
|
if o.CookieJar != nil {
|
|
opt.CookieJar = o.CookieJar
|
|
}
|
|
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
|
opt.Dimensions = o.Dimensions
|
|
}
|
|
if o.DarkMode {
|
|
opt.DarkMode = true
|
|
}
|
|
|
|
opt.ShowBrowser = o.ShowBrowser
|
|
}
|
|
|
|
pw, err := playwright.Run()
|
|
|
|
if err != nil {
|
|
err = playwright.Install()
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pw, err = playwright.Run()
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
var bt playwright.BrowserType
|
|
|
|
switch opt.Browser {
|
|
case PlayWrightBrowserSelectionChromium:
|
|
bt = pw.Chromium
|
|
|
|
case PlayWrightBrowserSelectionFirefox:
|
|
bt = pw.Firefox
|
|
|
|
case PlayWrightBrowserSelectionWebKit:
|
|
bt = pw.WebKit
|
|
|
|
default:
|
|
return nil, ErrInvalidBrowserSelection
|
|
}
|
|
|
|
browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
|
|
Headless: playwright.Bool(!opt.ShowBrowser),
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var viewport *playwright.Size
|
|
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
|
viewport = &playwright.Size{
|
|
Width: opt.Dimensions.Width,
|
|
Height: opt.Dimensions.Height,
|
|
}
|
|
}
|
|
|
|
var scheme *playwright.ColorScheme
|
|
|
|
if opt.DarkMode {
|
|
scheme = playwright.ColorSchemeDark
|
|
} else {
|
|
scheme = playwright.ColorSchemeNoPreference
|
|
}
|
|
|
|
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
|
UserAgent: playwright.String(opt.UserAgent),
|
|
Viewport: viewport,
|
|
ColorScheme: scheme,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if opt.CookieJar != nil {
|
|
cookies, err := opt.CookieJar.GetAll()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
|
}
|
|
|
|
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
|
|
|
for i, cookie := range cookies {
|
|
pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
|
|
}
|
|
|
|
err = c.AddCookies(pwCookies)
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
|
}
|
|
}
|
|
|
|
return playWrightBrowser{
|
|
pw: pw,
|
|
browser: browser,
|
|
userAgent: opt.UserAgent,
|
|
timeout: *opt.Timeout,
|
|
cookieJar: opt.CookieJar,
|
|
ctx: c,
|
|
}, nil
|
|
}
|
|
|
|
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
|
|
if b.cookieJar != nil {
|
|
cookies, err := page.Context().Cookies(page.URL())
|
|
|
|
for _, cookie := range cookies {
|
|
// TODO: add support for deleting cookies from the jar which are deleted in the browser
|
|
err = b.cookieJar.Set(playwrightCookieToCookie(cookie))
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("error setting cookie in cookie jar: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
|
|
page, err := b.ctx.NewPage()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pwOpts := playwright.PageGotoOptions{
|
|
WaitUntil: playwright.WaitUntilStateLoad,
|
|
}
|
|
|
|
if b.timeout > 0 {
|
|
var ms = float64(b.timeout.Milliseconds())
|
|
pwOpts.Timeout = &ms
|
|
}
|
|
|
|
if opts.Referer != "" {
|
|
pwOpts.Referer = playwright.String(opts.Referer)
|
|
}
|
|
|
|
resp, err := page.Goto(target, pwOpts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
|
|
|
if resp.Status() != 200 {
|
|
time.Sleep(999 * time.Hour * 24)
|
|
time.Sleep(25 * time.Second)
|
|
|
|
_ = page.Close()
|
|
|
|
if resp.Status() == 404 {
|
|
return nil, ErrPageNotFound
|
|
}
|
|
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
|
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
|
|
}
|
|
|
|
return page, nil
|
|
}
|
|
|
|
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
|
|
|
|
page, err := b.openPage(ctx, url, opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
err = b.updateCookies(ctx, page)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return newDocument(b.pw, b.browser, page)
|
|
}
|
|
|
|
func (b playWrightBrowser) Close() error {
|
|
return errors.Join(
|
|
b.browser.Close(),
|
|
b.ctx.Close(),
|
|
b.pw.Stop(),
|
|
)
|
|
}
|
|
|
|
// deferClose closes cl and deliberately discards the returned error. It exists
// so that a best-effort Close can be used in a defer statement without the
// ignored error being flagged at every call site.
func deferClose(cl io.Closer) {
	_ = cl.Close()
}
|
|
|
|
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
|
browser, err := NewPlayWrightBrowser(PlayWrightBrowserOptions{
|
|
Timeout: &timeout,
|
|
})
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error creating browser: %w", err)
|
|
}
|
|
|
|
defer deferClose(browser)
|
|
|
|
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error opening page: %w", err)
|
|
}
|
|
|
|
defer deferClose(doc)
|
|
|
|
return doc.Screenshot()
|
|
}
|