go-extractor/playwright.go
Steve Dudenhoeffer 7c0e44a22f Add viewport dimensions and dark mode support
This commit introduces optional viewport dimensions and dark mode support to the PlayWrightBrowserOptions struct and its usage. It ensures more control over browser display settings and improves flexibility when configuring browser contexts. Additionally, visibility checking logic in SetHidden was refined to avoid redundant operations.
2025-03-15 00:46:02 -04:00

311 lines
7.0 KiB
Go

package extractor
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
// playWrightBrowser bundles the Playwright driver handle, the launched
// browser and the browser context created by NewPlayWrightBrowser, together
// with the settings that were used to build them.
type playWrightBrowser struct {
// pw is the Playwright driver handle; Close stops it.
pw *playwright.Playwright
// browser is the launched browser instance.
browser playwright.Browser
// ctx is the Playwright browser context (not a context.Context) in which
// openPage creates new pages.
ctx playwright.BrowserContext
// userAgent is the user agent string the browser context was created with.
userAgent string
// timeout is the navigation timeout used when opening pages.
timeout time.Duration
// cookieJar, when non-nil, is updated with the browser's cookies after a
// page has been opened.
cookieJar CookieJar
}
// Compile-time check that playWrightBrowser satisfies the Browser interface.
var _ Browser = playWrightBrowser{}
// PlayWrightBrowserSelection names the browser engine NewPlayWrightBrowser
// should launch.
type PlayWrightBrowserSelection string
// Sentinel errors returned from this file; match them with errors.Is.
var (
// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
// requested Browser is not one of the PlayWrightBrowserSelection constants.
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
// ErrPageNotFound is returned when the opened page responds with status 404.
ErrPageNotFound = errors.New("page not found")
// ErrInvalidStatusCode is wrapped, with the status code appended, when the
// opened page responds with a status other than 200 or 404.
ErrInvalidStatusCode = errors.New("invalid status code")
)
// Browser engines accepted by NewPlayWrightBrowser.
const (
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
)
// Size is a width/height pair. NewPlayWrightBrowser uses it as the browser
// viewport size, and only when both values are greater than zero.
type Size struct {
Width int
Height int
}
// PlayWrightBrowserOptions configures NewPlayWrightBrowser. Every field is
// optional; unset fields fall back to the defaults described on each field.
type PlayWrightBrowserOptions struct {
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
// browser into the cookie jar for each request.
CookieJar
ShowBrowser bool // If false, browser will be headless
// Dimensions is the viewport size for the browser context. It is applied
// only when both Width and Height are greater than zero; otherwise no
// explicit viewport is passed to Playwright.
Dimensions Size
// DarkMode selects the dark color scheme for the browser context when true;
// when false the context is created with the "no preference" color scheme.
DarkMode bool
}
// cookieToPlaywrightOptionalCookie converts a jar Cookie into the
// playwright.OptionalCookie form passed to BrowserContext.AddCookies.
//
// A zero Expires is treated as "no expiry set" and left out of the result, so
// the browser stores a session cookie. Converting the zero time.Time with
// Unix() would yield a large negative number, and Playwright only accepts -1
// or a positive Unix timestamp for a cookie's expiry, so the whole AddCookies
// call would fail. Non-zero values, including the -1 session marker that
// round-trips through time.Unix(-1, 0), are passed through in whole seconds.
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	converted := playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Host),
		Path:     playwright.String(cookie.Path),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
	}
	if !cookie.Expires.IsZero() {
		converted.Expires = playwright.Float(float64(cookie.Expires.Unix()))
	}
	return converted
}
// playwrightCookieToCookie maps a cookie reported by Playwright onto this
// package's Cookie type. The expiry is read as seconds since the Unix epoch;
// any fractional part is truncated.
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	var out Cookie

	out.Name = cookie.Name
	out.Value = cookie.Value
	out.Host = cookie.Domain
	out.Path = cookie.Path
	out.HttpOnly = cookie.HttpOnly

	expirySeconds := int64(cookie.Expires)
	out.Expires = time.Unix(expirySeconds, 0)

	return out
}
// NewPlayWrightBrowser starts the Playwright driver, launches the selected
// browser and creates a browser context configured from opts.
//
// Options are merged over the defaults in the order given: non-empty
// UserAgent and Browser, non-nil Timeout and CookieJar, and Dimensions with
// both sides greater than zero replace the current value; DarkMode is enabled
// if any options struct enables it; ShowBrowser always takes the value of the
// last options struct supplied. Defaults are a Firefox user agent, the Firefox
// engine, a 30 second timeout, headless mode and no explicit viewport.
//
// If the driver cannot be started, playwright.Install is run and the start is
// retried once. When a CookieJar is configured, all of its cookies are loaded
// into the new browser context.
//
// It returns ErrInvalidBrowserSelection for an unknown Browser value. On any
// failure after the driver has started, everything created so far (context,
// browser, driver) is shut down before returning so nothing is leaked.
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
	defaultTimeout := 30 * time.Second
	opt := PlayWrightBrowserOptions{
		UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
		Browser:   PlayWrightBrowserSelectionFirefox,
		Timeout:   &defaultTimeout,
		DarkMode:  false,
	}

	for _, o := range opts {
		if o.UserAgent != "" {
			opt.UserAgent = o.UserAgent
		}
		if o.Browser != "" {
			opt.Browser = o.Browser
		}
		if o.Timeout != nil {
			opt.Timeout = o.Timeout
		}
		if o.CookieJar != nil {
			opt.CookieJar = o.CookieJar
		}
		if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
			opt.Dimensions = o.Dimensions
		}
		if o.DarkMode {
			opt.DarkMode = true
		}
		opt.ShowBrowser = o.ShowBrowser
	}

	pw, err := playwright.Run()
	if err != nil {
		// The first start failed; install the driver/browsers and retry once.
		if err = playwright.Install(); err != nil {
			return nil, err
		}
		if pw, err = playwright.Run(); err != nil {
			return nil, err
		}
	}

	var (
		browser playwright.Browser
		bctx    playwright.BrowserContext
	)

	// From here on the driver is running. If construction does not complete,
	// tear down whatever exists in reverse creation order. Teardown errors are
	// dropped on purpose: the caller needs the original failure.
	ready := false
	defer func() {
		if ready {
			return
		}
		if bctx != nil {
			_ = bctx.Close()
		}
		if browser != nil {
			_ = browser.Close()
		}
		_ = pw.Stop()
	}()

	var bt playwright.BrowserType
	switch opt.Browser {
	case PlayWrightBrowserSelectionChromium:
		bt = pw.Chromium
	case PlayWrightBrowserSelectionFirefox:
		bt = pw.Firefox
	case PlayWrightBrowserSelectionWebKit:
		bt = pw.WebKit
	default:
		return nil, ErrInvalidBrowserSelection
	}

	browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(!opt.ShowBrowser),
	})
	if err != nil {
		return nil, err
	}

	// A nil viewport leaves the choice to Playwright.
	var viewport *playwright.Size
	if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
		viewport = &playwright.Size{
			Width:  opt.Dimensions.Width,
			Height: opt.Dimensions.Height,
		}
	}

	scheme := playwright.ColorSchemeNoPreference
	if opt.DarkMode {
		scheme = playwright.ColorSchemeDark
	}

	bctx, err = browser.NewContext(playwright.BrowserNewContextOptions{
		UserAgent:   playwright.String(opt.UserAgent),
		Viewport:    viewport,
		ColorScheme: scheme,
	})
	if err != nil {
		return nil, err
	}

	if opt.CookieJar != nil {
		cookies, err := opt.CookieJar.GetAll()
		if err != nil {
			return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
		}

		pwCookies := make([]playwright.OptionalCookie, len(cookies))
		for i, cookie := range cookies {
			pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
		}

		if err = bctx.AddCookies(pwCookies); err != nil {
			return nil, fmt.Errorf("error adding cookies to browser: %w", err)
		}
	}

	ready = true
	return playWrightBrowser{
		pw:        pw,
		browser:   browser,
		userAgent: opt.UserAgent,
		timeout:   *opt.Timeout,
		cookieJar: opt.CookieJar,
		ctx:       bctx,
	}, nil
}
// updateCookies copies the cookies the browser holds for the page's current
// URL into the configured cookie jar. It does nothing when no jar is
// configured. The context argument is currently unused.
//
// It returns an error if the cookies cannot be read from the browser or if
// the jar rejects one of them; in the latter case cookies stored before the
// failure remain in the jar.
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
	if b.cookieJar == nil {
		return nil
	}

	cookies, err := page.Context().Cookies(page.URL())
	if err != nil {
		// Previously this error was overwritten by the loop below and never
		// reported, so a failed read looked like "no cookies".
		return fmt.Errorf("error getting cookies from browser: %w", err)
	}

	for _, cookie := range cookies {
		// TODO: add support for deleting cookies from the jar which are deleted in the browser
		if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
			return fmt.Errorf("error setting cookie in cookie jar: %w", err)
		}
	}

	return nil
}
// openPage creates a new page in the browser context, navigates it to target
// and waits for the load event. The context argument is currently unused.
//
// The browser's timeout is passed to the navigation: a positive value is the
// limit in milliseconds, and zero is passed through as 0, which Playwright
// treats as "no timeout" (matching the documented option). A negative value
// leaves Playwright's default in place. opts.Referer, when non-empty, is sent
// as the Referer header.
//
// On success the caller owns the returned page and must close it. On failure
// the page is closed here and one of the following is returned: the
// navigation error, ErrPageNotFound for status 404, or an error wrapping
// ErrInvalidStatusCode for any other status that is not 200.
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
	page, err := b.ctx.NewPage()
	if err != nil {
		return nil, err
	}

	pwOpts := playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateLoad,
	}
	if b.timeout >= 0 {
		ms := float64(b.timeout.Milliseconds())
		pwOpts.Timeout = &ms
	}
	if opts.Referer != "" {
		pwOpts.Referer = playwright.String(opts.Referer)
	}

	resp, err := page.Goto(target, pwOpts)
	if err != nil {
		// The page was created above and is not returned, so close it here.
		_ = page.Close()
		return nil, err
	}

	// Playwright documents that Goto can succeed with a nil response (for
	// example when navigating to about:blank). There is no status to check
	// in that case, so the page is returned as-is instead of dereferencing nil.
	if resp == nil {
		slog.Info("opened document", "url", target)
		return page, nil
	}

	status := resp.Status()
	slog.Info("opened document", "url", target, "status", status, "request", resp.Request())

	if status == 200 {
		return page, nil
	}

	// Anything other than 200 is a failure: release the page right away.
	_ = page.Close()

	if status == 404 {
		return nil, ErrPageNotFound
	}

	slog.Info("invalid status code", "status", status, "request", resp.Request())
	return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
}
// Open navigates a new page to url, stores the resulting browser cookies in
// the configured cookie jar (if any) and wraps the page in a Document.
//
// It returns the error from opening the page, from updating the cookie jar,
// or from newDocument. If the cookie update fails the page is closed before
// returning, because no Document exists yet that could close it.
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
	page, err := b.openPage(ctx, url, opts)
	if err != nil {
		return nil, err
	}

	if err = b.updateCookies(ctx, page); err != nil {
		_ = page.Close()
		return nil, err
	}

	// NOTE(review): whether newDocument closes the page when it fails is not
	// visible from here — confirm before adding cleanup on this path.
	return newDocument(b.pw, b.browser, page)
}
// Close shuts everything down in reverse order of creation: the browser
// context first, then the browser that owns it, then the Playwright driver.
// All three steps are always attempted; their errors are combined with
// errors.Join, so the result is nil only if every step succeeded.
func (b playWrightBrowser) Close() error {
	return errors.Join(
		// Close the context while its browser is still alive, so this call
		// does not run against an already-closed browser.
		b.ctx.Close(),
		b.browser.Close(),
		b.pw.Stop(),
	)
}
// deferClose closes cl and throws away the result. It exists so call sites
// can write `defer deferClose(x)` for best-effort cleanup where a close
// failure cannot be acted upon.
func deferClose(cl io.Closer) {
	closeErr := cl.Close()
	_ = closeErr // intentionally dropped
}
// Screenshot is a one-shot helper: it creates a browser with default options
// apart from the given timeout, opens target, and returns the result of the
// document's Screenshot method. The browser and the document are both closed
// before it returns; errors from closing them are discarded.
//
// Failures to create the browser or open the page are returned wrapped with a
// short description of the step that failed.
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
	b, createErr := NewPlayWrightBrowser(PlayWrightBrowserOptions{Timeout: &timeout})
	if createErr != nil {
		return nil, fmt.Errorf("error creating browser: %w", createErr)
	}
	defer func() { _ = b.Close() }()

	d, openErr := b.Open(ctx, target, OpenPageOptions{})
	if openErr != nil {
		return nil, fmt.Errorf("error opening page: %w", openErr)
	}
	defer func() { _ = d.Close() }()

	return d.Screenshot()
}