NewBrowser previously had no viewport (strong headless signal) and used a Firefox User-Agent unconditionally, even for Chromium instances (detectable mismatch). Add per-engine UA constants (DefaultFirefoxUserAgent, DefaultChromiumUserAgent) and auto-select the matching UA in initBrowser when the caller hasn't set one explicitly. Keep DefaultUserAgent as a backward-compatible alias. Add 1920x1080 default viewport to NewBrowser (most common desktop resolution). NewInteractiveBrowser keeps its existing 1280x720 default but also gains engine-aware UA selection. Closes #70 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
305 lines
7.8 KiB
Go
305 lines
7.8 KiB
Go
package extractor
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"github.com/playwright-community/playwright-go"
|
|
)
|
|
|
|
type playWrightBrowser struct {
|
|
pw *playwright.Playwright
|
|
browser playwright.Browser
|
|
ctx playwright.BrowserContext
|
|
userAgent string
|
|
timeout time.Duration
|
|
cookieJar CookieJar
|
|
serverAddr string
|
|
}
|
|
|
|
// Compile-time assertion that playWrightBrowser satisfies the Browser interface.
var _ Browser = playWrightBrowser{}
|
|
|
|
// BrowserSelection names the browser engine to use. The recognised values are
// the Browser* constants declared in this file.
type BrowserSelection string
|
|
|
|
// Sentinel errors exposed by this package; match them with errors.Is.
var (
	// ErrInvalidBrowserSelection is the sentinel for an unrecognised
	// BrowserSelection value.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")

	// ErrPageNotFound is returned when a navigation answers with HTTP 404.
	ErrPageNotFound = errors.New("page not found")

	// ErrInvalidStatusCode is wrapped (together with the numeric status) when
	// a navigation answers with a non-2xx status other than 404.
	ErrInvalidStatusCode = errors.New("invalid status code")
)
|
|
|
|
const (
|
|
BrowserChromium BrowserSelection = "chromium"
|
|
BrowserFirefox BrowserSelection = "firefox"
|
|
BrowserWebKit BrowserSelection = "webkit"
|
|
)
|
|
|
|
// Default user-agent strings, one per browser engine.
const (
	// DefaultFirefoxUserAgent is the user-agent string used for Firefox
	// browser instances.
	DefaultFirefoxUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0"

	// DefaultChromiumUserAgent is the user-agent string used for Chromium
	// browser instances.
	DefaultChromiumUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"

	// DefaultUserAgent is an alias for DefaultFirefoxUserAgent, retained for
	// backward compatibility.
	DefaultUserAgent = DefaultFirefoxUserAgent
)
|
|
|
|
// Bool returns a pointer to a freshly allocated copy of v. It is a convenience
// for filling optional *bool fields such as BrowserOptions.ShowBrowser.
func Bool(v bool) *bool {
	p := new(bool)
	*p = v
	return p
}
|
|
|
|
// Size is a width/height pair. BrowserOptions.Dimensions uses it; NewBrowser
// defaults that field to 1920x1080.
type Size struct {
	Width, Height int
}
|
|
type BrowserOptions struct {
|
|
UserAgent string // If empty, auto-selected based on Browser engine
|
|
Browser BrowserSelection // If unset defaults to Firefox.
|
|
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
|
|
|
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
|
// browser into the cookie jar for each request.
|
|
CookieJar
|
|
|
|
ShowBrowser *bool // If nil, defaults to false (headless). Set to ptr to override.
|
|
|
|
Dimensions Size
|
|
DarkMode bool
|
|
|
|
// ServerAddress is the address of a Playwright server to connect to.
|
|
// Defaults to the value of the environment variable PLAYWRIGHT_SERVER_ADDRESS.
|
|
ServerAddress string
|
|
|
|
// RequireServer will, if set, return an error if the connection to the
|
|
// Playwright server fails instead of falling back to a local browser launch.
|
|
RequireServer bool
|
|
|
|
// UseLocalOnly will, if set, not connect to the Playwright server, and instead launch a local browser.
|
|
UseLocalOnly bool
|
|
|
|
// LaunchArgs are additional command-line arguments passed to the browser process.
|
|
// For example: []string{"--disable-blink-features=AutomationControlled"}
|
|
LaunchArgs []string
|
|
|
|
// InitScripts are JavaScript snippets injected into every new browser context
|
|
// before any page scripts run. Useful for overriding detectable properties like
|
|
// navigator.webdriver.
|
|
InitScripts []string
|
|
|
|
// Stealth enables anti-bot-detection measures. When non-nil and true, common
|
|
// evasions are applied automatically (launch args + init scripts). When nil,
|
|
// defaults to true in NewBrowser / NewInteractiveBrowser.
|
|
Stealth *bool
|
|
}
|
|
|
|
func sameSiteToPlaywright(s SameSite) *playwright.SameSiteAttribute {
|
|
switch s {
|
|
case SameSiteStrict:
|
|
return playwright.SameSiteAttributeStrict
|
|
case SameSiteLax:
|
|
return playwright.SameSiteAttributeLax
|
|
case SameSiteNone:
|
|
return playwright.SameSiteAttributeNone
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func playwrightSameSiteToSameSite(s *playwright.SameSiteAttribute) SameSite {
|
|
if s == nil {
|
|
return ""
|
|
}
|
|
switch *s {
|
|
case *playwright.SameSiteAttributeStrict:
|
|
return SameSiteStrict
|
|
case *playwright.SameSiteAttributeLax:
|
|
return SameSiteLax
|
|
case *playwright.SameSiteAttributeNone:
|
|
return SameSiteNone
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
|
oc := playwright.OptionalCookie{
|
|
Name: cookie.Name,
|
|
Value: cookie.Value,
|
|
Domain: playwright.String(cookie.Host),
|
|
Path: playwright.String(cookie.Path),
|
|
Expires: playwright.Float(float64(cookie.Expires.Unix())),
|
|
HttpOnly: playwright.Bool(cookie.HttpOnly),
|
|
}
|
|
if cookie.SameSite != "" {
|
|
oc.SameSite = sameSiteToPlaywright(cookie.SameSite)
|
|
}
|
|
return oc
|
|
}
|
|
|
|
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
|
return Cookie{
|
|
Name: cookie.Name,
|
|
Value: cookie.Value,
|
|
Host: cookie.Domain,
|
|
Path: cookie.Path,
|
|
Expires: time.Unix(int64(cookie.Expires), 0),
|
|
HttpOnly: cookie.HttpOnly,
|
|
SameSite: playwrightSameSiteToSameSite(cookie.SameSite),
|
|
}
|
|
}
|
|
|
|
func NewBrowser(ctx context.Context, opts ...BrowserOptions) (Browser, error) {
|
|
var thirtySeconds = 30 * time.Second
|
|
opt := mergeOptions(BrowserOptions{
|
|
Browser: BrowserFirefox,
|
|
Timeout: &thirtySeconds,
|
|
Stealth: Bool(true),
|
|
Dimensions: Size{
|
|
Width: 1920,
|
|
Height: 1080,
|
|
},
|
|
}, opts)
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
type browserResult struct {
|
|
browser Browser
|
|
err error
|
|
}
|
|
|
|
resultCh := make(chan browserResult, 1)
|
|
|
|
go func() {
|
|
res, err := initBrowser(opt)
|
|
if err != nil {
|
|
resultCh <- browserResult{nil, err}
|
|
return
|
|
}
|
|
|
|
resultCh <- browserResult{
|
|
browser: playWrightBrowser{
|
|
pw: res.pw,
|
|
browser: res.browser,
|
|
userAgent: res.opt.UserAgent,
|
|
timeout: *res.opt.Timeout,
|
|
cookieJar: res.opt.CookieJar,
|
|
ctx: res.bctx,
|
|
serverAddr: res.opt.ServerAddress,
|
|
},
|
|
}
|
|
}()
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
case result := <-resultCh:
|
|
return result.browser, result.err
|
|
}
|
|
}
|
|
|
|
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
|
|
if b.cookieJar != nil {
|
|
cookies, err := page.Context().Cookies(page.URL())
|
|
if err != nil {
|
|
return fmt.Errorf("error getting cookies from browser: %w", err)
|
|
}
|
|
|
|
for _, cookie := range cookies {
|
|
// TODO: add support for deleting cookies from the jar which are deleted in the browser
|
|
err = b.cookieJar.Set(playwrightCookieToCookie(cookie))
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("error setting cookie in cookie jar: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
|
|
page, err := b.ctx.NewPage()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pwOpts := playwright.PageGotoOptions{
|
|
WaitUntil: playwright.WaitUntilStateLoad,
|
|
}
|
|
|
|
if b.timeout > 0 {
|
|
var ms = float64(b.timeout.Milliseconds())
|
|
pwOpts.Timeout = &ms
|
|
}
|
|
|
|
if opts.Referer != "" {
|
|
pwOpts.Referer = playwright.String(opts.Referer)
|
|
}
|
|
|
|
resp, err := page.Goto(target, pwOpts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
|
|
|
if resp.Status() < 200 || resp.Status() >= 300 {
|
|
_ = page.Close()
|
|
|
|
if resp.Status() == 404 {
|
|
return nil, ErrPageNotFound
|
|
}
|
|
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
|
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
|
|
}
|
|
|
|
return page, nil
|
|
}
|
|
|
|
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
|
|
|
|
page, err := b.openPage(ctx, url, opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
err = b.updateCookies(ctx, page)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return newDocument(b.pw, b.browser, page)
|
|
}
|
|
|
|
func (b playWrightBrowser) Close() error {
|
|
return errors.Join(
|
|
b.browser.Close(),
|
|
b.ctx.Close(),
|
|
b.pw.Stop(),
|
|
)
|
|
}
|
|
|
|
|
|
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
|
browser, err := NewBrowser(ctx, BrowserOptions{
|
|
Timeout: &timeout,
|
|
})
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error creating browser: %w", err)
|
|
}
|
|
|
|
defer DeferClose(browser)
|
|
|
|
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error opening page: %w", err)
|
|
}
|
|
|
|
defer DeferClose(doc)
|
|
|
|
return doc.Screenshot()
|
|
}
|