Files
go-extractor/playwright.go
Steve Dudenhoeffer 3b6d864330
All checks were successful
CI / build (push) Successful in 1m18s
CI / vet (push) Successful in 1m17s
CI / test (push) Successful in 1m19s
Preserve cookie security attributes in updateCookies round-trip
Chromium's Cookies() API can lose or normalize Secure, SameSite, and
HttpOnly attributes during the AddCookies → navigate → Cookies()
round-trip. This caused cookies like cf_clearance (set with
Secure=true, SameSite=None) to be overwritten with Chromium's defaults
(Secure=false, SameSite=Lax).

Now updateCookies() looks up existing cookies in the jar first. For
cookies that already exist, only Value and Expires are updated —
security attributes are preserved from the original. New cookies from
the server are still written with all their attributes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 02:37:47 +00:00

343 lines
9.1 KiB
Go

package extractor
import (
"context"
"errors"
"fmt"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
// playWrightBrowser is the Playwright-backed implementation of Browser.
// NewBrowser fills it in from the result of initBrowser.
type playWrightBrowser struct {
// pw is the Playwright driver handle; Close stops it.
pw *playwright.Playwright
// browser is the browser instance; Close closes it.
browser playwright.Browser
// ctx is the Playwright browser context that openPage creates pages in.
// It is not a context.Context.
ctx playwright.BrowserContext
// userAgent is copied from the resolved BrowserOptions.UserAgent.
userAgent string
// timeout is the navigation timeout handed to page.Goto. When it is not
// positive, no explicit timeout is passed to Goto.
timeout time.Duration
// cookieJar, when non-nil, receives the browser's cookies after each Open.
cookieJar CookieJar
// serverAddr is copied from the resolved BrowserOptions.ServerAddress.
serverAddr string
}
// Compile-time check that playWrightBrowser satisfies the Browser interface.
var _ Browser = playWrightBrowser{}
// BrowserSelection names the browser engine to use. The accepted values are
// the BrowserChromium, BrowserFirefox and BrowserWebKit constants.
type BrowserSelection string
// Sentinel errors exposed by this package; match them with errors.Is.
var (
// ErrInvalidBrowserSelection is not raised in this part of the file;
// presumably it reports a BrowserOptions.Browser value that is not one of
// the Browser* constants (confirm against the browser initialization code).
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
// ErrPageNotFound is returned by openPage when navigation ends with HTTP 404.
ErrPageNotFound = errors.New("page not found")
// ErrInvalidStatusCode is wrapped (together with the numeric status) by
// openPage for any other status outside the 200-299 range.
ErrInvalidStatusCode = errors.New("invalid status code")
)
// Browser engines that can be selected through BrowserOptions.Browser.
const (
// BrowserChromium selects the Chromium engine.
BrowserChromium BrowserSelection = "chromium"
// BrowserFirefox selects the Firefox engine; NewBrowser uses it as its default.
BrowserFirefox BrowserSelection = "firefox"
// BrowserWebKit selects the WebKit engine.
BrowserWebKit BrowserSelection = "webkit"
)
// DefaultFirefoxUserAgent is the user-agent string used for Firefox browser instances.
// It identifies as Firefox 147 on 64-bit Windows 10.
const DefaultFirefoxUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0"
// DefaultChromiumUserAgent is the user-agent string used for Chromium browser instances.
// It identifies as Chrome 131 on 64-bit Windows 10.
const DefaultChromiumUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
// DefaultUserAgent is an alias for DefaultFirefoxUserAgent, retained for backward compatibility.
const DefaultUserAgent = DefaultFirefoxUserAgent
// Bool allocates a fresh bool holding v and returns its address. It is a
// convenience for filling optional *bool fields such as
// BrowserOptions.ShowBrowser and BrowserOptions.Stealth.
func Bool(v bool) *bool {
	p := new(bool)
	*p = v
	return p
}
// Size is a width/height pair, used for BrowserOptions.Dimensions.
// NewBrowser's default is 1920x1080; the unit is presumably pixels (confirm
// where Dimensions is consumed).
type Size struct {
Width int
Height int
}
// BrowserOptions configures the browser created by NewBrowser. NewBrowser
// hands its own defaults and the caller-supplied options to mergeOptions; the
// per-field notes below describe the resulting defaults.
type BrowserOptions struct {
UserAgent string // If empty, auto-selected based on Browser engine
Browser BrowserSelection // If unset defaults to Firefox.
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
// browser into the cookie jar for each request.
//
// This is an embedded field: it is addressed as opts.CookieJar, and the
// jar's methods are promoted onto BrowserOptions.
CookieJar
ShowBrowser *bool // If nil, defaults to false (headless). Set to ptr to override.
// Dimensions is the browser size. NewBrowser supplies 1920x1080 as its default.
Dimensions Size
// DarkMode is consumed outside this part of the file; presumably it requests
// a dark color scheme (confirm against the browser initialization code).
DarkMode bool
// ServerAddress is the address of a Playwright server to connect to.
// Defaults to the value of the environment variable PLAYWRIGHT_SERVER_ADDRESS.
ServerAddress string
// RequireServer will, if set, return an error if the connection to the
// Playwright server fails instead of falling back to a local browser launch.
RequireServer bool
// UseLocalOnly will, if set, not connect to the Playwright server, and instead launch a local browser.
UseLocalOnly bool
// LaunchArgs are additional command-line arguments passed to the browser process.
// For example: []string{"--disable-blink-features=AutomationControlled"}
LaunchArgs []string
// InitScripts are JavaScript snippets injected into every new browser context
// before any page scripts run. Useful for overriding detectable properties like
// navigator.webdriver.
InitScripts []string
// Stealth enables anti-bot-detection measures. When non-nil and true, common
// evasions are applied automatically (launch args + init scripts). When nil,
// defaults to true in NewBrowser / NewInteractiveBrowser.
Stealth *bool
}
// sameSiteToPlaywright translates this package's SameSite value into the
// matching Playwright attribute. Any value other than SameSiteStrict,
// SameSiteLax or SameSiteNone (including the empty string) yields nil.
func sameSiteToPlaywright(s SameSite) *playwright.SameSiteAttribute {
	if s == SameSiteStrict {
		return playwright.SameSiteAttributeStrict
	}
	if s == SameSiteLax {
		return playwright.SameSiteAttributeLax
	}
	if s == SameSiteNone {
		return playwright.SameSiteAttributeNone
	}
	return nil
}
// playwrightSameSiteToSameSite translates a Playwright SameSite attribute into
// this package's SameSite value. A nil pointer, or a value that matches none
// of the three known attributes, yields the empty SameSite.
func playwrightSameSiteToSameSite(s *playwright.SameSiteAttribute) SameSite {
	if s == nil {
		return ""
	}

	// Compare by value: the Playwright attributes are exposed as pointers.
	attr := *s
	if attr == *playwright.SameSiteAttributeStrict {
		return SameSiteStrict
	}
	if attr == *playwright.SameSiteAttributeLax {
		return SameSiteLax
	}
	if attr == *playwright.SameSiteAttributeNone {
		return SameSiteNone
	}
	return ""
}
// cookieToPlaywrightOptionalCookie builds the Playwright cookie description
// for a jar cookie.
//
// Expires is sent as whole Unix seconds. A cookie whose Expires is the zero
// time, or whose Unix timestamp is not positive, is sent with -1 instead.
// SameSite is left unset when the jar cookie carries no SameSite value.
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	expiry := -1.0
	if !cookie.Expires.IsZero() {
		if secs := float64(cookie.Expires.Unix()); secs > 0 {
			expiry = secs
		}
	}

	var sameSite *playwright.SameSiteAttribute
	if cookie.SameSite != "" {
		sameSite = sameSiteToPlaywright(cookie.SameSite)
	}

	return playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Host),
		Path:     playwright.String(cookie.Path),
		Expires:  playwright.Float(expiry),
		Secure:   playwright.Bool(cookie.Secure),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
		SameSite: sameSite,
	}
}
// playwrightCookieToCookie converts a cookie reported by Playwright into this
// package's Cookie.
//
// Playwright reports Expires as Unix seconds and uses -1 for session cookies.
// Non-positive values are therefore mapped to the zero time.Time rather than
// to a 1969/1970 timestamp that would make the cookie look long expired. The
// zero time is also what cookieToPlaywrightOptionalCookie treats as "no
// expiry", so session cookies survive a round trip. Fractional seconds are
// truncated.
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	var expires time.Time
	if cookie.Expires > 0 {
		expires = time.Unix(int64(cookie.Expires), 0)
	}

	return Cookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Host:     cookie.Domain,
		Path:     cookie.Path,
		Expires:  expires,
		Secure:   cookie.Secure,
		HttpOnly: cookie.HttpOnly,
		SameSite: playwrightSameSiteToSameSite(cookie.SameSite),
	}
}
// NewBrowser creates a Playwright-backed Browser.
//
// The caller's options are merged over these defaults: Firefox, a 30 second
// timeout, stealth enabled, and 1920x1080 dimensions. If ctx is already done,
// its error is returned before any browser work starts.
//
// Initialization runs in its own goroutine so that a cancelled ctx can end the
// wait. In that case ctx.Err() is returned, and a background goroutine waits
// for the late result and closes the browser if initialization succeeded.
func NewBrowser(ctx context.Context, opts ...BrowserOptions) (Browser, error) {
	defaultTimeout := 30 * time.Second
	defaults := BrowserOptions{
		Browser:    BrowserFirefox,
		Timeout:    &defaultTimeout,
		Stealth:    Bool(true),
		Dimensions: Size{Width: 1920, Height: 1080},
	}
	merged := mergeOptions(defaults, opts)

	if err := ctx.Err(); err != nil {
		return nil, err
	}

	type outcome struct {
		browser Browser
		err     error
	}

	// Buffered so the initializer can always deliver its result and exit,
	// even if nobody is receiving at that moment.
	done := make(chan outcome, 1)

	go func() {
		res, err := initBrowser(merged)
		if err != nil {
			done <- outcome{err: err}
			return
		}

		done <- outcome{
			browser: playWrightBrowser{
				pw:         res.pw,
				browser:    res.browser,
				ctx:        res.bctx,
				userAgent:  res.opt.UserAgent,
				timeout:    *res.opt.Timeout,
				cookieJar:  res.opt.CookieJar,
				serverAddr: res.opt.ServerAddress,
			},
		}
	}()

	select {
	case out := <-done:
		return out.browser, out.err
	case <-ctx.Done():
		// The caller has given up; reap the browser once it finishes starting.
		go func() {
			if late := <-done; late.err == nil && late.browser != nil {
				_ = late.browser.Close()
			}
		}()
		return nil, ctx.Err()
	}
}
// updateCookies copies the cookies the browser holds for the page's current
// URL into the cookie jar. It is a no-op when no jar is configured.
//
// Chromium's Cookies() API can lose or normalize Secure, SameSite, and
// HttpOnly during the AddCookies → navigate → Cookies() round-trip. For a
// cookie that already exists in the jar, those three attributes are therefore
// taken from the jar entry, and only the remaining fields come from the
// browser. Cookies that are new to the jar are stored exactly as reported.
//
// A cookie is matched to its jar entry by name, domain and path. The domain is
// part of the key because one URL can receive same-named cookies from several
// domains (e.g. ".example.com" and "www.example.com"), and these must not
// inherit each other's security attributes. Leading dots are ignored when
// comparing domains, so ".example.com" and "example.com" count as the same.
//
// The context parameter is currently unused. It returns a wrapped error if
// reading cookies from the browser or the jar fails, or if the jar rejects a
// cookie.
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
	if b.cookieJar == nil {
		return nil
	}

	// Use one snapshot of the URL so the browser and the jar are queried for
	// the same location.
	pageURL := page.URL()

	cookies, err := page.Context().Cookies(pageURL)
	if err != nil {
		return fmt.Errorf("error getting cookies from browser: %w", err)
	}

	existing, err := b.cookieJar.Get(pageURL)
	if err != nil {
		return fmt.Errorf("error getting existing cookies from jar: %w", err)
	}

	type cookieKey struct{ Name, Host, Path string }
	keyOf := func(c Cookie) cookieKey {
		host := c.Host
		for len(host) > 0 && host[0] == '.' {
			host = host[1:]
		}
		return cookieKey{Name: c.Name, Host: host, Path: c.Path}
	}

	existingMap := make(map[cookieKey]Cookie, len(existing))
	for _, c := range existing {
		existingMap[keyOf(c)] = c
	}

	for _, cookie := range cookies {
		// TODO: add support for deleting cookies from the jar which are deleted in the browser
		c := playwrightCookieToCookie(cookie)
		if prev, ok := existingMap[keyOf(c)]; ok {
			// Keep the jar's security attributes; see the function comment.
			c.Secure = prev.Secure
			c.HttpOnly = prev.HttpOnly
			c.SameSite = prev.SameSite
		}
		if err := b.cookieJar.Set(c); err != nil {
			return fmt.Errorf("error setting cookie in cookie jar: %w", err)
		}
	}

	return nil
}
// openPage creates a new page in the browser context and navigates it to
// target, waiting for the load event.
//
// The browser's timeout is passed to Goto only when it is positive, and
// opts.Referer only when it is non-empty. The context parameter is currently
// unused.
//
// On success the caller owns the returned page and must close it. On every
// error path the page is closed here. Errors:
//   - any error from NewPage or Goto, returned unchanged;
//   - ErrPageNotFound when the final response status is 404;
//   - ErrInvalidStatusCode (wrapped with the status) for any other status
//     outside 200-299.
//
// Goto may succeed without a response (Playwright documents this for
// about:blank and for same-URL navigations that differ only in the hash).
// There is then no status to validate and the page is returned as-is.
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
	page, err := b.ctx.NewPage()
	if err != nil {
		return nil, err
	}

	pwOpts := playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateLoad,
	}
	if b.timeout > 0 {
		ms := float64(b.timeout.Milliseconds())
		pwOpts.Timeout = &ms
	}
	if opts.Referer != "" {
		pwOpts.Referer = playwright.String(opts.Referer)
	}

	resp, err := page.Goto(target, pwOpts)
	if err != nil {
		_ = page.Close()
		return nil, err
	}

	// Guard against a nil response; calling Status() on it would panic.
	if resp == nil {
		slog.Info("opened document without a response", "url", target)
		return page, nil
	}

	status := resp.Status()
	slog.Info("opened document", "url", target, "status", status, "request", resp.Request())

	if status < 200 || status >= 300 {
		_ = page.Close()

		if status == 404 {
			return nil, ErrPageNotFound
		}

		slog.Info("invalid status code", "status", status, "request", resp.Request())
		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
	}

	return page, nil
}
// Open navigates a new page to url, copies the browser's cookies back into the
// cookie jar (when one is configured), and wraps the page in a Document.
//
// Errors from openPage are returned unchanged, including ErrPageNotFound and
// wrapped ErrInvalidStatusCode. If the cookie sync fails, the page is closed
// before the error is returned so it does not linger in the browser context.
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
	page, err := b.openPage(ctx, url, opts)
	if err != nil {
		return nil, err
	}

	if err := b.updateCookies(ctx, page); err != nil {
		// Best-effort cleanup; the sync error is what the caller needs to see.
		_ = page.Close()
		return nil, err
	}

	// NOTE(review): whether newDocument closes the page when it fails is not
	// visible from here — confirm, otherwise that path leaks the page too.
	return newDocument(b.pw, b.browser, page)
}
// Close tears the browser down in order: the browser context, then the
// browser, then the Playwright driver. All three steps always run; their
// errors are combined with errors.Join, so the result is nil only when every
// step succeeded.
func (b playWrightBrowser) Close() error {
	ctxErr := b.ctx.Close()
	browserErr := b.browser.Close()
	stopErr := b.pw.Stop()

	return errors.Join(ctxErr, browserErr, stopErr)
}
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
browser, err := NewBrowser(ctx, BrowserOptions{
Timeout: &timeout,
})
if err != nil {
return nil, fmt.Errorf("error creating browser: %w", err)
}
defer DeferClose(browser)
doc, err := browser.Open(ctx, target, OpenPageOptions{})
if err != nil {
return nil, fmt.Errorf("error opening page: %w", err)
}
defer DeferClose(doc)
return doc.Screenshot()
}