Files
go-extractor/browser_init.go
T
steve c3be14095a
CI / test (push) Successful in 2m16s
CI / build (push) Successful in 2m25s
CI / vet (push) Successful in 2m16s
feat: switch stealth Chromium default channel to consumer Chrome
Playwright's bundled Chromium has a distinct build fingerprint (build ID,
uniform WebGL/codec lists, HeadlessChrome residue) that anti-bot services
increasingly flag. Driving a system-installed Google Chrome via Playwright's
channel option sheds that signal and aligns sec-ch-ua with UA more cleanly.

Changes:
- Add BrowserOptions.Channel string field (chrome, chrome-beta, chromium,
  msedge; empty = default).
- When stealth+headless+Chromium and Channel is empty, default to "chrome"
  (was "chromium"). Explicit Channel values always win, so callers can opt
  back to "chromium" or pick another channel.
- Merge Channel in mergeOptions.
- Expose --channel/--ch flag on cmd/browser for A/B fingerprint testing.

Callers must have the chosen browser installed on the host
(e.g. `playwright install chrome`). Firefox and WebKit paths are untouched.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 17:26:50 +00:00

234 lines
6.3 KiB
Go

package extractor
import (
"fmt"
"log/slog"
"os"
"github.com/playwright-community/playwright-go"
)
// browserInitResult holds the result of shared browser initialization.
// It bundles every handle initBrowser creates so the caller can use and
// later release them together.
type browserInitResult struct {
// pw is the Playwright driver handle returned by playwright.Run.
pw *playwright.Playwright
// browser is the browser obtained either by connecting to a Playwright
// server or by launching a local instance.
browser playwright.Browser
// bctx is the context created on browser, with init scripts and any
// cookie-jar cookies already applied.
bctx playwright.BrowserContext
// opt is the options value as seen after initialization: ServerAddress may
// have been filled from the environment and UserAgent may have been
// auto-selected.
opt BrowserOptions
}
// initBrowser performs the shared browser initialization steps: start
// Playwright (running the installer once if the first start fails), select the
// browser type, connect to a Playwright server or launch a local browser,
// create a browser context, register init scripts, and load cookies from the
// optional cookie jar.
//
// opt is taken by value; the copy stored in the result may differ from the
// input because ServerAddress can be filled from the
// PLAYWRIGHT_SERVER_ADDRESS_* environment variables and UserAgent can be
// auto-selected after the browser is available.
//
// It returns ErrInvalidBrowserSelection when opt.Browser is not one of the
// known browsers, and a wrapped error for any failing step. On every error
// path the browser (if one was obtained) is closed and the Playwright driver is
// stopped, so a failed call does not leave processes behind. Cookies that
// Playwright rejects are skipped with a warning rather than failing the call.
func initBrowser(opt BrowserOptions) (*browserInitResult, error) {
	pw, err := playwright.Run()
	if err != nil {
		// First start failed: run the installer once and retry.
		if err = playwright.Install(); err != nil {
			return nil, fmt.Errorf("failed to install playwright: %w", err)
		}
		if pw, err = playwright.Run(); err != nil {
			return nil, fmt.Errorf("failed to start playwright: %w", err)
		}
	}

	// From here on the driver is running. Tear everything down on any error
	// return; the caller receives a nil result and could not do it itself.
	var browser playwright.Browser
	succeeded := false
	defer func() {
		if succeeded {
			return
		}
		// Best-effort cleanup: teardown errors are deliberately ignored so the
		// original failure is the one reported to the caller.
		if browser != nil {
			_ = browser.Close()
		}
		_ = pw.Stop()
	}()

	var bt playwright.BrowserType
	switch opt.Browser {
	case BrowserChromium:
		bt = pw.Chromium
		if opt.ServerAddress == "" {
			opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM")
		}
	case BrowserFirefox:
		bt = pw.Firefox
		if opt.ServerAddress == "" {
			opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_FIREFOX")
		}
	case BrowserWebKit:
		bt = pw.WebKit
		if opt.ServerAddress == "" {
			opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_WEBKIT")
		}
	default:
		return nil, ErrInvalidBrowserSelection
	}

	// User-Agent auto-selection is deferred until after browser launch so we
	// can read the real browser version and build a UA that matches the
	// sec-ch-ua header Chromium sends automatically. A mismatched version
	// (e.g. UA says Chrome/131 while sec-ch-ua says Chromium/136) is a
	// well-known bot-detection signal that causes 403s on many sites.

	// Collect launch args and init scripts, starting with any stealth-mode
	// presets. Stealth is on unless explicitly disabled.
	stealth := opt.Stealth == nil || *opt.Stealth
	var launchArgs []string
	var initScripts []string
	if stealth {
		if opt.Browser == BrowserChromium {
			launchArgs = append(launchArgs, stealthChromiumArgs...)
		}
		initScripts = append(initScripts, stealthCommonScripts...)
		switch opt.Browser {
		case BrowserChromium:
			initScripts = append(initScripts, buildChromiumStealthScripts(randomChromiumProfile())...)
		case BrowserFirefox:
			initScripts = append(initScripts, buildFirefoxStealthScripts(randomFirefoxProfile())...)
		}
	}
	// Caller-supplied values come after the presets.
	launchArgs = append(launchArgs, opt.LaunchArgs...)
	initScripts = append(initScripts, opt.InitScripts...)

	// Prefer a Playwright server when one is configured; fall back to a local
	// launch on connect failure unless the server is required.
	launch := true
	if opt.ServerAddress != "" && !opt.UseLocalOnly {
		slog.Info("connecting to playwright server", "address", opt.ServerAddress)
		var timeout float64 = 30000
		remote, connErr := bt.Connect(opt.ServerAddress, playwright.BrowserTypeConnectOptions{Timeout: &timeout})
		switch {
		case connErr == nil:
			browser = remote
			launch = false
		case opt.RequireServer:
			return nil, fmt.Errorf("failed to connect to playwright server: %w", connErr)
		default:
			slog.Warn("failed to connect to playwright server, launching local browser", "err", connErr)
		}
	}

	if launch {
		// Headless unless the caller explicitly asked to show the browser.
		headless := opt.ShowBrowser == nil || !*opt.ShowBrowser
		launchOpts := playwright.BrowserTypeLaunchOptions{
			Headless: playwright.Bool(headless),
		}
		if len(launchArgs) > 0 {
			launchOpts.Args = launchArgs
		}
		if opt.Browser == BrowserChromium {
			channel := opt.Channel
			if channel == "" && stealth && headless {
				// Real Chrome sheds Playwright's bundled-Chromium fingerprint
				// (build ID, uniform WebGL, HeadlessChrome residue) that
				// anti-bot services increasingly flag.
				channel = "chrome"
			}
			if channel != "" {
				launchOpts.Channel = playwright.String(channel)
			}
		}
		launched, err := bt.Launch(launchOpts)
		if err != nil {
			return nil, fmt.Errorf("failed to launch browser: %w", err)
		}
		browser = launched
	}

	// Auto-select User-Agent now that we know the real browser version.
	if opt.UserAgent == "" {
		switch opt.Browser {
		case BrowserChromium:
			opt.UserAgent = chromiumUserAgent(browser.Version())
		default:
			opt.UserAgent = DefaultFirefoxUserAgent
		}
	}

	// A nil viewport is passed unless both dimensions are positive.
	var viewport *playwright.Size
	if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
		viewport = &playwright.Size{
			Width:  opt.Dimensions.Width,
			Height: opt.Dimensions.Height,
		}
	}

	var scheme *playwright.ColorScheme
	if opt.DarkMode {
		scheme = playwright.ColorSchemeDark
	} else {
		scheme = playwright.ColorSchemeNoPreference
	}

	bctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
		UserAgent:   playwright.String(opt.UserAgent),
		Viewport:    viewport,
		ColorScheme: scheme,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create browser context: %w", err)
	}

	for _, script := range initScripts {
		// playwright.String gives each script its own pointer instead of
		// taking the address of the loop variable.
		if err := bctx.AddInitScript(playwright.Script{Content: playwright.String(script)}); err != nil {
			return nil, fmt.Errorf("failed to add init script: %w", err)
		}
	}

	if opt.CookieJar != nil {
		cookies, err := opt.CookieJar.GetAll()
		if err != nil {
			return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
		}
		// Cookies are added one at a time so a single rejected cookie is
		// skipped without discarding the rest.
		for _, c := range cookies {
			oc := cookieToPlaywrightOptionalCookie(c)
			if err := bctx.AddCookies([]playwright.OptionalCookie{oc}); err != nil {
				slog.Warn("skipping invalid cookie", "name", c.Name, "host", c.Host, "error", err)
			}
		}
	}

	succeeded = true
	return &browserInitResult{
		pw:      pw,
		browser: browser,
		bctx:    bctx,
		opt:     opt,
	}, nil
}
// mergeOptions merges variadic BrowserOptions into a base set of defaults and
// returns the combined value. opts are applied in order, so later entries win.
//
// Merge rules per field:
//   - UserAgent, Browser, ServerAddress, Channel: overridden when non-empty.
//   - Timeout, CookieJar, ShowBrowser, Stealth: overridden when non-nil.
//   - Dimensions: overridden only when both Width and Height are positive.
//   - DarkMode, RequireServer, UseLocalOnly: can be switched on but never off.
//   - LaunchArgs, InitScripts: appended to the base values.
//
// The returned slices never share writable backing storage with base, so
// merging repeatedly from the same defaults is safe.
func mergeOptions(base BrowserOptions, opts []BrowserOptions) BrowserOptions {
	for _, o := range opts {
		if o.UserAgent != "" {
			base.UserAgent = o.UserAgent
		}
		if o.Browser != "" {
			base.Browser = o.Browser
		}
		if o.Timeout != nil {
			base.Timeout = o.Timeout
		}
		if o.CookieJar != nil {
			base.CookieJar = o.CookieJar
		}
		if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
			base.Dimensions = o.Dimensions
		}
		if o.DarkMode {
			base.DarkMode = true
		}
		if o.ServerAddress != "" {
			base.ServerAddress = o.ServerAddress
		}
		if o.RequireServer {
			base.RequireServer = true
		}
		if o.UseLocalOnly {
			base.UseLocalOnly = true
		}
		if o.ShowBrowser != nil {
			base.ShowBrowser = o.ShowBrowser
		}
		if len(o.LaunchArgs) > 0 {
			// The full slice expression caps capacity at length, forcing append
			// to copy instead of writing into spare capacity of the caller's
			// backing array.
			n := len(base.LaunchArgs)
			base.LaunchArgs = append(base.LaunchArgs[:n:n], o.LaunchArgs...)
		}
		if len(o.InitScripts) > 0 {
			// Same copy-on-append guard as LaunchArgs.
			n := len(base.InitScripts)
			base.InitScripts = append(base.InitScripts[:n:n], o.InitScripts...)
		}
		if o.Stealth != nil {
			base.Stealth = o.Stealth
		}
		if o.Channel != "" {
			base.Channel = o.Channel
		}
	}
	return base
}