package extractor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"time"

	"github.com/playwright-community/playwright-go"
)

// playWrightBrowser is the Playwright-backed implementation of Browser.
// It bundles the Playwright driver, the launched (or connected) browser and
// the single browser context in which all pages are opened.
type playWrightBrowser struct {
	pw         *playwright.Playwright
	browser    playwright.Browser
	ctx        playwright.BrowserContext
	userAgent  string
	timeout    time.Duration // navigation timeout; values <= 0 leave Playwright's own default in place
	cookieJar  CookieJar     // optional; when nil, cookies are not persisted
	serverAddr string
}

var _ Browser = playWrightBrowser{}

// BrowserSelection names the browser engine to use.
type BrowserSelection string

var (
	// ErrInvalidBrowserSelection is the sentinel for an unknown BrowserSelection value.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")
	// ErrPageNotFound is returned when a navigation answers with HTTP 404.
	ErrPageNotFound = errors.New("page not found")
	// ErrInvalidStatusCode is returned (wrapped, with the status appended)
	// when a navigation answers with a non-2xx status other than 404.
	ErrInvalidStatusCode = errors.New("invalid status code")
)

const (
	BrowserChromium BrowserSelection = "chromium"
	BrowserFirefox  BrowserSelection = "firefox"
	BrowserWebKit   BrowserSelection = "webkit"
)

// Size is a width/height pair.
type Size struct {
	Width  int
	Height int
}

// BrowserOptions configures NewBrowser.
type BrowserOptions struct {
	UserAgent string           // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0"
	Browser   BrowserSelection // If unset defaults to Firefox.
	Timeout   *time.Duration   // If unset defaults to 30 seconds timeout. If set to 0, no timeout

	// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
	// browser into the cookie jar for each request.
	CookieJar

	ShowBrowser bool // If false, browser will be headless

	Dimensions Size
	DarkMode   bool

	// ServerAddress is the address of a Playwright server to connect to.
	// Defaults to the value of the environment variable PLAYWRIGHT_SERVER_ADDRESS.
	ServerAddress string

	// RequireServer will, if set, return an error if the connection to the
	// Playwright server fails instead of falling back to a local browser launch.
	RequireServer bool

	// UseLocalOnly will, if set, not connect to the Playwright server, and instead launch a local browser.
	UseLocalOnly bool
}

// cookieToPlaywrightOptionalCookie converts a Cookie into the form accepted by
// Playwright when adding cookies to a browser context.
//
// A zero Expires is left unset instead of being converted: the Unix timestamp
// of the zero time is a large negative number, which Playwright rejects (it
// accepts only -1 or a positive timestamp). An unset expiry is presumably
// treated by Playwright as a session cookie.
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	converted := playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Host),
		Path:     playwright.String(cookie.Path),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
	}
	if !cookie.Expires.IsZero() {
		converted.Expires = playwright.Float(float64(cookie.Expires.Unix()))
	}
	return converted
}

// playwrightCookieToCookie converts a cookie reported by Playwright into a
// Cookie. The expiry is truncated to whole seconds.
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	return Cookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Host:     cookie.Domain,
		Path:     cookie.Path,
		Expires:  time.Unix(int64(cookie.Expires), 0),
		HttpOnly: cookie.HttpOnly,
	}
}

// NewBrowser creates a Browser using the defaults (Firefox, a Firefox
// User-Agent, 30 second timeout) merged with opts.
//
// Initialisation runs in a separate goroutine so that the call returns as soon
// as ctx is cancelled. If that happens, the browser that initialisation may
// still produce afterwards is closed in the background rather than leaked.
//
// It returns ctx.Err() when ctx is done before initialisation finishes, or the
// initialisation error otherwise.
func NewBrowser(ctx context.Context, opts ...BrowserOptions) (Browser, error) {
	var thirtySeconds = 30 * time.Second

	opt := mergeOptions(BrowserOptions{
		UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
		Browser:   BrowserFirefox,
		Timeout:   &thirtySeconds,
	}, opts)

	if err := ctx.Err(); err != nil {
		return nil, err
	}

	// The concrete type is carried (not the Browser interface) so the
	// cancellation path below can close a late result directly.
	type browserResult struct {
		browser playWrightBrowser
		err     error
	}

	// Buffered so the init goroutine never blocks on send, whichever select
	// branch wins.
	resultCh := make(chan browserResult, 1)

	go func() {
		res, err := initBrowser(opt)
		if err != nil {
			resultCh <- browserResult{err: err}
			return
		}

		resultCh <- browserResult{
			browser: playWrightBrowser{
				pw:         res.pw,
				browser:    res.browser,
				userAgent:  res.opt.UserAgent,
				timeout:    *res.opt.Timeout,
				cookieJar:  res.opt.CookieJar,
				ctx:        res.bctx,
				serverAddr: res.opt.ServerAddress,
			},
		}
	}()

	select {
	case <-ctx.Done():
		// The caller has given up, but initialisation may still succeed later.
		// Wait for it in the background and release whatever it created.
		go func() {
			late := <-resultCh
			if late.err != nil {
				return
			}
			if err := late.browser.Close(); err != nil {
				slog.Warn("error closing browser after cancelled initialisation", "error", err)
			}
		}()
		return nil, ctx.Err()
	case result := <-resultCh:
		if result.err != nil {
			return nil, result.err
		}
		return result.browser, nil
	}
}

// updateCookies copies the cookies the browser context holds for the page's
// current URL into the cookie jar. It is a no-op when no cookie jar is set.
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
	if b.cookieJar == nil {
		return nil
	}

	cookies, err := page.Context().Cookies(page.URL())
	if err != nil {
		return fmt.Errorf("error reading cookies from browser: %w", err)
	}

	for _, cookie := range cookies {
		// TODO: add support for deleting cookies from the jar which are deleted in the browser
		if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
			return fmt.Errorf("error setting cookie in cookie jar: %w", err)
		}
	}

	return nil
}

// openPage opens a new page in the browser context and navigates it to target,
// waiting for the load event.
//
// The page is closed before returning on every error path. A 404 response
// yields ErrPageNotFound; any other non-2xx status yields an error wrapping
// ErrInvalidStatusCode.
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
	page, err := b.ctx.NewPage()
	if err != nil {
		return nil, err
	}

	pwOpts := playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateLoad,
	}
	if b.timeout > 0 {
		var ms = float64(b.timeout.Milliseconds())
		pwOpts.Timeout = &ms
	}
	if opts.Referer != "" {
		pwOpts.Referer = playwright.String(opts.Referer)
	}

	resp, err := page.Goto(target, pwOpts)
	if err != nil {
		// Best-effort cleanup; the navigation error is the one worth reporting.
		_ = page.Close()
		return nil, err
	}

	// Playwright documents a nil response for navigations to about:blank or to
	// the same URL with a different hash. There is no status to validate then.
	if resp == nil {
		slog.Info("opened document", "url", target)
		return page, nil
	}

	status := resp.Status()
	slog.Info("opened document", "url", target, "status", status, "request", resp.Request())

	if status < 200 || status >= 300 {
		// Best-effort cleanup; the status error is the one worth reporting.
		_ = page.Close()

		if status == 404 {
			return nil, ErrPageNotFound
		}

		slog.Info("invalid status code", "status", status, "request", resp.Request())
		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
	}

	return page, nil
}

// Open navigates to url in a new page, stores the resulting cookies in the
// cookie jar (if one is configured) and wraps the page in a Document.
// The page is closed if storing the cookies fails.
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
	page, err := b.openPage(ctx, url, opts)
	if err != nil {
		return nil, err
	}

	if err := b.updateCookies(ctx, page); err != nil {
		// Best-effort cleanup so the page is not leaked; report the cookie error.
		_ = page.Close()
		return nil, err
	}

	return newDocument(b.pw, b.browser, page)
}

// Close releases the browser context, the browser and the Playwright driver,
// in that order, and returns all errors joined together.
// The context is closed first so that it is not closed against an already
// closed browser.
func (b playWrightBrowser) Close() error {
	return errors.Join(
		b.ctx.Close(),
		b.browser.Close(),
		b.pw.Stop(),
	)
}

// Screenshot opens target in a fresh browser created with the given timeout
// and returns the result of the document's Screenshot method. The document and
// the browser are closed before returning.
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
	browser, err := NewBrowser(ctx, BrowserOptions{
		Timeout: &timeout,
	})
	if err != nil {
		return nil, fmt.Errorf("error creating browser: %w", err)
	}
	defer DeferClose(browser)

	doc, err := browser.Open(ctx, target, OpenPageOptions{})
	if err != nil {
		return nil, fmt.Errorf("error opening page: %w", err)
	}
	defer DeferClose(doc)

	return doc.Screenshot()
}