package extractor import ( "context" "errors" "fmt" "io" "log/slog" "time" "github.com/playwright-community/playwright-go" ) type playWrightBrowser struct { pw *playwright.Playwright browser playwright.Browser ctx playwright.BrowserContext userAgent string timeout time.Duration cookieJar CookieJar } var _ Browser = playWrightBrowser{} type PlayWrightBrowserSelection string var ( ErrInvalidBrowserSelection = errors.New("invalid browser selection") ErrPageNotFound = errors.New("page not found") ErrInvalidStatusCode = errors.New("invalid status code") ) const ( PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium" PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox" PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit" ) type PlayWrightBrowserOptions struct { UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0" Browser PlayWrightBrowserSelection // If unset defaults to Firefox. Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout // CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the // browser into the cookie jar for each request. CookieJar ShowBrowser bool // If false, browser will be headless } func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie { return playwright.OptionalCookie{ Name: cookie.Name, Value: cookie.Value, Domain: playwright.String(cookie.Host), Path: playwright.String(cookie.Path), Expires: playwright.Float(float64(cookie.Expires.Unix())), HttpOnly: playwright.Bool(cookie.HttpOnly), } } func playwrightCookieToCookie(cookie playwright.Cookie) Cookie { return Cookie{ Name: cookie.Name, Value: cookie.Value, Host: cookie.Domain, Path: cookie.Path, Expires: time.Unix(int64(cookie.Expires), 0), HttpOnly: cookie.HttpOnly, } } func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) { var thirtySeconds = 30 * time.Second opt := PlayWrightBrowserOptions{ UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", Browser: PlayWrightBrowserSelectionFirefox, Timeout: &thirtySeconds, } for _, o := range opts { if o.UserAgent != "" { opt.UserAgent = o.UserAgent } if o.Browser != "" { opt.Browser = o.Browser } if o.Timeout != nil { opt.Timeout = o.Timeout } if o.CookieJar != nil { opt.CookieJar = o.CookieJar } opt.ShowBrowser = o.ShowBrowser } pw, err := playwright.Run() if err != nil { err = playwright.Install() if err != nil { return nil, err } pw, err = playwright.Run() if err != nil { return nil, err } } var bt playwright.BrowserType switch opt.Browser { case PlayWrightBrowserSelectionChromium: bt = pw.Chromium case PlayWrightBrowserSelectionFirefox: bt = pw.Firefox case PlayWrightBrowserSelectionWebKit: bt = pw.WebKit default: return nil, ErrInvalidBrowserSelection } browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{ Headless: playwright.Bool(!opt.ShowBrowser), }) if err != nil { return nil, err } c, err := browser.NewContext(playwright.BrowserNewContextOptions{ UserAgent: playwright.String(opt.UserAgent), }) if err != nil { return nil, err } if opt.CookieJar != nil { cookies, err := opt.CookieJar.GetAll() if err != nil { return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err) } pwCookies := make([]playwright.OptionalCookie, len(cookies)) for i, cookie := range cookies { pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie) } err = c.AddCookies(pwCookies) if err != nil { return nil, fmt.Errorf("error adding cookies to browser: %w", err) } } return playWrightBrowser{ pw: pw, browser: browser, userAgent: opt.UserAgent, timeout: *opt.Timeout, cookieJar: opt.CookieJar, ctx: c, }, nil } func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error { if b.cookieJar != nil { cookies, err := page.Context().Cookies(page.URL()) for _, cookie := range cookies { // TODO: add support for deleting cookies from the jar which are deleted in the browser err = b.cookieJar.Set(playwrightCookieToCookie(cookie)) if err != nil { return fmt.Errorf("error setting cookie in cookie jar: %w", err) } } } return nil } func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) { page, err := b.ctx.NewPage() if err != nil { return nil, err } pwOpts := playwright.PageGotoOptions{ WaitUntil: playwright.WaitUntilStateLoad, } if b.timeout > 0 { var ms = float64(b.timeout.Milliseconds()) pwOpts.Timeout = &ms } if opts.Referer != "" { pwOpts.Referer = playwright.String(opts.Referer) } resp, err := page.Goto(target, pwOpts) if err != nil { return nil, err } slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request()) if resp.Status() != 200 { time.Sleep(999 * time.Hour * 24) time.Sleep(25 * time.Second) _ = page.Close() if resp.Status() == 404 { return nil, ErrPageNotFound } slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request()) return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status()) } return page, nil } func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) { page, err := b.openPage(ctx, url, opts) if err != nil { return nil, err } err = b.updateCookies(ctx, page) if err != nil { return nil, err } return newDocument(b.pw, b.browser, page) } func (b playWrightBrowser) Close() error { return errors.Join( b.ctx.Close(), b.browser.Close(), ) } func deferClose(cl io.Closer) { _ = cl.Close() } func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) { browser, err := NewPlayWrightBrowser(PlayWrightBrowserOptions{ Timeout: &timeout, }) if err != nil { return nil, fmt.Errorf("error creating browser: %w", err) } defer deferClose(browser) doc, err := browser.Open(ctx, target, OpenPageOptions{}) if err != nil { return nil, fmt.Errorf("error opening page: %w", err) } defer deferClose(doc) return doc.Screenshot() }