package extractor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"time"

	"github.com/playwright-community/playwright-go"
)

// playWrightBrowser is a Browser implementation backed by a headless
// Playwright browser. All pages are opened in a single shared browser context.
type playWrightBrowser struct {
	pw        *playwright.Playwright // driver started by playwright.Run; stopped in Close
	browser   playwright.Browser
	ctx       playwright.BrowserContext
	userAgent string
	timeout   time.Duration
	cookieJar CookieJar
}

var _ Browser = playWrightBrowser{}

// PlayWrightBrowserSelection names the browser engine Playwright should launch.
type PlayWrightBrowserSelection string

var (
	// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
	// requested browser is not one of the PlayWrightBrowserSelection constants.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")
	// ErrInvalidStatusCode is returned (wrapped) when a navigation does not
	// yield an HTTP 200 response.
	ErrInvalidStatusCode = errors.New("invalid status code")
)

const (
	PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
	PlayWrightBrowserSelectionFirefox  PlayWrightBrowserSelection = "firefox"
	PlayWrightBrowserSelectionWebKit   PlayWrightBrowserSelection = "webkit"
)

// PlayWrightBrowserOptions configures NewPlayWrightBrowser. Zero-valued fields
// fall back to the defaults described on each field.
type PlayWrightBrowserOptions struct {
	UserAgent string                     // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
	Browser   PlayWrightBrowserSelection // If unset defaults to Firefox.
	Timeout   *time.Duration             // If unset defaults to 30 seconds timeout. If set to 0, no timeout

	// CookieJar will, if set, have all of its cookies loaded into the browser
	// when the browser is created; after each page load the browser's cookies
	// for that page's URL are saved back into the cookie jar.
	CookieJar CookieJar
}

// cookieToPlaywrightOptionalCookie converts a Cookie into the form accepted by
// playwright.BrowserContext.AddCookies.
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	converted := playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Domain),
		Path:     playwright.String(cookie.Path),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
	}

	// A zero time.Time would turn into a large negative Unix timestamp, which
	// Playwright rejects (it accepts only -1 or a positive timestamp). Leaving
	// Expires unset makes it a session cookie instead.
	if !cookie.Expires.IsZero() {
		converted.Expires = playwright.Float(float64(cookie.Expires.Unix()))
	}

	return converted
}

// playwrightCookieToCookie converts a cookie reported by Playwright into a
// Cookie. The expiry is truncated to whole seconds.
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	return Cookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   cookie.Domain,
		Path:     cookie.Path,
		Expires:  time.Unix(int64(cookie.Expires), 0),
		HttpOnly: cookie.HttpOnly,
	}
}

// NewPlayWrightBrowser installs (if needed) and starts Playwright, launches a
// headless browser and returns a Browser backed by it.
//
// Any number of option structs may be passed; they are merged in order, with
// non-zero fields of later structs overriding earlier ones. The caller must
// call Close on the returned Browser to release the browser and the driver.
//
// It returns ErrInvalidBrowserSelection for an unknown Browser option, and
// otherwise propagates errors from Playwright or from the cookie jar.
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
	defaultTimeout := 30 * time.Second

	opt := PlayWrightBrowserOptions{
		UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3",
		Browser:   PlayWrightBrowserSelectionFirefox,
		Timeout:   &defaultTimeout,
	}

	for _, o := range opts {
		if o.UserAgent != "" {
			opt.UserAgent = o.UserAgent
		}
		if o.Browser != "" {
			opt.Browser = o.Browser
		}
		if o.Timeout != nil {
			opt.Timeout = o.Timeout
		}
		if o.CookieJar != nil {
			opt.CookieJar = o.CookieJar
		}
	}

	if err := playwright.Install(); err != nil {
		return nil, err
	}

	pw, err := playwright.Run()
	if err != nil {
		return nil, err
	}

	// Until construction succeeds, this function owns the driver and the
	// browser and must release them on every error path.
	var browser playwright.Browser
	handedOver := false
	defer func() {
		if handedOver {
			return
		}
		// Best-effort cleanup: the error already being returned is the one the
		// caller needs, so cleanup failures are deliberately dropped.
		if browser != nil {
			_ = browser.Close()
		}
		_ = pw.Stop()
	}()

	var bt playwright.BrowserType
	switch opt.Browser {
	case PlayWrightBrowserSelectionChromium:
		bt = pw.Chromium
	case PlayWrightBrowserSelectionFirefox:
		bt = pw.Firefox
	case PlayWrightBrowserSelectionWebKit:
		bt = pw.WebKit
	default:
		return nil, ErrInvalidBrowserSelection
	}

	browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(true),
	})
	if err != nil {
		return nil, err
	}

	c, err := browser.NewContext(playwright.BrowserNewContextOptions{
		UserAgent: playwright.String(opt.UserAgent),
	})
	if err != nil {
		return nil, err
	}

	if opt.CookieJar != nil {
		cookies, err := opt.CookieJar.GetAll()
		if err != nil {
			return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
		}

		pwCookies := make([]playwright.OptionalCookie, len(cookies))
		for i, cookie := range cookies {
			pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
		}

		if err := c.AddCookies(pwCookies); err != nil {
			return nil, fmt.Errorf("error adding cookies to browser: %w", err)
		}
	}

	handedOver = true
	return playWrightBrowser{
		pw:        pw,
		browser:   browser,
		userAgent: opt.UserAgent,
		timeout:   *opt.Timeout,
		cookieJar: opt.CookieJar,
		ctx:       c,
	}, nil
}

// updateCookies copies the browser's cookies for the page's current URL into
// the configured cookie jar. It is a no-op when no cookie jar is configured.
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
	if b.cookieJar == nil {
		return nil
	}

	cookies, err := page.Context().Cookies(page.URL())
	if err != nil {
		return fmt.Errorf("error getting cookies from browser: %w", err)
	}

	// TODO: add support for deleting cookies from the jar which are deleted in the browser
	for _, cookie := range cookies {
		if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
			return fmt.Errorf("error setting cookie in cookie jar: %w", err)
		}
	}

	return nil
}

// openPage opens a new page in the shared browser context and navigates it to
// target, waiting for the load event. On success the caller owns the returned
// page and must close it. On any failure the page is closed here and a nil
// page is returned; a response status other than 200 yields an error wrapping
// ErrInvalidStatusCode.
func (b playWrightBrowser) openPage(_ context.Context, target string) (playwright.Page, error) {
	page, err := b.ctx.NewPage()
	if err != nil {
		return nil, err
	}

	opts := playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateLoad,
	}
	// Playwright treats an explicit 0 as "no timeout", whereas leaving the
	// option unset applies Playwright's own default; so 0 must be passed
	// through for the documented "0 means no timeout" to hold.
	if b.timeout >= 0 {
		opts.Timeout = playwright.Float(float64(b.timeout.Milliseconds()))
	}

	resp, err := page.Goto(target, opts)
	if err != nil {
		_ = page.Close() // best-effort: the navigation error is the one to report
		return nil, err
	}

	// Goto can succeed without a response (e.g. about:blank); there is no
	// status to validate in that case, so report it rather than dereference nil.
	if resp == nil {
		_ = page.Close() // best-effort: the missing response is the one to report
		return nil, fmt.Errorf("%w: no response received for %q", ErrInvalidStatusCode, target)
	}

	slog.Info("opened page", "url", target, "status", resp.Status(), "request", resp.Request())

	if resp.Status() != 200 {
		slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
		_ = page.Close() // best-effort: the status error is the one to report
		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
	}

	return page, nil
}

// Open loads url and returns its rendered HTML content as a Source. Cookies
// are saved to the cookie jar (if any) before returning.
func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error) {
	page, err := b.openPage(ctx, url)
	if err != nil {
		return nil, err
	}
	defer page.Close()

	text, err := page.Content()
	if err != nil {
		return nil, err
	}

	if err := b.updateCookies(ctx, page); err != nil {
		return nil, err
	}

	return source{
		sourceUrl: url,
		content:   text,
	}, nil
}

// getScreenshot captures page according to opts. An empty style is treated as
// ScreenshotStyleFullPage. For ScreenshotStyleViewport, a clip rectangle
// anchored at the origin is applied when either Width or Height is positive.
func (b playWrightBrowser) getScreenshot(_ context.Context, page playwright.Page, opts ScreenshotOptions) ([]byte, error) {
	var pwOpts playwright.PageScreenshotOptions

	if opts.Style == "" {
		opts.Style = ScreenshotStyleFullPage
	}

	switch opts.Style {
	case ScreenshotStyleFullPage:
		pwOpts.FullPage = playwright.Bool(true)
	case ScreenshotStyleViewport:
		pwOpts.FullPage = playwright.Bool(false)
		if opts.Width > 0 || opts.Height > 0 {
			pwOpts.Clip = &playwright.Rect{
				Width:  float64(opts.Width),
				Height: float64(opts.Height),
			}
		}
	}

	return page.Screenshot(pwOpts)
}

// Screenshot loads url and returns a screenshot of it taken according to opts.
// Cookies are saved to the cookie jar (if any) before the screenshot is taken.
func (b playWrightBrowser) Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error) {
	page, err := b.openPage(ctx, url)
	if err != nil {
		return nil, err
	}
	defer page.Close()

	if err := b.updateCookies(ctx, page); err != nil {
		return nil, err
	}

	return b.getScreenshot(ctx, page, opts)
}

// OpenAndScreenshot loads url once and returns both its rendered HTML content
// and a screenshot taken according to opts. Cookies are saved to the cookie
// jar (if any) before returning.
func (b playWrightBrowser) OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error) {
	page, err := b.openPage(ctx, url)
	if err != nil {
		return nil, nil, err
	}
	defer page.Close()

	text, err := page.Content()
	if err != nil {
		return nil, nil, err
	}

	screenshot, err := b.getScreenshot(ctx, page, opts)
	if err != nil {
		return nil, nil, err
	}

	if err := b.updateCookies(ctx, page); err != nil {
		return nil, nil, err
	}

	return source{
		sourceUrl: url,
		content:   text,
	}, screenshot, nil
}

// Close shuts down the browser context, the browser and the Playwright driver,
// returning all errors encountered joined together.
func (b playWrightBrowser) Close() error {
	errs := []error{
		b.ctx.Close(),
		b.browser.Close(),
	}
	// pw is nil only for values not built by NewPlayWrightBrowser.
	if b.pw != nil {
		errs = append(errs, b.pw.Stop())
	}
	return errors.Join(errs...)
}