package extractor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"time"

	"github.com/playwright-community/playwright-go"
)

// defaultUserAgent is the user agent used when the caller does not supply one.
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"

// playWrightBrowser is a Browser backed by a single headless Playwright
// browser and one browser context that is shared by every Open call.
type playWrightBrowser struct {
	pw        *playwright.Playwright // driver started by NewPlayWrightBrowser; stopped in Close
	browser   playwright.Browser
	ctx       playwright.BrowserContext
	userAgent string
	timeout   time.Duration
	cookieJar CookieJar
}

var _ Browser = playWrightBrowser{}

// PlayWrightBrowserSelection names the browser engine to launch.
type PlayWrightBrowserSelection string

var (
	// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
	// requested browser is not one of the PlayWrightBrowserSelection constants.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")
	// ErrInvalidStatusCode is wrapped by Open when the response status is not 200.
	ErrInvalidStatusCode = errors.New("invalid status code")
)

const (
	PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
	PlayWrightBrowserSelectionFirefox  PlayWrightBrowserSelection = "firefox"
	PlayWrightBrowserSelectionWebKit   PlayWrightBrowserSelection = "webkit"
)

// PlayWrightBrowserOptions configures NewPlayWrightBrowser. Zero-valued
// fields fall back to the defaults described on each field.
type PlayWrightBrowserOptions struct {
	UserAgent string                     // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
	Browser   PlayWrightBrowserSelection // If unset defaults to Chromium
	Timeout   *time.Duration             // If unset defaults to 30 seconds timeout. If set to 0, no timeout

	// CookieJar will, if set, have all of its cookies loaded into the browser
	// when the browser is created, and receive the browser's cookies for the
	// visited page after each successful Open.
	CookieJar
}

// cookieToPlaywrightOptionalCookie converts a jar Cookie into the form
// accepted by BrowserContext.AddCookies.
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	pwCookie := playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Domain),
		Path:     playwright.String(cookie.Path),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
	}
	// A zero time.Time has a large negative Unix timestamp. Leave Expires
	// unset in that case so the cookie is added as a session cookie rather
	// than sending a nonsensical expiry to the browser.
	// NOTE(review): presumes Playwright rejects negative expiries other
	// than -1; confirm against the addCookies documentation.
	if !cookie.Expires.IsZero() {
		pwCookie.Expires = playwright.Float(float64(cookie.Expires.Unix()))
	}
	return pwCookie
}

// playwrightCookieToCookie converts a cookie reported by the browser into a
// jar Cookie. The expiry is truncated to whole seconds.
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	return Cookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   cookie.Domain,
		Path:     cookie.Path,
		Expires:  time.Unix(int64(cookie.Expires), 0),
		HttpOnly: cookie.HttpOnly,
	}
}

// releasePlaywright closes whichever of the given resources are non-nil, in
// context -> browser -> driver order, and returns the joined errors.
func releasePlaywright(pw *playwright.Playwright, browser playwright.Browser, ctx playwright.BrowserContext) error {
	var errs []error
	if ctx != nil {
		errs = append(errs, ctx.Close())
	}
	if browser != nil {
		errs = append(errs, browser.Close())
	}
	if pw != nil {
		errs = append(errs, pw.Stop())
	}
	return errors.Join(errs...)
}

// NewPlayWrightBrowser installs and starts Playwright, launches the selected
// browser headlessly and creates one browser context with the configured
// user agent. When several option structs are passed, later non-zero fields
// override earlier ones. If a CookieJar is configured, its cookies are added
// to the context before the Browser is returned.
//
// It returns ErrInvalidBrowserSelection for an unknown browser, or the
// underlying Playwright / cookie jar error. Anything started before a
// failure is shut down again before returning.
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
	thirtySeconds := 30 * time.Second
	opt := PlayWrightBrowserOptions{
		UserAgent: defaultUserAgent,
		Browser:   PlayWrightBrowserSelectionChromium,
		Timeout:   &thirtySeconds,
	}

	for _, o := range opts {
		if o.UserAgent != "" {
			opt.UserAgent = o.UserAgent
		}
		if o.Browser != "" {
			opt.Browser = o.Browser
		}
		if o.Timeout != nil {
			opt.Timeout = o.Timeout
		}
		if o.CookieJar != nil {
			opt.CookieJar = o.CookieJar
		}
	}

	if err := playwright.Install(); err != nil {
		return nil, err
	}

	pw, err := playwright.Run()
	if err != nil {
		return nil, err
	}

	var bt playwright.BrowserType
	switch opt.Browser {
	case PlayWrightBrowserSelectionChromium:
		bt = pw.Chromium
	case PlayWrightBrowserSelectionFirefox:
		bt = pw.Firefox
	case PlayWrightBrowserSelectionWebKit:
		bt = pw.WebKit
	default:
		// Cleanup is best-effort: the caller needs the sentinel unchanged.
		_ = releasePlaywright(pw, nil, nil)
		return nil, ErrInvalidBrowserSelection
	}

	browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(true),
	})
	if err != nil {
		_ = releasePlaywright(pw, nil, nil) // best-effort; report the launch error
		return nil, err
	}

	c, err := browser.NewContext(playwright.BrowserNewContextOptions{
		UserAgent: playwright.String(opt.UserAgent),
	})
	if err != nil {
		_ = releasePlaywright(pw, browser, nil) // best-effort; report the context error
		return nil, err
	}

	if opt.CookieJar != nil {
		cookies, err := opt.CookieJar.GetAll()
		if err != nil {
			_ = releasePlaywright(pw, browser, c) // best-effort; report the jar error
			return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
		}

		pwCookies := make([]playwright.OptionalCookie, len(cookies))
		for i, cookie := range cookies {
			pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
		}

		if err := c.AddCookies(pwCookies); err != nil {
			_ = releasePlaywright(pw, browser, c) // best-effort; report the AddCookies error
			return nil, fmt.Errorf("error adding cookies to browser: %w", err)
		}
	}

	return playWrightBrowser{
		pw:        pw,
		browser:   browser,
		userAgent: opt.UserAgent,
		timeout:   *opt.Timeout,
		cookieJar: opt.CookieJar,
		ctx:       c,
	}, nil
}

// Open navigates a fresh page to url, waits for the load event and returns
// the response body as a Source. The context argument is currently unused.
//
// A status other than 200 yields an error wrapping ErrInvalidStatusCode. If
// a cookie jar is configured, the cookies the browser holds for the final
// page URL are written to it before returning. The page is closed when Open
// returns.
func (b playWrightBrowser) Open(_ context.Context, url string) (Source, error) {
	page, err := b.ctx.NewPage()
	if err != nil {
		return nil, err
	}
	// The returned Source only holds the URL and the body text, so the page
	// is no longer needed once Open returns. A close failure is deliberately
	// ignored: it must not mask the result of the navigation.
	defer func() { _ = page.Close() }()

	opts := playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateLoad,
	}
	// Pass 0 through explicitly: Playwright treats 0 as "no timeout", while
	// leaving the option unset would apply its own 30 second default.
	if b.timeout >= 0 {
		opts.Timeout = playwright.Float(float64(b.timeout.Milliseconds()))
	}

	resp, err := page.Goto(url, opts)
	if err != nil {
		return nil, err
	}

	slog.Info("response", "response", resp)

	// Goto can succeed without a response (e.g. about:blank or a same-URL
	// hash navigation); there is no body to return in that case.
	if resp == nil {
		return nil, fmt.Errorf("navigating to %q: no response received", url)
	}

	if status := resp.Status(); status != 200 {
		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
	}

	text, err := resp.Text()
	if err != nil {
		return nil, err
	}

	if b.cookieJar != nil {
		cookies, err := page.Context().Cookies(page.URL())
		if err != nil {
			return nil, fmt.Errorf("error getting cookies from browser: %w", err)
		}

		for _, cookie := range cookies {
			// TODO: add support for deleting cookies from the jar which are deleted in the browser
			if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
				return nil, fmt.Errorf("error setting cookie in cookie jar: %w", err)
			}
		}
	}

	return source{
		sourceUrl: url,
		content:   text,
	}, nil
}

// Close shuts down the browser context, the browser and the Playwright
// driver, returning the joined errors of all three steps.
func (b playWrightBrowser) Close() error {
	return releasePlaywright(b.pw, b.browser, b.ctx)
}