diff --git a/browser.go b/browser.go index 06742ad..ba78b6a 100644 --- a/browser.go +++ b/browser.go @@ -7,6 +7,14 @@ import ( type OpenPageOptions struct { Referer string + + // AllowNonOKStatus, when true, keeps the page open and returns a usable + // Document along with ErrInvalidStatusCode on non-2xx responses (other + // than 404, which is treated as ErrPageNotFound and still closes the + // page). This lets callers promote the page to an InteractiveBrowser + // to e.g. let a human solve a Cloudflare captcha that produced a 403, + // then resume extraction from the same browser instance. + AllowNonOKStatus bool } type Browser interface { diff --git a/playwright.go b/playwright.go index fbdb382..ed3c2f3 100644 --- a/playwright.go +++ b/playwright.go @@ -293,13 +293,19 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request()) if resp.Status() < 200 || resp.Status() >= 300 { - _ = page.Close() - if resp.Status() == 404 { + _ = page.Close() return nil, ErrPageNotFound } slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request()) - return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status()) + statusErr := fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status()) + if !opts.AllowNonOKStatus { + _ = page.Close() + return nil, statusErr + } + // Caller asked to keep the page on non-2xx (e.g. to interact with a + // Cloudflare captcha page that returned 403). Return both. + return page, statusErr } return page, nil @@ -307,17 +313,25 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) { - page, err := b.openPage(ctx, url, opts) - if err != nil { - return nil, err + page, openErr := b.openPage(ctx, url, opts) + if page == nil { + return nil, openErr } - err = b.updateCookies(ctx, page) - if err != nil { - return nil, err + if cookieErr := b.updateCookies(ctx, page); cookieErr != nil { + _ = page.Close() + return nil, cookieErr } - return newDocument(b.pw, b.browser, page) + doc, docErr := newDocument(b.pw, b.browser, page) + if docErr != nil { + _ = page.Close() + return nil, docErr + } + + // openErr may be ErrInvalidStatusCode when AllowNonOKStatus was set; the + // page is still usable, so propagate both the doc and the error. + return doc, openErr } func (b playWrightBrowser) Close() error { diff --git a/sites/archive/archive.go b/sites/archive/archive.go index 55b1c46..94e9fcc 100644 --- a/sites/archive/archive.go +++ b/sites/archive/archive.go @@ -57,15 +57,25 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint) - doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{}) + doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true}) if err != nil { + if errors.Is(err, extractor.ErrPageNotFound) { + if doc != nil { + _ = doc.Close() + } + return nil, nil + } + // On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare + // captcha) the page is kept open by AllowNonOKStatus so the caller + // can promote it to an InteractiveBrowser and let a human solve + // the challenge. Return both the doc and the wrapped error. + if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil { + return doc, fmt.Errorf("failed to open url: %w", err) + } if doc != nil { _ = doc.Close() } - if errors.Is(err, extractor.ErrPageNotFound) { - return nil, nil - } return nil, fmt.Errorf("failed to open url: %w", err) } @@ -95,9 +105,15 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) return nil, fmt.Errorf("invalid endpoint: %w", err) } - doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{}) + doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true}) if err != nil { + // On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare + // captcha) the page is kept open by AllowNonOKStatus so the caller + // can promote it. Return both the doc and the wrapped error. + if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil { + return doc, fmt.Errorf("failed to open url: %w", err) + } if doc != nil { _ = doc.Close() }