feat(archive): keep page open on captcha-status errors so callers can promote
CI / test (push) Successful in 2m6s
CI / vet (push) Successful in 1m21s
CI / build (push) Successful in 2m13s

Adds OpenPageOptions.AllowNonOKStatus. When set, openPage no longer closes
the page on non-2xx (other than 404) and Open returns both a usable Document
and ErrInvalidStatusCode. archive.IsArchived and Archive opt in, so callers
can PromoteToInteractive the captcha page, hand it to a human solver, and
demote back to extract content from the same browser instance — avoiding
the cf_clearance fingerprint-binding issue that re-challenges any fresh
retry browser.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:29:39 +00:00
parent 841f1ec2bf
commit 3b38637e56
3 changed files with 53 additions and 15 deletions
+8
View File
@@ -7,6 +7,14 @@ import (
// OpenPageOptions carries the per-call settings accepted by Browser.Open
// (and forwarded to the underlying page-opening routine).
type OpenPageOptions struct {
// Referer is presumably the Referer value applied to the navigation
// request. NOTE(review): its use is not visible in this change; confirm
// against the page-opening code before relying on this description.
Referer string
// AllowNonOKStatus, when true, keeps the page open and returns a usable
// Document along with ErrInvalidStatusCode on non-2xx responses (other
// than 404, which is treated as ErrPageNotFound and still closes the
// page). This lets callers promote the page to an InteractiveBrowser
// to e.g. let a human solve a Cloudflare captcha that produced a 403,
// then resume extraction from the same browser instance.
//
// The returned error wraps ErrInvalidStatusCode together with the numeric
// status, so callers should match it with errors.Is rather than ==. When
// both a Document and that error are returned, the page has not been
// closed on the caller's behalf; the caller must Close the Document once
// it is done with it.
AllowNonOKStatus bool
}
type Browser interface {
+24 -10
View File
@@ -293,13 +293,19 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
if resp.Status() < 200 || resp.Status() >= 300 {
_ = page.Close()
if resp.Status() == 404 {
_ = page.Close()
return nil, ErrPageNotFound
}
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
statusErr := fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
if !opts.AllowNonOKStatus {
_ = page.Close()
return nil, statusErr
}
// Caller asked to keep the page on non-2xx (e.g. to interact with a
// Cloudflare captcha page that returned 403). Return both.
return page, statusErr
}
return page, nil
@@ -307,17 +313,25 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
page, err := b.openPage(ctx, url, opts)
if err != nil {
return nil, err
page, openErr := b.openPage(ctx, url, opts)
if page == nil {
return nil, openErr
}
err = b.updateCookies(ctx, page)
if err != nil {
return nil, err
if cookieErr := b.updateCookies(ctx, page); cookieErr != nil {
_ = page.Close()
return nil, cookieErr
}
return newDocument(b.pw, b.browser, page)
doc, docErr := newDocument(b.pw, b.browser, page)
if docErr != nil {
_ = page.Close()
return nil, docErr
}
// openErr may be ErrInvalidStatusCode when AllowNonOKStatus was set; the
// page is still usable, so propagate both the doc and the error.
return doc, openErr
}
func (b playWrightBrowser) Close() error {
+21 -5
View File
@@ -57,15 +57,25 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
if err != nil {
if errors.Is(err, extractor.ErrPageNotFound) {
if doc != nil {
_ = doc.Close()
}
return nil, nil
}
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
// captcha) the page is kept open by AllowNonOKStatus so the caller
// can promote it to an InteractiveBrowser and let a human solve
// the challenge. Return both the doc and the wrapped error.
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
return doc, fmt.Errorf("failed to open url: %w", err)
}
if doc != nil {
_ = doc.Close()
}
if errors.Is(err, extractor.ErrPageNotFound) {
return nil, nil
}
return nil, fmt.Errorf("failed to open url: %w", err)
}
@@ -95,9 +105,15 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
return nil, fmt.Errorf("invalid endpoint: %w", err)
}
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
if err != nil {
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
// captcha) the page is kept open by AllowNonOKStatus so the caller
// can promote it. Return both the doc and the wrapped error.
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
return doc, fmt.Errorf("failed to open url: %w", err)
}
if doc != nil {
_ = doc.Close()
}