feat(archive): keep page open on captcha-status errors so callers can promote
CI / test (push) Successful in 2m6s
CI / vet (push) Successful in 1m21s
CI / build (push) Successful in 2m13s

Adds OpenPageOptions.AllowNonOKStatus. When set, openPage no longer closes
the page on a non-2xx status (other than 404), and Open returns both a usable
Document and ErrInvalidStatusCode. archive.IsArchived and archive.Archive opt
in, so callers can promote the captcha page with PromoteToInteractive, hand it
to a human solver, and then demote it again to extract content from the same
browser instance — avoiding the cf_clearance fingerprint-binding issue, where
any fresh retry browser is challenged again.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 00:29:39 +00:00
parent 841f1ec2bf
commit 3b38637e56
3 changed files with 53 additions and 15 deletions
+21 -5
View File
@@ -57,15 +57,25 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
if err != nil {
if errors.Is(err, extractor.ErrPageNotFound) {
if doc != nil {
_ = doc.Close()
}
return nil, nil
}
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
// captcha) the page is kept open by AllowNonOKStatus so the caller
// can promote it to an InteractiveBrowser and let a human solve
// the challenge. Return both the doc and the wrapped error.
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
return doc, fmt.Errorf("failed to open url: %w", err)
}
if doc != nil {
_ = doc.Close()
}
if errors.Is(err, extractor.ErrPageNotFound) {
return nil, nil
}
return nil, fmt.Errorf("failed to open url: %w", err)
}
@@ -95,9 +105,15 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
return nil, fmt.Errorf("invalid endpoint: %w", err)
}
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
if err != nil {
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
// captcha) the page is kept open by AllowNonOKStatus so the caller
// can promote it. Return both the doc and the wrapped error.
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
return doc, fmt.Errorf("failed to open url: %w", err)
}
if doc != nil {
_ = doc.Close()
}