feat(archive): keep page open on captcha-status errors so callers can promote
Adds OpenPageOptions.AllowNonOKStatus. When set, openPage no longer closes the page on non-2xx responses (other than 404), and Open returns both a usable Document and ErrInvalidStatusCode. archive.IsArchived and Archive opt in, so callers can promote the captcha page via PromoteToInteractive, hand it to a human solver, and demote back to extract content from the same browser instance — avoiding the cf_clearance fingerprint-binding issue that re-challenges any fresh retry browser.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,14 @@ import (
|
|||||||
|
|
||||||
type OpenPageOptions struct {
|
type OpenPageOptions struct {
|
||||||
Referer string
|
Referer string
|
||||||
|
|
||||||
|
// AllowNonOKStatus, when true, keeps the page open and returns a usable
|
||||||
|
// Document along with ErrInvalidStatusCode on non-2xx responses (other
|
||||||
|
// than 404, which is treated as ErrPageNotFound and still closes the
|
||||||
|
// page). This lets callers promote the page to an InteractiveBrowser
|
||||||
|
// to e.g. let a human solve a Cloudflare captcha that produced a 403,
|
||||||
|
// then resume extraction from the same browser instance.
|
||||||
|
AllowNonOKStatus bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type Browser interface {
|
type Browser interface {
|
||||||
|
|||||||
+24
-10
@@ -293,13 +293,19 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP
|
|||||||
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||||
|
|
||||||
if resp.Status() < 200 || resp.Status() >= 300 {
|
if resp.Status() < 200 || resp.Status() >= 300 {
|
||||||
_ = page.Close()
|
|
||||||
|
|
||||||
if resp.Status() == 404 {
|
if resp.Status() == 404 {
|
||||||
|
_ = page.Close()
|
||||||
return nil, ErrPageNotFound
|
return nil, ErrPageNotFound
|
||||||
}
|
}
|
||||||
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
||||||
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
|
statusErr := fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
|
||||||
|
if !opts.AllowNonOKStatus {
|
||||||
|
_ = page.Close()
|
||||||
|
return nil, statusErr
|
||||||
|
}
|
||||||
|
// Caller asked to keep the page on non-2xx (e.g. to interact with a
|
||||||
|
// Cloudflare captcha page that returned 403). Return both.
|
||||||
|
return page, statusErr
|
||||||
}
|
}
|
||||||
|
|
||||||
return page, nil
|
return page, nil
|
||||||
@@ -307,17 +313,25 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP
|
|||||||
|
|
||||||
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
|
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
|
||||||
|
|
||||||
page, err := b.openPage(ctx, url, opts)
|
page, openErr := b.openPage(ctx, url, opts)
|
||||||
if err != nil {
|
if page == nil {
|
||||||
return nil, err
|
return nil, openErr
|
||||||
}
|
}
|
||||||
|
|
||||||
err = b.updateCookies(ctx, page)
|
if cookieErr := b.updateCookies(ctx, page); cookieErr != nil {
|
||||||
if err != nil {
|
_ = page.Close()
|
||||||
return nil, err
|
return nil, cookieErr
|
||||||
}
|
}
|
||||||
|
|
||||||
return newDocument(b.pw, b.browser, page)
|
doc, docErr := newDocument(b.pw, b.browser, page)
|
||||||
|
if docErr != nil {
|
||||||
|
_ = page.Close()
|
||||||
|
return nil, docErr
|
||||||
|
}
|
||||||
|
|
||||||
|
// openErr may be ErrInvalidStatusCode when AllowNonOKStatus was set; the
|
||||||
|
// page is still usable, so propagate both the doc and the error.
|
||||||
|
return doc, openErr
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b playWrightBrowser) Close() error {
|
func (b playWrightBrowser) Close() error {
|
||||||
|
|||||||
@@ -57,15 +57,25 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
|
|||||||
|
|
||||||
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
|
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
|
||||||
|
|
||||||
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if errors.Is(err, extractor.ErrPageNotFound) {
|
||||||
|
if doc != nil {
|
||||||
|
_ = doc.Close()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
|
||||||
|
// captcha) the page is kept open by AllowNonOKStatus so the caller
|
||||||
|
// can promote it to an InteractiveBrowser and let a human solve
|
||||||
|
// the challenge. Return both the doc and the wrapped error.
|
||||||
|
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
|
||||||
|
return doc, fmt.Errorf("failed to open url: %w", err)
|
||||||
|
}
|
||||||
if doc != nil {
|
if doc != nil {
|
||||||
_ = doc.Close()
|
_ = doc.Close()
|
||||||
}
|
}
|
||||||
if errors.Is(err, extractor.ErrPageNotFound) {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,9 +105,15 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
|||||||
return nil, fmt.Errorf("invalid endpoint: %w", err)
|
return nil, fmt.Errorf("invalid endpoint: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
|
||||||
|
// captcha) the page is kept open by AllowNonOKStatus so the caller
|
||||||
|
// can promote it. Return both the doc and the wrapped error.
|
||||||
|
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
|
||||||
|
return doc, fmt.Errorf("failed to open url: %w", err)
|
||||||
|
}
|
||||||
if doc != nil {
|
if doc != nil {
|
||||||
_ = doc.Close()
|
_ = doc.Close()
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user