feat(archive): keep page open on captcha-status errors so callers can promote
Adds OpenPageOptions.AllowNonOKStatus. When it is set, openPage no longer closes the page on a non-2xx status (other than 404), and Open returns both a usable Document and ErrInvalidStatusCode. archive.IsArchived and Archive opt in, so callers can promote the captcha page with PromoteToInteractive, hand it to a human solver, and then demote it back to extract content from the same browser instance. This avoids the cf_clearance fingerprint-binding issue, where any fresh retry browser is challenged again.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -57,15 +57,25 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
|
||||
|
||||
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
|
||||
|
||||
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
|
||||
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||
|
||||
if err != nil {
|
||||
if errors.Is(err, extractor.ErrPageNotFound) {
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
|
||||
// captcha) the page is kept open by AllowNonOKStatus so the caller
|
||||
// can promote it to an InteractiveBrowser and let a human solve
|
||||
// the challenge. Return both the doc and the wrapped error.
|
||||
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
|
||||
return doc, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
if errors.Is(err, extractor.ErrPageNotFound) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
|
||||
@@ -95,9 +105,15 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
||||
return nil, fmt.Errorf("invalid endpoint: %w", err)
|
||||
}
|
||||
|
||||
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
|
||||
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||
|
||||
if err != nil {
|
||||
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
|
||||
// captcha) the page is kept open by AllowNonOKStatus so the caller
|
||||
// can promote it. Return both the doc and the wrapped error.
|
||||
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
|
||||
return doc, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user