Files
go-extractor/sites/archive/archive.go
T
steve 3b38637e56
CI / test (push) Successful in 2m6s
CI / vet (push) Successful in 1m21s
CI / build (push) Successful in 2m13s
feat(archive): keep page open on captcha-status errors so callers can promote
Adds OpenPageOptions.AllowNonOKStatus. When set, openPage no longer closes
the page on non-2xx (other than 404) and Open returns both a usable Document
and ErrInvalidStatusCode. archive.IsArchived and Archive opt in, so callers
can PromoteToInteractive the captcha page, hand it to a human solver, and
demote back to extract content from the same browser instance — avoiding
the cf_clearance fingerprint-binding issue that re-challenges any fresh
retry browser.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-28 00:29:39 +00:00

192 lines
5.2 KiB
Go

package archive
import (
"context"
"errors"
"fmt"
"log/slog"
"net/url"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
type Config struct {
// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
Endpoint string
// Timeout will, if set, cancel any Archive call after this duration.
// If nil, the default timeout of 1 hour will be used.
Timeout *time.Duration // Timeout for the request, defaults to 1 hour
}
// validate validates the config and sets default values if necessary.
func (c Config) validate() Config {
if c.Timeout == nil {
def := 1 * time.Hour
c.Timeout = &def
}
if c.Endpoint == "" {
c.Endpoint = "https://archive.ph"
}
return c
}
// DefaultConfig is the zero-value Config used by the package-level
// IsArchived and Archive helpers; validate() fills in the archive.ph
// endpoint and 1 hour timeout at call time.
var DefaultConfig = Config{}
// IsArchived checks whether target has an archived copy at the configured
// endpoint. It returns the archived document if one exists, or a nil
// document (and nil error) if the target is not archived.
//
// On ErrInvalidStatusCode (e.g. archive.ph answering 403 with a Cloudflare
// captcha) the open page is returned together with the wrapped error so the
// caller can promote it to an InteractiveBrowser and have a human solve the
// challenge on the same browser instance.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	parsed, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	// Build "<endpoint>/newest/<target>". The target is appended to the raw
	// path rather than via JoinPath so its own scheme and slashes survive.
	lookup := endpoint.JoinPath("/newest")
	lookup.Path = strings.TrimSuffix(lookup.Path, "/") + "/" + parsed.String()

	slog.Info("checking if url is archived", "url", lookup.String(), "config", c, "endpoint", endpoint)

	doc, err := b.Open(ctx, lookup.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
	if err == nil {
		return doc, nil
	}

	// A 404 simply means the target is not archived — not an error.
	if errors.Is(err, extractor.ErrPageNotFound) {
		if doc != nil {
			_ = doc.Close()
		}
		return nil, nil
	}

	// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
	// captcha) the page is kept open by AllowNonOKStatus so the caller
	// can promote it to an InteractiveBrowser and let a human solve
	// the challenge. Return both the doc and the wrapped error.
	if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
		return doc, fmt.Errorf("failed to open url: %w", err)
	}

	if doc != nil {
		_ = doc.Close()
	}
	return nil, fmt.Errorf("failed to open url: %w", err)
}
// IsArchived checks whether target is archived using DefaultConfig.
// See Config.IsArchived for the meaning of the returned document/error pair.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to the configured archive endpoint and waits for
// the service to finish archiving it, returning the resulting document.
// The whole call is bounded by c.Timeout (defaulting to 1 hour).
//
// The endpoint page is opened with AllowNonOKStatus, so on
// ErrInvalidStatusCode (e.g. archive.ph answering 403 with a Cloudflare
// captcha) both the still-open document and the wrapped error are returned,
// letting the caller promote the page to an InteractiveBrowser, have a human
// solve the challenge, and continue on the same browser instance.
//
// On success the caller owns the returned document and must Close it.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	// validate() guarantees Timeout is non-nil.
	ctx, cancel := context.WithTimeout(ctx, *c.Timeout)
	slog.Info("setting timeout", "timeout", *c.Timeout)
	defer cancel()

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
	if err != nil {
		// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
		// captcha) the page is kept open by AllowNonOKStatus so the caller
		// can promote it. Return both the doc and the wrapped error.
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			return doc, fmt.Errorf("failed to open url: %w", err)
		}
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	// Fill in the submission form with the target url and submit it.
	urlInput := doc.SelectFirst("input[name='url']")
	if urlInput == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to find url input element")
	}
	if err := urlInput.Type(u.String()); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url: %w", err)
	}
	submitBtn := doc.SelectFirst("form#submiturl input[type=\"submit\"]")
	if submitBtn == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to find submit button")
	}
	if err := submitBtn.Click(); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit: %w", err)
	}

	// wait for the page to load, but respect context cancellation
	select {
	case <-ctx.Done():
		slog.Debug("context done during initial wait", "err", ctx.Err())
		_ = doc.Close()
		return nil, ctx.Err()
	case <-time.After(5 * time.Second):
	}

	// Now we are waiting for archive.ph to archive the page and redirect us
	// to the archived copy. We detect this by polling the page url: while the
	// path starts with /wip/ or /submit (on the endpoint host) the archive is
	// still in progress.
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			// Fix: previously a timeout here fell through the loop and
			// returned the still-unarchived document as if it succeeded.
			// Handle cancellation the same way as the initial wait above.
			slog.Info("context done while waiting for archive", "err", ctx.Err())
			_ = doc.Close()
			return nil, ctx.Err()
		case <-ticker.C:
			archivedURL, err := url.Parse(doc.URL())
			if err != nil {
				continue
			}
			slog.Debug("checking url", "url", archivedURL.String())
			// Done once we have been redirected off the endpoint host, or off
			// the /wip/ and /submit paths.
			if archivedURL.Hostname() != endpoint.Hostname() ||
				(!strings.HasPrefix(archivedURL.Path, "/wip/") && !strings.HasPrefix(archivedURL.Path, "/submit")) {
				return doc, doc.WaitForNetworkIdle(nil)
			}
		}
	}
}
// Archive archives target using DefaultConfig.
// See Config.Archive for the submission/polling behavior and the meaning of
// the returned document/error pair.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}