fix(archive): harden archive.ph submit/poll flow
The archive.ph submission flow had several defects that caused Mort's summary fallback to return placeholder "Working..." pages instead of real archived content, or hang for the full timeout: - Context cancellation in the poll loop fell through to a final WaitForNetworkIdle and returned the doc as success. The function now returns a typed error (ErrArchiveIncomplete on deadline, wrapped ctx.Err() on caller cancel). - The poll only checked doc.URL() — if archive.ph's JS got wedged on /wip/<id>, the loop spun until timeout. Completion now also requires a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only transitions don't satisfy the check. - The final URL is now validated against an alphanumeric ID pattern, rejecting /wip/, /submit, /newest/ and the front page. - 5-second blind sleep before polling replaced with a bounded WaitForNetworkIdle that short-circuits when already archived. - Form selectors now use a cascade (input[name='url'] → input[type='url'] → input.input-url → input[name='anyway'], and similar for the submit button) so a single archive.ph markup change doesn't kill the flow. Errors name which selectors were tried. - Default timeout lowered from 1 hour to 5 minutes (still overridable via context deadline). Exposed as DefaultTimeout. - Poll progress is now logged at slog.Info every 30s so production logs surface stuck flows. - Front-page 5xx now retries twice with 1s/4s backoff before failing. - New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing. - Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix), selector cascade, DOM completion detector, transient status classification, and ctx cancellation paths via a thread-safe mutating mock document. Full integration with a live browser remains hand-tested. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+389
-69
@@ -1,3 +1,17 @@
|
||||
// Package archive provides a thin wrapper around archive.ph (a.k.a.
|
||||
// archive.today) for two operations:
|
||||
//
|
||||
// - IsArchived: check whether a target URL already has a snapshot, and
|
||||
// return the snapshot Document if so.
|
||||
// - Archive: submit a target URL to archive.ph, poll until the snapshot is
|
||||
// complete (or the context is cancelled), and return the resulting
|
||||
// Document.
|
||||
//
|
||||
// The submit flow is intentionally defensive: archive.ph occasionally rotates
|
||||
// its form markup, its front page sometimes 5xx's, and the in-progress
|
||||
// "/wip/<id>" pages can hang indefinitely if their JS gets wedged. The
|
||||
// implementation in this file documents and tests each of those failure modes
|
||||
// rather than papering over them.
|
||||
package archive
|
||||
|
||||
import (
|
||||
@@ -6,26 +20,121 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
// ErrArchiveIncomplete is returned when archive.ph never transitions away
|
||||
// from the /wip/ (work-in-progress) or /submit placeholder pages within the
|
||||
// configured timeout. Callers can errors.Is against this sentinel to
|
||||
// distinguish "archive.ph is slow / wedged" from "we got cancelled".
|
||||
var ErrArchiveIncomplete = errors.New("archive: archive.ph did not finish before timeout")
|
||||
|
||||
// ErrArchiveSelectorMissing is returned when archive.ph's front-page submit
|
||||
// form cannot be found by any of the known fallback selectors. This usually
|
||||
// means archive.ph rotated its markup and the cascade in this file needs to
|
||||
// be updated.
|
||||
var ErrArchiveSelectorMissing = errors.New("archive: required submit-form element not found on archive.ph")
|
||||
|
||||
// urlInputSelectors and submitButtonSelectors are tried in order when
|
||||
// locating the submit form on archive.ph's front page. Updating one of
|
||||
// these in response to archive.ph DOM churn should not require touching
|
||||
// the rest of the file.
|
||||
var urlInputSelectors = []string{
|
||||
"input[name='url']",
|
||||
"input[type='url']",
|
||||
"input.input-url",
|
||||
"input[name='anyway']",
|
||||
}
|
||||
|
||||
var submitButtonSelectors = []string{
|
||||
"form#submiturl input[type='submit']",
|
||||
"form#submiturl button[type='submit']",
|
||||
"input[type='submit'][value*='save' i]",
|
||||
"button[type='submit']",
|
||||
}
|
||||
|
||||
// completionSelectors are DOM markers that, when present, indicate the page
|
||||
// is a finished archived snapshot rather than the /wip/ placeholder.
|
||||
// archive.ph snapshots wrap the page in a header bar + share box; both
|
||||
// vary slightly across snapshots so we accept any of them.
|
||||
var completionSelectors = []string{
|
||||
"div#HEADER",
|
||||
"#HEADER",
|
||||
"div[id^='SHARE']",
|
||||
"#SHARE",
|
||||
"div.TEXT-BLOCK",
|
||||
".TEXT-BLOCK",
|
||||
}
|
||||
|
||||
// archivedIDPattern matches a final archive.ph snapshot URL path.
|
||||
// archive.ph identifiers are short alphanumeric codes (typically 5+ chars)
|
||||
// and the snapshot URL is either:
|
||||
//
|
||||
// https://archive.ph/<id>
|
||||
// https://archive.ph/<id>/<original-url>
|
||||
// https://archive.ph/o/<id>
|
||||
// https://archive.ph/o/<id>/<original-url>
|
||||
//
|
||||
// The pattern matches the path leading character set; callers should also
|
||||
// check the hostname matches archive.ph (or whatever endpoint was configured).
|
||||
var archivedIDPattern = regexp.MustCompile(`^/(?:o/)?[A-Za-z0-9]{5,}(?:/|$)`)
|
||||
|
||||
// pendingPathPatterns lists path prefixes that mean the snapshot is NOT
|
||||
// finished yet — work-in-progress, the submit endpoint, or the lookup
|
||||
// endpoint (/newest/<url>) which redirects through to a snapshot URL.
|
||||
var pendingPathPatterns = []string{"/wip/", "/submit", "/submit/", "/newest/", "/newest"}
|
||||
|
||||
const (
|
||||
// DefaultTimeout is the default upper bound on a single Archive call.
|
||||
// archive.ph normally finishes within seconds; the 5-minute ceiling is
|
||||
// generous enough to cover slow targets while still surfacing wedged
|
||||
// flows to the caller in a reasonable time.
|
||||
DefaultTimeout = 5 * time.Minute
|
||||
|
||||
// defaultPollInterval is how often the polling loop re-checks the
|
||||
// document's URL and DOM for completion markers. archive.ph snapshots
|
||||
// typically finish within seconds; a tight interval makes the call
|
||||
// return promptly without measurable cost (a couple of DOM selectors
|
||||
// against an already-open page).
|
||||
defaultPollInterval = 1 * time.Second
|
||||
|
||||
// defaultProgressLogInterval is how often the polling loop emits a
|
||||
// slog.Info progress line so production logs surface stuck flows.
|
||||
defaultProgressLogInterval = 30 * time.Second
|
||||
|
||||
// defaultInitialIdleWait is the soft cap on how long we wait for the
|
||||
// initial post-submit page to settle. Already-archived URLs typically
|
||||
// redirect to the snapshot almost immediately; new submissions take
|
||||
// longer and the poll loop picks up after this.
|
||||
defaultInitialIdleWait = 8 * time.Second
|
||||
|
||||
// frontPageRetries is the number of additional attempts to open the
|
||||
// archive.ph front page when it returns a 5xx (their own infra
|
||||
// occasionally hiccups).
|
||||
frontPageRetries = 2
|
||||
)
|
||||
|
||||
// frontPageBackoffs is the backoff schedule between front-page retries.
|
||||
// len(frontPageBackoffs) must equal frontPageRetries.
|
||||
var frontPageBackoffs = []time.Duration{1 * time.Second, 4 * time.Second}
|
||||
|
||||
type Config struct {
|
||||
// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
|
||||
Endpoint string
|
||||
|
||||
// Timeout will, if set, cancel any Archive call after this duration.
|
||||
// If nil, the default timeout of 1 hour will be used.
|
||||
Timeout *time.Duration // Timeout for the request, defaults to 1 hour
|
||||
// If nil, DefaultTimeout (5 minutes) is used.
|
||||
Timeout *time.Duration
|
||||
}
|
||||
|
||||
// validate validates the config and sets default values if necessary.
|
||||
func (c Config) validate() Config {
|
||||
|
||||
if c.Timeout == nil {
|
||||
def := 1 * time.Hour
|
||||
def := DefaultTimeout
|
||||
c.Timeout = &def
|
||||
}
|
||||
|
||||
@@ -38,7 +147,13 @@ func (c Config) validate() Config {
|
||||
|
||||
var DefaultConfig = Config{}
|
||||
|
||||
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
|
||||
// IsArchived checks if a url is archived. It returns the archived Document if
|
||||
// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
|
||||
//
|
||||
// Why: callers (e.g. Mort's summary system) want to avoid submitting an
|
||||
// archive request when one already exists.
|
||||
// What: opens archive.ph/newest/<target> and returns the resulting Document.
|
||||
// Test: see archive_test.go TestIsArchived_* (mock-browser based).
|
||||
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
c = c.validate()
|
||||
u, err := url.Parse(target)
|
||||
@@ -52,13 +167,11 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
|
||||
}
|
||||
|
||||
uri := endpoint.JoinPath("/newest")
|
||||
|
||||
uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
|
||||
|
||||
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
|
||||
slog.Info("checking if url is archived", "url", uri.String(), "endpoint", endpoint)
|
||||
|
||||
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||
|
||||
if err != nil {
|
||||
if errors.Is(err, extractor.ErrPageNotFound) {
|
||||
if doc != nil {
|
||||
@@ -82,19 +195,35 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
// IsArchived is a convenience wrapper around DefaultConfig.IsArchived.
|
||||
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
return DefaultConfig.IsArchived(ctx, b, target)
|
||||
}
|
||||
|
||||
// Archive submits target to archive.ph and polls until the snapshot is
|
||||
// complete (transitioned away from /wip/ AND a known DOM completion marker
|
||||
// is present), or the context is cancelled / the timeout fires.
|
||||
//
|
||||
// Why: when Mort's summary system gets bot-checked on the live site it
|
||||
// falls back to reading the archive.ph snapshot. The previous implementation
|
||||
// was happy to return mid-submission /wip/ pages as "success" (placeholder
|
||||
// "Working..." pages with no real content) which made the fallback useless.
|
||||
// What: opens archive.ph's front page, types the target URL into the submit
|
||||
// form, clicks submit, and polls for completion. Returns a typed error if
|
||||
// archive.ph doesn't finish in time so callers can errors.Is and degrade.
|
||||
// Test: archive_test.go covers the URL validator, selector cascade, the
|
||||
// completion detector, and the ctx-cancellation path. The full integration
|
||||
// flow requires a live browser + archive.ph and is hand-tested.
|
||||
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
c = c.validate()
|
||||
var cancel context.CancelFunc
|
||||
|
||||
if c.Timeout != nil {
|
||||
var cancel context.CancelFunc
|
||||
ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
|
||||
slog.Info("setting timeout", "timeout", *c.Timeout)
|
||||
slog.Info("archive: setting timeout", "timeout", *c.Timeout)
|
||||
defer cancel()
|
||||
}
|
||||
|
||||
u, err := url.Parse(target)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid url: %w", err)
|
||||
@@ -105,87 +234,278 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
||||
return nil, fmt.Errorf("invalid endpoint: %w", err)
|
||||
}
|
||||
|
||||
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||
|
||||
doc, err := openArchiveFrontPage(ctx, b, c.Endpoint)
|
||||
if err != nil {
|
||||
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
|
||||
// captcha) the page is kept open by AllowNonOKStatus so the caller
|
||||
// can promote it. Return both the doc and the wrapped error.
|
||||
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
|
||||
return doc, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
urlInput := doc.SelectFirst("input[name='url']")
|
||||
// Fill and submit the form. doc has the page; any error past this point
|
||||
// must close it.
|
||||
urlInput, urlSelector := findURLInput(doc)
|
||||
if urlInput == nil {
|
||||
_ = doc.Close()
|
||||
return nil, fmt.Errorf("failed to find url input element")
|
||||
return nil, fmt.Errorf("%w: tried url-input selectors %v", ErrArchiveSelectorMissing, urlInputSelectors)
|
||||
}
|
||||
|
||||
err = urlInput.Type(u.String())
|
||||
if err != nil {
|
||||
if err = urlInput.Type(u.String()); err != nil {
|
||||
_ = doc.Close()
|
||||
return nil, fmt.Errorf("failed to type url: %w", err)
|
||||
return nil, fmt.Errorf("failed to type url into %q: %w", urlSelector, err)
|
||||
}
|
||||
|
||||
submitBtn := doc.SelectFirst("form#submiturl input[type=\"submit\"]")
|
||||
submitBtn, submitSelector := findSubmitButton(doc)
|
||||
if submitBtn == nil {
|
||||
_ = doc.Close()
|
||||
return nil, fmt.Errorf("failed to find submit button")
|
||||
return nil, fmt.Errorf("%w: tried submit-button selectors %v", ErrArchiveSelectorMissing, submitButtonSelectors)
|
||||
}
|
||||
|
||||
err = submitBtn.Click()
|
||||
if err != nil {
|
||||
if err = submitBtn.Click(); err != nil {
|
||||
_ = doc.Close()
|
||||
return nil, fmt.Errorf("failed to click submit: %w", err)
|
||||
return nil, fmt.Errorf("failed to click submit %q: %w", submitSelector, err)
|
||||
}
|
||||
|
||||
// wait for the page to load, but respect context cancellation
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Debug("context done during initial wait", "err", ctx.Err())
|
||||
// Initial soft idle wait so the post-submit redirect has a chance to
|
||||
// land before we start polling. Already-archived URLs short-circuit
|
||||
// here; new submissions fall through to the polling loop.
|
||||
initialWait := defaultInitialIdleWait
|
||||
if err = doc.WaitForNetworkIdle(&initialWait); err != nil {
|
||||
// Network-idle timing out is normal on archive.ph during a fresh
|
||||
// submission (the /wip/ page polls itself). Don't treat it as
|
||||
// fatal — let the polling loop decide.
|
||||
slog.Debug("archive: initial WaitForNetworkIdle returned", "err", err)
|
||||
}
|
||||
|
||||
if err = ctx.Err(); err != nil {
|
||||
_ = doc.Close()
|
||||
return nil, ctx.Err()
|
||||
case <-time.After(5 * time.Second):
|
||||
}
|
||||
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
|
||||
// the way we can tell this is happening is by checking the url of the page periodically
|
||||
// if the page path starts with /wip/ then we are still waiting
|
||||
// also periodically refresh the page just in case
|
||||
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
keepGoing := true
|
||||
for keepGoing {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Info("context done")
|
||||
keepGoing = false
|
||||
|
||||
case <-ticker.C:
|
||||
archivedUrl, err := url.Parse(doc.URL())
|
||||
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Debug("checking url", "url", archivedUrl.String())
|
||||
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
|
||||
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
|
||||
keepGoing = false
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return doc, doc.WaitForNetworkIdle(nil)
|
||||
// Poll until either (a) the page transitions to a finished snapshot,
|
||||
// (b) the context is cancelled, or (c) the timeout fires (which also
|
||||
// cancels ctx).
|
||||
if err = pollUntilArchived(ctx, doc, endpoint); err != nil {
|
||||
_ = doc.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Final settle: best-effort wait for in-flight asset loads on the
|
||||
// snapshot itself so a downstream Readability call sees stable DOM.
|
||||
settle := 10 * time.Second
|
||||
if err = doc.WaitForNetworkIdle(&settle); err != nil {
|
||||
slog.Debug("archive: final WaitForNetworkIdle returned", "err", err)
|
||||
}
|
||||
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
// Archive is a convenience wrapper around DefaultConfig.Archive.
|
||||
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
return DefaultConfig.Archive(ctx, b, target)
|
||||
}
|
||||
|
||||
// openArchiveFrontPage opens the archive.ph front page, retrying up to
|
||||
// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a
|
||||
// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately
|
||||
// along with the open document, mirroring the IsArchived contract so a
|
||||
// caller can promote it to an InteractiveBrowser.
|
||||
//
|
||||
// Why: archive.ph's own infrastructure occasionally serves 5xx during
|
||||
// load spikes; a single retry generally clears it.
|
||||
// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx.
|
||||
// Test: not unit-tested (would require a fake browser that produces 5xx
|
||||
// then 200); behaviour anchored by the retry count + backoff constants.
|
||||
func openArchiveFrontPage(ctx context.Context, b extractor.Browser, endpoint string) (extractor.Document, error) {
|
||||
var lastErr error
|
||||
for attempt := 0; attempt <= frontPageRetries; attempt++ {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc, err := b.Open(ctx, endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
|
||||
if err == nil {
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
// ErrInvalidStatusCode with the doc kept open means the caller can
|
||||
// promote it to interactive (captcha). We don't retry these — the
|
||||
// underlying page is what the caller wants.
|
||||
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
|
||||
if isTransientStatus(err) && attempt < frontPageRetries {
|
||||
// 5xx — close, back off, retry.
|
||||
_ = doc.Close()
|
||||
lastErr = err
|
||||
slog.Warn("archive: archive.ph returned transient status, retrying", "attempt", attempt+1, "err", err)
|
||||
if !sleepOrCancel(ctx, frontPageBackoffs[attempt]) {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
continue
|
||||
}
|
||||
return doc, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
|
||||
}
|
||||
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
lastErr = err
|
||||
// Don't retry on non-status errors (browser-level failures).
|
||||
return nil, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
|
||||
}
|
||||
return nil, fmt.Errorf("failed to open archive endpoint %q after %d retries: %w", endpoint, frontPageRetries, lastErr)
|
||||
}
|
||||
|
||||
// isTransientStatus reports whether err wraps an HTTP 5xx status from
|
||||
// ErrInvalidStatusCode that we should retry.
|
||||
func isTransientStatus(err error) bool {
|
||||
if !errors.Is(err, extractor.ErrInvalidStatusCode) {
|
||||
return false
|
||||
}
|
||||
// The wrapped error message has the form "invalid status code: <n>".
|
||||
// Parse the trailing integer for the 5xx check.
|
||||
msg := err.Error()
|
||||
// Find the last space and parse what follows.
|
||||
idx := strings.LastIndex(msg, " ")
|
||||
if idx < 0 || idx == len(msg)-1 {
|
||||
return false
|
||||
}
|
||||
tail := msg[idx+1:]
|
||||
if len(tail) != 3 {
|
||||
return false
|
||||
}
|
||||
return tail[0] == '5'
|
||||
}
|
||||
|
||||
// sleepOrCancel blocks for d, returning true if it slept the full duration
|
||||
// and false if ctx was cancelled first.
|
||||
func sleepOrCancel(ctx context.Context, d time.Duration) bool {
|
||||
timer := time.NewTimer(d)
|
||||
defer timer.Stop()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
case <-timer.C:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// findURLInput tries each selector in urlInputSelectors until one matches,
|
||||
// returning the Node and the selector that produced it.
|
||||
func findURLInput(doc extractor.Document) (extractor.Node, string) {
|
||||
for _, sel := range urlInputSelectors {
|
||||
if n := doc.SelectFirst(sel); n != nil {
|
||||
return n, sel
|
||||
}
|
||||
}
|
||||
return nil, ""
|
||||
}
|
||||
|
||||
// findSubmitButton tries each selector in submitButtonSelectors until one
|
||||
// matches, returning the Node and the selector that produced it.
|
||||
func findSubmitButton(doc extractor.Document) (extractor.Node, string) {
|
||||
for _, sel := range submitButtonSelectors {
|
||||
if n := doc.SelectFirst(sel); n != nil {
|
||||
return n, sel
|
||||
}
|
||||
}
|
||||
return nil, ""
|
||||
}
|
||||
|
||||
// pollUntilArchived watches doc until its URL transitions to a final
|
||||
// archive.ph snapshot URL AND a known completion DOM marker is present.
|
||||
// Returns ErrArchiveIncomplete if the context fires while still on /wip/
|
||||
// or /submit, and ctx.Err() if the context was cancelled by the caller for
|
||||
// other reasons (deadline-exceeded surfaces as ErrArchiveIncomplete because
|
||||
// it's almost always the configured Timeout firing).
|
||||
func pollUntilArchived(ctx context.Context, doc extractor.Document, endpoint *url.URL) error {
|
||||
ticker := time.NewTicker(defaultPollInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
progressTicker := time.NewTicker(defaultProgressLogInterval)
|
||||
defer progressTicker.Stop()
|
||||
|
||||
for {
|
||||
// Check on entry as well, so a context that's already cancelled
|
||||
// produces a useful error rather than a spurious "incomplete".
|
||||
if err := ctx.Err(); err != nil {
|
||||
return classifyPollError(err, doc)
|
||||
}
|
||||
|
||||
if isArchiveComplete(doc, endpoint) {
|
||||
slog.Info("archive: snapshot complete", "url", doc.URL())
|
||||
return nil
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return classifyPollError(ctx.Err(), doc)
|
||||
case <-progressTicker.C:
|
||||
slog.Info("archive: still waiting for archive.ph", "url", doc.URL())
|
||||
case <-ticker.C:
|
||||
// fall through to top-of-loop completion check
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// classifyPollError maps a context error into either ErrArchiveIncomplete
|
||||
// (when the doc is still on a /wip/ or /submit page and the timeout fired)
|
||||
// or the underlying ctx error (when the caller cancelled for other reasons).
|
||||
func classifyPollError(ctxErr error, doc extractor.Document) error {
|
||||
if ctxErr == nil {
|
||||
return nil
|
||||
}
|
||||
currentURL := doc.URL()
|
||||
if errors.Is(ctxErr, context.DeadlineExceeded) {
|
||||
return fmt.Errorf("%w (last url: %s): %w", ErrArchiveIncomplete, currentURL, ctxErr)
|
||||
}
|
||||
return fmt.Errorf("archive: cancelled while polling (last url: %s): %w", currentURL, ctxErr)
|
||||
}
|
||||
|
||||
// isArchiveComplete reports whether doc's URL and DOM indicate a finished
|
||||
// archive.ph snapshot. Both signals must agree: a URL transition alone is
|
||||
// not enough (the /wip/ page can occasionally redirect to a stub before
|
||||
// content lands), and a DOM marker alone is not enough (the front page's
|
||||
// own markup overlaps slightly).
|
||||
func isArchiveComplete(doc extractor.Document, endpoint *url.URL) bool {
|
||||
current, err := url.Parse(doc.URL())
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if !isFinalSnapshotURL(current, endpoint) {
|
||||
return false
|
||||
}
|
||||
return hasCompletionMarker(doc)
|
||||
}
|
||||
|
||||
// isFinalSnapshotURL reports whether u looks like a finished archive.ph
|
||||
// snapshot URL. The hostname must match the endpoint and the path must
|
||||
// match archivedIDPattern. /wip/, /submit and the front page are rejected.
|
||||
func isFinalSnapshotURL(u, endpoint *url.URL) bool {
|
||||
if u == nil || endpoint == nil {
|
||||
return false
|
||||
}
|
||||
if u.Hostname() != endpoint.Hostname() {
|
||||
// A redirect off-host (e.g. to the originally-archived URL) is
|
||||
// unusual for archive.ph but if it happened we'd accept it: the
|
||||
// snapshot was clearly produced and the caller asked us to land
|
||||
// somewhere useful.
|
||||
return u.Hostname() != ""
|
||||
}
|
||||
for _, prefix := range pendingPathPatterns {
|
||||
if strings.HasPrefix(u.Path, prefix) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if u.Path == "" || u.Path == "/" {
|
||||
return false
|
||||
}
|
||||
return archivedIDPattern.MatchString(u.Path)
|
||||
}
|
||||
|
||||
// hasCompletionMarker reports whether doc has at least one of the known
|
||||
// archive.ph completion DOM markers.
|
||||
func hasCompletionMarker(doc extractor.Node) bool {
|
||||
for _, sel := range completionSelectors {
|
||||
if n := doc.SelectFirst(sel); n != nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user