// Package archive provides a thin wrapper around archive.ph (a.k.a. // archive.today) for two operations: // // - IsArchived: check whether a target URL already has a snapshot, and // return the snapshot Document if so. // - Archive: submit a target URL to archive.ph, poll until the snapshot is // complete (or the context is cancelled), and return the resulting // Document. // // The submit flow is intentionally defensive: archive.ph occasionally rotates // its form markup, its front page sometimes 5xx's, and the in-progress // "/wip/" pages can hang indefinitely if their JS gets wedged. The // implementation in this file documents and tests each of those failure modes // rather than papering over them. package archive import ( "context" "errors" "fmt" "log/slog" "net/url" "regexp" "strings" "time" "gitea.stevedudenhoeffer.com/steve/go-extractor" ) // ErrArchiveIncomplete is returned when archive.ph never transitions away // from the /wip/ (work-in-progress) or /submit placeholder pages within the // configured timeout. Callers can errors.Is against this sentinel to // distinguish "archive.ph is slow / wedged" from "we got cancelled". var ErrArchiveIncomplete = errors.New("archive: archive.ph did not finish before timeout") // ErrArchiveSelectorMissing is returned when archive.ph's front-page submit // form cannot be found by any of the known fallback selectors. This usually // means archive.ph rotated its markup and the cascade in this file needs to // be updated. var ErrArchiveSelectorMissing = errors.New("archive: required submit-form element not found on archive.ph") // urlInputSelectors and submitButtonSelectors are tried in order when // locating the submit form on archive.ph's front page. Updating one of // these in response to archive.ph DOM churn should not require touching // the rest of the file. 
var (
	// urlInputSelectors is the fallback cascade for the URL text input on
	// archive.ph's front page; selectors are tried in order.
	urlInputSelectors = []string{
		"input[name='url']",
		"input[type='url']",
		"input.input-url",
		"input[name='anyway']",
	}

	// submitButtonSelectors is the fallback cascade for the submit control;
	// selectors are tried in order.
	submitButtonSelectors = []string{
		"form#submiturl input[type='submit']",
		"form#submiturl button[type='submit']",
		"input[type='submit'][value*='save' i]",
		"button[type='submit']",
	}

	// completionSelectors are DOM markers whose presence means the page is a
	// finished archived snapshot rather than the /wip/ placeholder.
	// archive.ph snapshots wrap the page in a header bar + share box; both
	// vary slightly across snapshots, so any one marker is accepted.
	completionSelectors = []string{
		"div#HEADER",
		"#HEADER",
		"div[id^='SHARE']",
		"#SHARE",
		"div.TEXT-BLOCK",
		".TEXT-BLOCK",
	}
)

// archivedIDPattern matches the path of a final archive.ph snapshot URL.
// archive.ph identifiers are short alphanumeric codes (typically 5+ chars)
// and the snapshot URL takes one of these shapes:
//
//	https://archive.ph/<id>
//	https://archive.ph/<id>/
//	https://archive.ph/o/<id>
//	https://archive.ph/o/<id>/
//
// Only the leading path segment is validated here; callers should also check
// that the hostname matches archive.ph (or whatever endpoint was configured).
var archivedIDPattern = regexp.MustCompile(`^/(?:o/)?[A-Za-z0-9]{5,}(?:/|$)`)

// pendingPathPatterns lists path prefixes that mean the snapshot is NOT
// finished yet — work-in-progress, the submit endpoint, or the lookup
// endpoint (/newest/) which redirects through to a snapshot URL.
var pendingPathPatterns = []string{"/wip/", "/submit", "/submit/", "/newest/", "/newest"}

const (
	// DefaultTimeout is the default upper bound on a single Archive call.
	// archive.ph normally finishes within seconds; the 5-minute ceiling is
	// generous enough for slow targets while still surfacing wedged flows to
	// the caller in a reasonable time.
	DefaultTimeout = 5 * time.Minute

	// defaultPollInterval is how often the polling loop re-checks the
	// document's URL and DOM for completion markers. Snapshots typically
	// finish within seconds, so a tight interval makes the call return
	// promptly at negligible cost (a couple of DOM selectors against an
	// already-open page).
	defaultPollInterval = 1 * time.Second

	// defaultProgressLogInterval is how often the polling loop emits a
	// slog.Info progress line so production logs surface stuck flows.
	defaultProgressLogInterval = 30 * time.Second

	// defaultInitialIdleWait soft-caps how long we wait for the initial
	// post-submit page to settle. Already-archived URLs typically redirect
	// to the snapshot almost immediately; new submissions take longer and
	// the poll loop picks up after this.
	defaultInitialIdleWait = 8 * time.Second

	// frontPageRetries is the number of additional attempts to open the
	// archive.ph front page when it returns a 5xx (their own infra
	// occasionally hiccups).
	frontPageRetries = 2
)

// frontPageBackoffs is the backoff schedule between front-page retries.
// len(frontPageBackoffs) must equal frontPageRetries.
var frontPageBackoffs = []time.Duration{1 * time.Second, 4 * time.Second}

// Config controls which archive endpoint is targeted and how long a single
// Archive call may run.
type Config struct {
	// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
	Endpoint string

	// Timeout will, if set, cancel any Archive call after this duration.
	// If nil, DefaultTimeout (5 minutes) is used.
	Timeout *time.Duration
}

// validate returns a copy of c with any zero-valued field replaced by its
// default.
func (c Config) validate() Config {
	if c.Endpoint == "" {
		c.Endpoint = "https://archive.ph"
	}
	if c.Timeout == nil {
		d := DefaultTimeout
		c.Timeout = &d
	}
	return c
}

// DefaultConfig is the zero Config; every default is filled in by validate.
var DefaultConfig = Config{}

// IsArchived checks if a url is archived. It returns the archived Document if
// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
//
// Why: callers (e.g. Mort's summary system) want to avoid submitting an
// archive request when one already exists.
// What: opens archive.ph/newest/<url> and returns the resulting Document.
// Test: see archive_test.go TestIsArchived_* (mock-browser based).
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()
	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}
	// archive.ph's lookup endpoint is /newest/<full-target-url>; the target
	// URL is appended verbatim (scheme and all) as the rest of the path.
	uri := endpoint.JoinPath("/newest")
	uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
	slog.Info("checking if url is archived", "url", uri.String(), "endpoint", endpoint)
	doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
	if err != nil {
		// No snapshot exists: not an error for this API — (nil, nil).
		if errors.Is(err, extractor.ErrPageNotFound) {
			if doc != nil {
				_ = doc.Close()
			}
			return nil, nil
		}
		// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
		// captcha) the page is kept open by AllowNonOKStatus so the caller
		// can promote it to an InteractiveBrowser and let a human solve
		// the challenge. Return both the doc and the wrapped error.
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			return doc, fmt.Errorf("failed to open url: %w", err)
		}
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}
	return doc, nil
}

// IsArchived is a convenience wrapper around DefaultConfig.IsArchived.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}

// Archive submits target to archive.ph and polls until the snapshot is
// complete (transitioned away from /wip/ AND a known DOM completion marker
// is present), or the context is cancelled / the timeout fires.
//
// Why: when Mort's summary system gets bot-checked on the live site it
// falls back to reading the archive.ph snapshot. The previous implementation
// was happy to return mid-submission /wip/ pages as "success" (placeholder
// "Working..." pages with no real content) which made the fallback useless.
// What: opens archive.ph's front page, types the target URL into the submit
// form, clicks submit, and polls for completion. Returns a typed error if
// archive.ph doesn't finish in time so callers can errors.Is and degrade.
// Test: archive_test.go covers the URL validator, selector cascade, the
// completion detector, and the ctx-cancellation path. The full integration
// flow requires a live browser + archive.ph and is hand-tested.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()
	// validate guarantees Timeout is non-nil, so this always installs the
	// deadline; the nil-check is defensive.
	if c.Timeout != nil {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
		slog.Info("archive: setting timeout", "timeout", *c.Timeout)
		defer cancel()
	}
	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}
	doc, err := openArchiveFrontPage(ctx, b, c.Endpoint)
	if err != nil {
		return nil, err
	}
	// Fill and submit the form. doc has the page; any error past this point
	// must close it.
	urlInput, urlSelector := findURLInput(doc)
	if urlInput == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("%w: tried url-input selectors %v", ErrArchiveSelectorMissing, urlInputSelectors)
	}
	if err = urlInput.Type(u.String()); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url into %q: %w", urlSelector, err)
	}
	submitBtn, submitSelector := findSubmitButton(doc)
	if submitBtn == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("%w: tried submit-button selectors %v", ErrArchiveSelectorMissing, submitButtonSelectors)
	}
	if err = submitBtn.Click(); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit %q: %w", submitSelector, err)
	}
	// Initial soft idle wait so the post-submit redirect has a chance to
	// land before we start polling. Already-archived URLs short-circuit
	// here; new submissions fall through to the polling loop.
	initialWait := defaultInitialIdleWait
	if err = doc.WaitForNetworkIdle(&initialWait); err != nil {
		// Network-idle timing out is normal on archive.ph during a fresh
		// submission (the /wip/ page polls itself). Don't treat it as
		// fatal — let the polling loop decide.
		slog.Debug("archive: initial WaitForNetworkIdle returned", "err", err)
	}
	if err = ctx.Err(); err != nil {
		_ = doc.Close()
		return nil, err
	}
	// Poll until either (a) the page transitions to a finished snapshot,
	// (b) the context is cancelled, or (c) the timeout fires (which also
	// cancels ctx).
	if err = pollUntilArchived(ctx, doc, endpoint); err != nil {
		_ = doc.Close()
		return nil, err
	}
	// Final settle: best-effort wait for in-flight asset loads on the
	// snapshot itself so a downstream Readability call sees stable DOM.
	settle := 10 * time.Second
	if err = doc.WaitForNetworkIdle(&settle); err != nil {
		slog.Debug("archive: final WaitForNetworkIdle returned", "err", err)
	}
	return doc, nil
}

// Archive is a convenience wrapper around DefaultConfig.Archive.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}

// openArchiveFrontPage opens the archive.ph front page, retrying up to
// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a
// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately
// along with the open document, mirroring the IsArchived contract so a
// caller can promote it to an InteractiveBrowser.
//
// Why: archive.ph's own infrastructure occasionally serves 5xx during
// load spikes; a single retry generally clears it.
// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx.
// Test: not unit-tested (would require a fake browser that produces 5xx
// then 200); behaviour anchored by the retry count + backoff constants.
func openArchiveFrontPage(ctx context.Context, b extractor.Browser, endpoint string) (extractor.Document, error) { var lastErr error for attempt := 0; attempt <= frontPageRetries; attempt++ { if err := ctx.Err(); err != nil { return nil, err } doc, err := b.Open(ctx, endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true}) if err == nil { return doc, nil } // ErrInvalidStatusCode with the doc kept open means the caller can // promote it to interactive (captcha). We don't retry these — the // underlying page is what the caller wants. if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil { if isTransientStatus(err) && attempt < frontPageRetries { // 5xx — close, back off, retry. _ = doc.Close() lastErr = err slog.Warn("archive: archive.ph returned transient status, retrying", "attempt", attempt+1, "err", err) if !sleepOrCancel(ctx, frontPageBackoffs[attempt]) { return nil, ctx.Err() } continue } return doc, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err) } if doc != nil { _ = doc.Close() } lastErr = err // Don't retry on non-status errors (browser-level failures). return nil, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err) } return nil, fmt.Errorf("failed to open archive endpoint %q after %d retries: %w", endpoint, frontPageRetries, lastErr) } // isTransientStatus reports whether err wraps an HTTP 5xx status from // ErrInvalidStatusCode that we should retry. func isTransientStatus(err error) bool { if !errors.Is(err, extractor.ErrInvalidStatusCode) { return false } // The wrapped error message has the form "invalid status code: ". // Parse the trailing integer for the 5xx check. msg := err.Error() // Find the last space and parse what follows. 
idx := strings.LastIndex(msg, " ") if idx < 0 || idx == len(msg)-1 { return false } tail := msg[idx+1:] if len(tail) != 3 { return false } return tail[0] == '5' } // sleepOrCancel blocks for d, returning true if it slept the full duration // and false if ctx was cancelled first. func sleepOrCancel(ctx context.Context, d time.Duration) bool { timer := time.NewTimer(d) defer timer.Stop() select { case <-ctx.Done(): return false case <-timer.C: return true } } // findURLInput tries each selector in urlInputSelectors until one matches, // returning the Node and the selector that produced it. func findURLInput(doc extractor.Document) (extractor.Node, string) { for _, sel := range urlInputSelectors { if n := doc.SelectFirst(sel); n != nil { return n, sel } } return nil, "" } // findSubmitButton tries each selector in submitButtonSelectors until one // matches, returning the Node and the selector that produced it. func findSubmitButton(doc extractor.Document) (extractor.Node, string) { for _, sel := range submitButtonSelectors { if n := doc.SelectFirst(sel); n != nil { return n, sel } } return nil, "" } // pollUntilArchived watches doc until its URL transitions to a final // archive.ph snapshot URL AND a known completion DOM marker is present. // Returns ErrArchiveIncomplete if the context fires while still on /wip/ // or /submit, and ctx.Err() if the context was cancelled by the caller for // other reasons (deadline-exceeded surfaces as ErrArchiveIncomplete because // it's almost always the configured Timeout firing). func pollUntilArchived(ctx context.Context, doc extractor.Document, endpoint *url.URL) error { ticker := time.NewTicker(defaultPollInterval) defer ticker.Stop() progressTicker := time.NewTicker(defaultProgressLogInterval) defer progressTicker.Stop() for { // Check on entry as well, so a context that's already cancelled // produces a useful error rather than a spurious "incomplete". 
if err := ctx.Err(); err != nil { return classifyPollError(err, doc) } if isArchiveComplete(doc, endpoint) { slog.Info("archive: snapshot complete", "url", doc.URL()) return nil } select { case <-ctx.Done(): return classifyPollError(ctx.Err(), doc) case <-progressTicker.C: slog.Info("archive: still waiting for archive.ph", "url", doc.URL()) case <-ticker.C: // fall through to top-of-loop completion check } } } // classifyPollError maps a context error into either ErrArchiveIncomplete // (when the doc is still on a /wip/ or /submit page and the timeout fired) // or the underlying ctx error (when the caller cancelled for other reasons). func classifyPollError(ctxErr error, doc extractor.Document) error { if ctxErr == nil { return nil } currentURL := doc.URL() if errors.Is(ctxErr, context.DeadlineExceeded) { return fmt.Errorf("%w (last url: %s): %w", ErrArchiveIncomplete, currentURL, ctxErr) } return fmt.Errorf("archive: cancelled while polling (last url: %s): %w", currentURL, ctxErr) } // isArchiveComplete reports whether doc's URL and DOM indicate a finished // archive.ph snapshot. Both signals must agree: a URL transition alone is // not enough (the /wip/ page can occasionally redirect to a stub before // content lands), and a DOM marker alone is not enough (the front page's // own markup overlaps slightly). func isArchiveComplete(doc extractor.Document, endpoint *url.URL) bool { current, err := url.Parse(doc.URL()) if err != nil { return false } if !isFinalSnapshotURL(current, endpoint) { return false } return hasCompletionMarker(doc) } // isFinalSnapshotURL reports whether u looks like a finished archive.ph // snapshot URL. The hostname must match the endpoint and the path must // match archivedIDPattern. /wip/, /submit and the front page are rejected. func isFinalSnapshotURL(u, endpoint *url.URL) bool { if u == nil || endpoint == nil { return false } if u.Hostname() != endpoint.Hostname() { // A redirect off-host (e.g. 
to the originally-archived URL) is // unusual for archive.ph but if it happened we'd accept it: the // snapshot was clearly produced and the caller asked us to land // somewhere useful. return u.Hostname() != "" } for _, prefix := range pendingPathPatterns { if strings.HasPrefix(u.Path, prefix) { return false } } if u.Path == "" || u.Path == "/" { return false } return archivedIDPattern.MatchString(u.Path) } // hasCompletionMarker reports whether doc has at least one of the known // archive.ph completion DOM markers. func hasCompletionMarker(doc extractor.Node) bool { for _, sel := range completionSelectors { if n := doc.SelectFirst(sel); n != nil { return true } } return false }