Files
go-extractor/sites/archive/archive.go
T
steve 45fa7c4e8f
CI / build (pull_request) Successful in 1m5s
CI / vet (pull_request) Successful in 1m26s
CI / test (pull_request) Successful in 1m27s
fix(archive): harden archive.ph submit/poll flow
The archive.ph submission flow had several defects that caused Mort's
summary fallback to return placeholder "Working..." pages instead of
real archived content, or hang for the full timeout:

- Context cancellation in the poll loop fell through to a final
  WaitForNetworkIdle and returned the doc as success. The function now
  returns a typed error (ErrArchiveIncomplete on deadline, wrapped
  ctx.Err() on caller cancel).
- The poll only checked doc.URL() — if archive.ph's JS got wedged on
  /wip/<id>, the loop spun until timeout. Completion now also requires
  a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only
  transitions don't satisfy the check.
- The final URL is now validated against an alphanumeric ID pattern,
  rejecting /wip/, /submit, /newest/ and the front page.
- 5-second blind sleep before polling replaced with a bounded
  WaitForNetworkIdle that short-circuits when already archived.
- Form selectors now use a cascade (input[name='url'] →
  input[type='url'] → input.input-url → input[name='anyway'], and
  similar for the submit button) so a single archive.ph markup change
  doesn't kill the flow. Errors name which selectors were tried.
- Default timeout lowered from 1 hour to 5 minutes (still overridable
  via context deadline). Exposed as DefaultTimeout.
- Poll progress is now logged at slog.Info every 30s so production logs
  surface stuck flows.
- Front-page 5xx now retries twice with 1s/4s backoff before failing.
- New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing.
- Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix),
  selector cascade, DOM completion detector, transient status
  classification, and ctx cancellation paths via a thread-safe mutating
  mock document. Full integration with a live browser remains hand-tested.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 17:23:24 -04:00

512 lines
18 KiB
Go

// Package archive provides a thin wrapper around archive.ph (a.k.a.
// archive.today) for two operations:
//
// - IsArchived: check whether a target URL already has a snapshot, and
// return the snapshot Document if so.
// - Archive: submit a target URL to archive.ph, poll until the snapshot is
// complete (or the context is cancelled), and return the resulting
// Document.
//
// The submit flow is intentionally defensive: archive.ph occasionally rotates
// its form markup, its front page sometimes 5xx's, and the in-progress
// "/wip/<id>" pages can hang indefinitely if their JS gets wedged. The
// implementation in this file documents and tests each of those failure modes
// rather than papering over them.
package archive
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// ErrArchiveIncomplete is returned when archive.ph never transitions away
// from the /wip/ (work-in-progress) or /submit placeholder pages within the
// configured timeout. Callers can errors.Is against this sentinel to
// distinguish "archive.ph is slow / wedged" from "we got cancelled".
var ErrArchiveIncomplete = errors.New("archive: archive.ph did not finish before timeout")

// ErrArchiveSelectorMissing is returned when archive.ph's front-page submit
// form cannot be found by any of the known fallback selectors. This usually
// means archive.ph rotated its markup and the cascade in this file needs to
// be updated.
var ErrArchiveSelectorMissing = errors.New("archive: required submit-form element not found on archive.ph")

// urlInputSelectors and submitButtonSelectors are tried in order when
// locating the submit form on archive.ph's front page; the first selector
// that matches wins, so keep the most specific/current markup first.
// Updating one of these in response to archive.ph DOM churn should not
// require touching the rest of the file.
var urlInputSelectors = []string{
	"input[name='url']",
	"input[type='url']",
	"input.input-url",
	"input[name='anyway']",
}

// submitButtonSelectors is the companion cascade for the form's submit
// control; see urlInputSelectors for the ordering convention.
var submitButtonSelectors = []string{
	"form#submiturl input[type='submit']",
	"form#submiturl button[type='submit']",
	"input[type='submit'][value*='save' i]",
	"button[type='submit']",
}

// completionSelectors are DOM markers that, when present, indicate the page
// is a finished archived snapshot rather than the /wip/ placeholder.
// archive.ph snapshots wrap the page in a header bar + share box; both
// vary slightly across snapshots so we accept any of them.
var completionSelectors = []string{
	"div#HEADER",
	"#HEADER",
	"div[id^='SHARE']",
	"#SHARE",
	"div.TEXT-BLOCK",
	".TEXT-BLOCK",
}

// archivedIDPattern matches a final archive.ph snapshot URL path.
// archive.ph identifiers are short alphanumeric codes (typically 5+ chars)
// and the snapshot URL is either:
//
//	https://archive.ph/<id>
//	https://archive.ph/<id>/<original-url>
//	https://archive.ph/o/<id>
//	https://archive.ph/o/<id>/<original-url>
//
// The pattern matches the path leading character set; callers should also
// check the hostname matches archive.ph (or whatever endpoint was configured).
var archivedIDPattern = regexp.MustCompile(`^/(?:o/)?[A-Za-z0-9]{5,}(?:/|$)`)

// pendingPathPatterns lists path prefixes (compared with strings.HasPrefix)
// that mean the snapshot is NOT finished yet — work-in-progress, the submit
// endpoint, or the lookup endpoint (/newest/<url>) which redirects through
// to a snapshot URL. Checked before archivedIDPattern so e.g. "/newest" can
// never be mistaken for a snapshot ID.
var pendingPathPatterns = []string{"/wip/", "/submit", "/submit/", "/newest/", "/newest"}
const (
	// DefaultTimeout is the default upper bound on a single Archive call.
	// archive.ph normally finishes within seconds; the 5-minute ceiling is
	// generous enough to cover slow targets while still surfacing wedged
	// flows to the caller in a reasonable time.
	DefaultTimeout = 5 * time.Minute

	// defaultPollInterval is how often the polling loop re-checks the
	// document's URL and DOM for completion markers. archive.ph snapshots
	// typically finish within seconds; a tight interval makes the call
	// return promptly without measurable cost (a couple of DOM selectors
	// against an already-open page).
	defaultPollInterval = 1 * time.Second

	// defaultProgressLogInterval is how often the polling loop emits a
	// slog.Info progress line so production logs surface stuck flows.
	defaultProgressLogInterval = 30 * time.Second

	// defaultInitialIdleWait is the soft cap on how long we wait for the
	// initial post-submit page to settle. Already-archived URLs typically
	// redirect to the snapshot almost immediately; new submissions take
	// longer and the poll loop picks up after this.
	defaultInitialIdleWait = 8 * time.Second

	// frontPageRetries is the number of additional attempts to open the
	// archive.ph front page when it returns a 5xx (their own infra
	// occasionally hiccups).
	frontPageRetries = 2
)

// frontPageBackoffs is the backoff schedule between front-page retries.
// Invariant: len(frontPageBackoffs) must equal frontPageRetries — the retry
// loop indexes this slice by attempt number.
var frontPageBackoffs = []time.Duration{1 * time.Second, 4 * time.Second}
// Config controls how the archive endpoint is contacted. The zero value is
// usable: unset fields are filled in with defaults by validate().
type Config struct {
	// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
	Endpoint string
	// Timeout will, if set, cancel any Archive call after this duration.
	// If nil, DefaultTimeout (5 minutes) is used.
	Timeout *time.Duration
}
// validate validates the config and sets default values if necessary.
func (c Config) validate() Config {
if c.Timeout == nil {
def := DefaultTimeout
c.Timeout = &def
}
if c.Endpoint == "" {
c.Endpoint = "https://archive.ph"
}
return c
}
var DefaultConfig = Config{}
// IsArchived checks if a url is archived. It returns the archived Document if
// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
//
// Why: callers (e.g. Mort's summary system) want to avoid submitting an
// archive request when one already exists.
// What: opens archive.ph/newest/<target> and returns the resulting Document.
// Test: see archive_test.go TestIsArchived_* (mock-browser based).
// IsArchived checks if a url is archived. It returns the archived Document if
// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
// The caller owns any non-nil returned Document and must Close it.
//
// Why: callers (e.g. Mort's summary system) want to avoid submitting an
// archive request when one already exists.
// What: opens archive.ph/newest/<target> and returns the resulting Document.
// Test: see archive_test.go TestIsArchived_* (mock-browser based).
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()
	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}
	// Build <endpoint>/newest/<target> by splicing the raw target (scheme
	// and all) onto the path. Reserved characters in the target (e.g. '?')
	// end up percent-escaped when uri.String() serializes the path.
	// NOTE(review): assumes archive.ph accepts the escaped form for
	// query-bearing targets — confirm if such targets ever misbehave.
	uri := endpoint.JoinPath("/newest")
	uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
	slog.Info("checking if url is archived", "url", uri.String(), "endpoint", endpoint)
	doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
	if err != nil {
		// ErrPageNotFound means archive.ph has no snapshot for the target:
		// that is the "not archived" answer, not an error — report (nil, nil).
		if errors.Is(err, extractor.ErrPageNotFound) {
			if doc != nil {
				_ = doc.Close()
			}
			return nil, nil
		}
		// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
		// captcha) the page is kept open by AllowNonOKStatus so the caller
		// can promote it to an InteractiveBrowser and let a human solve
		// the challenge. Return both the doc and the wrapped error.
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			return doc, fmt.Errorf("failed to open url: %w", err)
		}
		// Any other failure: release the page (if one was opened) and
		// surface the error.
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}
	return doc, nil
}
// IsArchived is a convenience wrapper around DefaultConfig.IsArchived.
// IsArchived is a convenience wrapper around DefaultConfig.IsArchived,
// checking target against archive.ph with the default timeout/endpoint.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to archive.ph and polls until the snapshot is
// complete (transitioned away from /wip/ AND a known DOM completion marker
// is present), or the context is cancelled / the timeout fires.
//
// Why: when Mort's summary system gets bot-checked on the live site it
// falls back to reading the archive.ph snapshot. The previous implementation
// was happy to return mid-submission /wip/ pages as "success" (placeholder
// "Working..." pages with no real content) which made the fallback useless.
// What: opens archive.ph's front page, types the target URL into the submit
// form, clicks submit, and polls for completion. Returns a typed error if
// archive.ph doesn't finish in time so callers can errors.Is and degrade.
// Test: archive_test.go covers the URL validator, selector cascade, the
// completion detector, and the ctx-cancellation path. The full integration
// flow requires a live browser + archive.ph and is hand-tested.
// Archive submits target to archive.ph and polls until the snapshot is
// complete (transitioned away from /wip/ AND a known DOM completion marker
// is present), or the context is cancelled / the timeout fires.
//
// Why: when Mort's summary system gets bot-checked on the live site it
// falls back to reading the archive.ph snapshot. The previous implementation
// was happy to return mid-submission /wip/ pages as "success" (placeholder
// "Working..." pages with no real content) which made the fallback useless.
// What: opens archive.ph's front page, types the target URL into the submit
// form, clicks submit, and polls for completion. Returns a typed error if
// archive.ph doesn't finish in time so callers can errors.Is and degrade.
// When the front page comes back with ErrInvalidStatusCode and an open page
// (e.g. 403 + captcha), the open Document is returned alongside the error —
// mirroring IsArchived — so the caller can promote it to an
// InteractiveBrowser; the caller must Close it.
// Test: archive_test.go covers the URL validator, selector cascade, the
// completion detector, and the ctx-cancellation path. The full integration
// flow requires a live browser + archive.ph and is hand-tested.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()
	// validate() guarantees Timeout is non-nil (DefaultTimeout when unset),
	// so every Archive call has a bounded deadline.
	var cancel context.CancelFunc
	ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
	slog.Info("archive: setting timeout", "timeout", *c.Timeout)
	defer cancel()
	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}
	doc, err := openArchiveFrontPage(ctx, b, c.Endpoint)
	if err != nil {
		// openArchiveFrontPage deliberately keeps the page open on a
		// non-transient ErrInvalidStatusCode (captcha case) so the caller
		// can promote it. Propagate the doc instead of dropping it —
		// returning nil here leaked the open page and broke that contract.
		return doc, err
	}
	// Fill and submit the form. doc has the page; any error past this point
	// must close it.
	urlInput, urlSelector := findURLInput(doc)
	if urlInput == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("%w: tried url-input selectors %v", ErrArchiveSelectorMissing, urlInputSelectors)
	}
	if err = urlInput.Type(u.String()); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url into %q: %w", urlSelector, err)
	}
	submitBtn, submitSelector := findSubmitButton(doc)
	if submitBtn == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("%w: tried submit-button selectors %v", ErrArchiveSelectorMissing, submitButtonSelectors)
	}
	if err = submitBtn.Click(); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit %q: %w", submitSelector, err)
	}
	// Initial soft idle wait so the post-submit redirect has a chance to
	// land before we start polling. Already-archived URLs short-circuit
	// here; new submissions fall through to the polling loop.
	initialWait := defaultInitialIdleWait
	if err = doc.WaitForNetworkIdle(&initialWait); err != nil {
		// Network-idle timing out is normal on archive.ph during a fresh
		// submission (the /wip/ page polls itself). Don't treat it as
		// fatal — let the polling loop decide.
		slog.Debug("archive: initial WaitForNetworkIdle returned", "err", err)
	}
	if err = ctx.Err(); err != nil {
		_ = doc.Close()
		return nil, err
	}
	// Poll until either (a) the page transitions to a finished snapshot,
	// (b) the context is cancelled, or (c) the timeout fires (which also
	// cancels ctx).
	if err = pollUntilArchived(ctx, doc, endpoint); err != nil {
		_ = doc.Close()
		return nil, err
	}
	// Final settle: best-effort wait for in-flight asset loads on the
	// snapshot itself so a downstream Readability call sees stable DOM.
	settle := 10 * time.Second
	if err = doc.WaitForNetworkIdle(&settle); err != nil {
		slog.Debug("archive: final WaitForNetworkIdle returned", "err", err)
	}
	return doc, nil
}
// Archive is a convenience wrapper around DefaultConfig.Archive.
// Archive is a convenience wrapper around DefaultConfig.Archive, submitting
// target to archive.ph with the default timeout/endpoint.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}
// openArchiveFrontPage opens the archive.ph front page, retrying up to
// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a
// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately
// along with the open document, mirroring the IsArchived contract so a
// caller can promote it to an InteractiveBrowser.
//
// Why: archive.ph's own infrastructure occasionally serves 5xx during
// load spikes; a single retry generally clears it.
// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx.
// Test: not unit-tested (would require a fake browser that produces 5xx
// then 200); behaviour anchored by the retry count + backoff constants.
// openArchiveFrontPage opens the archive.ph front page, retrying up to
// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a
// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately
// along with the open document, mirroring the IsArchived contract so a
// caller can promote it to an InteractiveBrowser.
//
// Why: archive.ph's own infrastructure occasionally serves 5xx during
// load spikes; a single retry generally clears it.
// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx.
// If the retry budget is exhausted while still getting 5xx, the last page
// is closed and the final error wraps the last 5xx — previously the last
// attempt handed back an open 5xx page, which callers leaked.
// Test: not unit-tested (would require a fake browser that produces 5xx
// then 200); behaviour anchored by the retry count + backoff constants.
func openArchiveFrontPage(ctx context.Context, b extractor.Browser, endpoint string) (extractor.Document, error) {
	var lastErr error
	for attempt := 0; attempt <= frontPageRetries; attempt++ {
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		doc, err := b.Open(ctx, endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
		if err == nil {
			return doc, nil
		}
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			if !isTransientStatus(err) {
				// Non-5xx status with the doc kept open means the caller
				// can promote it to interactive (captcha). We don't retry
				// these — the underlying page is what the caller wants.
				return doc, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
			}
			// Transient 5xx — close the page and retry if budget remains;
			// otherwise fall through to the terminal error below.
			_ = doc.Close()
			lastErr = err
			if attempt < frontPageRetries {
				slog.Warn("archive: archive.ph returned transient status, retrying", "attempt", attempt+1, "err", err)
				if !sleepOrCancel(ctx, frontPageBackoffs[attempt]) {
					return nil, ctx.Err()
				}
			}
			continue
		}
		if doc != nil {
			_ = doc.Close()
		}
		// Don't retry on non-status errors (browser-level failures).
		return nil, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
	}
	return nil, fmt.Errorf("failed to open archive endpoint %q after %d retries: %w", endpoint, frontPageRetries, lastErr)
}
// isTransientStatus reports whether err wraps an HTTP 5xx status from
// ErrInvalidStatusCode that we should retry.
// isTransientStatus reports whether err wraps an HTTP 5xx status from
// ErrInvalidStatusCode that we should retry.
//
// The wrapped error message has the form "invalid status code: <n>"; the
// trailing token is parsed as an integer and checked against the 500-599
// range. Parsing (rather than checking the first character) rejects
// non-numeric tails like "5xx" that the old byte test accepted.
func isTransientStatus(err error) bool {
	if !errors.Is(err, extractor.ErrInvalidStatusCode) {
		return false
	}
	msg := err.Error()
	idx := strings.LastIndex(msg, " ")
	if idx < 0 || idx == len(msg)-1 {
		return false
	}
	code, convErr := strconv.Atoi(msg[idx+1:])
	if convErr != nil {
		return false
	}
	return code >= 500 && code <= 599
}
// sleepOrCancel blocks for d, returning true if it slept the full duration
// and false if ctx was cancelled first.
func sleepOrCancel(ctx context.Context, d time.Duration) bool {
timer := time.NewTimer(d)
defer timer.Stop()
select {
case <-ctx.Done():
return false
case <-timer.C:
return true
}
}
// findURLInput tries each selector in urlInputSelectors until one matches,
// returning the Node and the selector that produced it.
func findURLInput(doc extractor.Document) (extractor.Node, string) {
for _, sel := range urlInputSelectors {
if n := doc.SelectFirst(sel); n != nil {
return n, sel
}
}
return nil, ""
}
// findSubmitButton tries each selector in submitButtonSelectors until one
// matches, returning the Node and the selector that produced it.
func findSubmitButton(doc extractor.Document) (extractor.Node, string) {
for _, sel := range submitButtonSelectors {
if n := doc.SelectFirst(sel); n != nil {
return n, sel
}
}
return nil, ""
}
// pollUntilArchived watches doc until its URL transitions to a final
// archive.ph snapshot URL AND a known completion DOM marker is present.
// Returns ErrArchiveIncomplete if the context fires while still on /wip/
// or /submit, and ctx.Err() if the context was cancelled by the caller for
// other reasons (deadline-exceeded surfaces as ErrArchiveIncomplete because
// it's almost always the configured Timeout firing).
// pollUntilArchived watches doc until its URL transitions to a final
// archive.ph snapshot URL AND a known completion DOM marker is present.
// Returns ErrArchiveIncomplete if the context fires while still on /wip/
// or /submit, and ctx.Err() if the context was cancelled by the caller for
// other reasons (deadline-exceeded surfaces as ErrArchiveIncomplete because
// it's almost always the configured Timeout firing).
func pollUntilArchived(ctx context.Context, doc extractor.Document, endpoint *url.URL) error {
	poll := time.NewTicker(defaultPollInterval)
	defer poll.Stop()
	progress := time.NewTicker(defaultProgressLogInterval)
	defer progress.Stop()
	for {
		// An already-cancelled context is reported via classifyPollError so
		// the caller gets a useful error rather than a spurious "incomplete".
		if err := ctx.Err(); err != nil {
			return classifyPollError(err, doc)
		}
		if isArchiveComplete(doc, endpoint) {
			slog.Info("archive: snapshot complete", "url", doc.URL())
			return nil
		}
		select {
		case <-poll.C:
			// loop back to the completion check above
		case <-progress.C:
			slog.Info("archive: still waiting for archive.ph", "url", doc.URL())
		case <-ctx.Done():
			return classifyPollError(ctx.Err(), doc)
		}
	}
}
// classifyPollError maps a context error into either ErrArchiveIncomplete
// (when the doc is still on a /wip/ or /submit page and the timeout fired)
// or the underlying ctx error (when the caller cancelled for other reasons).
func classifyPollError(ctxErr error, doc extractor.Document) error {
if ctxErr == nil {
return nil
}
currentURL := doc.URL()
if errors.Is(ctxErr, context.DeadlineExceeded) {
return fmt.Errorf("%w (last url: %s): %w", ErrArchiveIncomplete, currentURL, ctxErr)
}
return fmt.Errorf("archive: cancelled while polling (last url: %s): %w", currentURL, ctxErr)
}
// isArchiveComplete reports whether doc's URL and DOM indicate a finished
// archive.ph snapshot. Both signals must agree: a URL transition alone is
// not enough (the /wip/ page can occasionally redirect to a stub before
// content lands), and a DOM marker alone is not enough (the front page's
// own markup overlaps slightly).
// isArchiveComplete reports whether doc's URL and DOM indicate a finished
// archive.ph snapshot. Both signals must agree: a URL transition alone is
// not enough (the /wip/ page can occasionally redirect to a stub before
// content lands), and a DOM marker alone is not enough (the front page's
// own markup overlaps slightly).
func isArchiveComplete(doc extractor.Document, endpoint *url.URL) bool {
	parsed, err := url.Parse(doc.URL())
	if err != nil || !isFinalSnapshotURL(parsed, endpoint) {
		return false
	}
	return hasCompletionMarker(doc)
}
// isFinalSnapshotURL reports whether u looks like a finished archive.ph
// snapshot URL. The hostname must match the endpoint and the path must
// match archivedIDPattern. /wip/, /submit and the front page are rejected.
func isFinalSnapshotURL(u, endpoint *url.URL) bool {
if u == nil || endpoint == nil {
return false
}
if u.Hostname() != endpoint.Hostname() {
// A redirect off-host (e.g. to the originally-archived URL) is
// unusual for archive.ph but if it happened we'd accept it: the
// snapshot was clearly produced and the caller asked us to land
// somewhere useful.
return u.Hostname() != ""
}
for _, prefix := range pendingPathPatterns {
if strings.HasPrefix(u.Path, prefix) {
return false
}
}
if u.Path == "" || u.Path == "/" {
return false
}
return archivedIDPattern.MatchString(u.Path)
}
// hasCompletionMarker reports whether doc has at least one of the known
// archive.ph completion DOM markers.
func hasCompletionMarker(doc extractor.Node) bool {
for _, sel := range completionSelectors {
if n := doc.SelectFirst(sel); n != nil {
return true
}
}
return false
}