Compare commits

...

2 Commits

Author SHA1 Message Date
steve cccf3c4f83 Merge pull request 'fix(archive): harden archive.ph submit/poll flow' (#87) from fix/archive-ph-poll-hardening into main
CI / build (push) Successful in 38s
CI / vet (push) Successful in 48s
CI / test (push) Successful in 49s
2026-05-15 22:39:40 +00:00
steve 45fa7c4e8f fix(archive): harden archive.ph submit/poll flow
CI / build (pull_request) Successful in 1m5s
CI / vet (pull_request) Successful in 1m26s
CI / test (pull_request) Successful in 1m27s
The archive.ph submission flow had several defects that caused Mort's
summary fallback to return placeholder "Working..." pages instead of
real archived content, or hang for the full timeout:

- Context cancellation in the poll loop fell through to a final
  WaitForNetworkIdle and returned the doc as success. The function now
  returns a typed error (ErrArchiveIncomplete on deadline, wrapped
  ctx.Err() on caller cancel).
- The poll only checked doc.URL() — if archive.ph's JS got wedged on
  /wip/<id>, the loop spun until timeout. Completion now also requires
  a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only
  transitions don't satisfy the check.
- The final URL is now validated against an alphanumeric ID pattern,
  rejecting /wip/, /submit, /newest/ and the front page.
- 5-second blind sleep before polling replaced with a bounded
  WaitForNetworkIdle that short-circuits when already archived.
- Form selectors now use a cascade (input[name='url'] →
  input[type='url'] → input.input-url → input[name='anyway'], and
  similar for the submit button) so a single archive.ph markup change
  doesn't kill the flow. Errors name which selectors were tried.
- Default timeout lowered from 1 hour to 5 minutes (still overridable
  via context deadline). Exposed as DefaultTimeout.
- Poll progress is now logged at slog.Info every 30s so production logs
  surface stuck flows.
- Front-page 5xx now retries twice with 1s/4s backoff before failing.
- New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing.
- Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix),
  selector cascade, DOM completion detector, transient status
  classification, and ctx cancellation paths via a thread-safe mutating
  mock document. Full integration with a live browser remains hand-tested.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 17:23:24 -04:00
2 changed files with 840 additions and 78 deletions
+387 -67
View File
@@ -1,3 +1,17 @@
// Package archive provides a thin wrapper around archive.ph (a.k.a.
// archive.today) for two operations:
//
// - IsArchived: check whether a target URL already has a snapshot, and
// return the snapshot Document if so.
// - Archive: submit a target URL to archive.ph, poll until the snapshot is
// complete (or the context is cancelled), and return the resulting
// Document.
//
// The submit flow is intentionally defensive: archive.ph occasionally rotates
// its form markup, its front page sometimes 5xx's, and the in-progress
// "/wip/<id>" pages can hang indefinitely if their JS gets wedged. The
// implementation in this file documents and tests each of those failure modes
// rather than papering over them.
package archive
import (
@@ -6,26 +20,121 @@ import (
"fmt"
"log/slog"
"net/url"
"regexp"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// ErrArchiveIncomplete is returned when archive.ph never transitions away
// from the /wip/ (work-in-progress) or /submit placeholder pages within the
// configured timeout. Callers can errors.Is against this sentinel to
// distinguish "archive.ph is slow / wedged" from "we got cancelled".
var ErrArchiveIncomplete = errors.New("archive: archive.ph did not finish before timeout")
// ErrArchiveSelectorMissing is returned when archive.ph's front-page submit
// form cannot be found by any of the known fallback selectors. This usually
// means archive.ph rotated its markup and the cascade in this file needs to
// be updated.
var ErrArchiveSelectorMissing = errors.New("archive: required submit-form element not found on archive.ph")
// urlInputSelectors and submitButtonSelectors are tried in order when
// locating the submit form on archive.ph's front page. Updating one of
// these in response to archive.ph DOM churn should not require touching
// the rest of the file.
var urlInputSelectors = []string{
"input[name='url']",
"input[type='url']",
"input.input-url",
"input[name='anyway']",
}
var submitButtonSelectors = []string{
"form#submiturl input[type='submit']",
"form#submiturl button[type='submit']",
"input[type='submit'][value*='save' i]",
"button[type='submit']",
}
// completionSelectors are DOM markers that, when present, indicate the page
// is a finished archived snapshot rather than the /wip/ placeholder.
// archive.ph snapshots wrap the page in a header bar + share box; both
// vary slightly across snapshots so we accept any of them.
var completionSelectors = []string{
"div#HEADER",
"#HEADER",
"div[id^='SHARE']",
"#SHARE",
"div.TEXT-BLOCK",
".TEXT-BLOCK",
}
// archivedIDPattern matches a final archive.ph snapshot URL path.
// archive.ph identifiers are short alphanumeric codes (typically 5+ chars)
// and the snapshot URL is either:
//
// https://archive.ph/<id>
// https://archive.ph/<id>/<original-url>
// https://archive.ph/o/<id>
// https://archive.ph/o/<id>/<original-url>
//
// The pattern matches the path leading character set; callers should also
// check the hostname matches archive.ph (or whatever endpoint was configured).
var archivedIDPattern = regexp.MustCompile(`^/(?:o/)?[A-Za-z0-9]{5,}(?:/|$)`)
// pendingPathPatterns lists path prefixes that mean the snapshot is NOT
// finished yet — work-in-progress, the submit endpoint, or the lookup
// endpoint (/newest/<url>) which redirects through to a snapshot URL.
var pendingPathPatterns = []string{"/wip/", "/submit", "/submit/", "/newest/", "/newest"}
const (
// DefaultTimeout is the default upper bound on a single Archive call.
// archive.ph normally finishes within seconds; the 5-minute ceiling is
// generous enough to cover slow targets while still surfacing wedged
// flows to the caller in a reasonable time.
DefaultTimeout = 5 * time.Minute
// defaultPollInterval is how often the polling loop re-checks the
// document's URL and DOM for completion markers. archive.ph snapshots
// typically finish within seconds; a tight interval makes the call
// return promptly without measurable cost (a couple of DOM selectors
// against an already-open page).
defaultPollInterval = 1 * time.Second
// defaultProgressLogInterval is how often the polling loop emits a
// slog.Info progress line so production logs surface stuck flows.
defaultProgressLogInterval = 30 * time.Second
// defaultInitialIdleWait is the soft cap on how long we wait for the
// initial post-submit page to settle. Already-archived URLs typically
// redirect to the snapshot almost immediately; new submissions take
// longer and the poll loop picks up after this.
defaultInitialIdleWait = 8 * time.Second
// frontPageRetries is the number of additional attempts to open the
// archive.ph front page when it returns a 5xx (their own infra
// occasionally hiccups).
frontPageRetries = 2
)
// frontPageBackoffs is the backoff schedule between front-page retries.
// len(frontPageBackoffs) must equal frontPageRetries.
var frontPageBackoffs = []time.Duration{1 * time.Second, 4 * time.Second}
type Config struct {
// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
Endpoint string
// Timeout will, if set, cancel any Archive call after this duration.
// If nil, the default timeout of 1 hour will be used.
Timeout *time.Duration // Timeout for the request, defaults to 1 hour
// If nil, DefaultTimeout (5 minutes) is used.
Timeout *time.Duration
}
// validate validates the config and sets default values if necessary.
func (c Config) validate() Config {
if c.Timeout == nil {
def := 1 * time.Hour
def := DefaultTimeout
c.Timeout = &def
}
@@ -38,7 +147,13 @@ func (c Config) validate() Config {
var DefaultConfig = Config{}
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
// IsArchived checks if a url is archived. It returns the archived Document if
// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
//
// Why: callers (e.g. Mort's summary system) want to avoid submitting an
// archive request when one already exists.
// What: opens archive.ph/newest/<target> and returns the resulting Document.
// Test: see archive_test.go TestIsArchived_* (mock-browser based).
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
c = c.validate()
u, err := url.Parse(target)
@@ -52,13 +167,11 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
}
uri := endpoint.JoinPath("/newest")
uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
slog.Info("checking if url is archived", "url", uri.String(), "endpoint", endpoint)
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
if err != nil {
if errors.Is(err, extractor.ErrPageNotFound) {
if doc != nil {
@@ -82,19 +195,35 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
return doc, nil
}
// IsArchived is a convenience wrapper around DefaultConfig.IsArchived.
// It checks whether target already has an archive.ph snapshot using the
// default endpoint and timeout; see Config.IsArchived for the full contract.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to archive.ph and polls until the snapshot is
// complete (transitioned away from /wip/ AND a known DOM completion marker
// is present), or the context is cancelled / the timeout fires.
//
// Why: when Mort's summary system gets bot-checked on the live site it
// falls back to reading the archive.ph snapshot. The previous implementation
// was happy to return mid-submission /wip/ pages as "success" (placeholder
// "Working..." pages with no real content) which made the fallback useless.
// What: opens archive.ph's front page, types the target URL into the submit
// form, clicks submit, and polls for completion. Returns a typed error if
// archive.ph doesn't finish in time so callers can errors.Is and degrade.
// Test: archive_test.go covers the URL validator, selector cascade, the
// completion detector, and the ctx-cancellation path. The full integration
// flow requires a live browser + archive.ph and is hand-tested.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
c = c.validate()
var cancel context.CancelFunc
if c.Timeout != nil {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
slog.Info("setting timeout", "timeout", *c.Timeout)
slog.Info("archive: setting timeout", "timeout", *c.Timeout)
defer cancel()
}
u, err := url.Parse(target)
if err != nil {
return nil, fmt.Errorf("invalid url: %w", err)
@@ -105,87 +234,278 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
return nil, fmt.Errorf("invalid endpoint: %w", err)
}
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
doc, err := openArchiveFrontPage(ctx, b, c.Endpoint)
if err != nil {
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
// captcha) the page is kept open by AllowNonOKStatus so the caller
// can promote it. Return both the doc and the wrapped error.
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
return doc, fmt.Errorf("failed to open url: %w", err)
}
if doc != nil {
_ = doc.Close()
}
return nil, fmt.Errorf("failed to open url: %w", err)
return nil, err
}
urlInput := doc.SelectFirst("input[name='url']")
// Fill and submit the form. doc has the page; any error past this point
// must close it.
urlInput, urlSelector := findURLInput(doc)
if urlInput == nil {
_ = doc.Close()
return nil, fmt.Errorf("failed to find url input element")
return nil, fmt.Errorf("%w: tried url-input selectors %v", ErrArchiveSelectorMissing, urlInputSelectors)
}
err = urlInput.Type(u.String())
if err != nil {
if err = urlInput.Type(u.String()); err != nil {
_ = doc.Close()
return nil, fmt.Errorf("failed to type url: %w", err)
return nil, fmt.Errorf("failed to type url into %q: %w", urlSelector, err)
}
submitBtn := doc.SelectFirst("form#submiturl input[type=\"submit\"]")
submitBtn, submitSelector := findSubmitButton(doc)
if submitBtn == nil {
_ = doc.Close()
return nil, fmt.Errorf("failed to find submit button")
return nil, fmt.Errorf("%w: tried submit-button selectors %v", ErrArchiveSelectorMissing, submitButtonSelectors)
}
err = submitBtn.Click()
if err != nil {
if err = submitBtn.Click(); err != nil {
_ = doc.Close()
return nil, fmt.Errorf("failed to click submit: %w", err)
return nil, fmt.Errorf("failed to click submit %q: %w", submitSelector, err)
}
// wait for the page to load, but respect context cancellation
select {
case <-ctx.Done():
slog.Debug("context done during initial wait", "err", ctx.Err())
// Initial soft idle wait so the post-submit redirect has a chance to
// land before we start polling. Already-archived URLs short-circuit
// here; new submissions fall through to the polling loop.
initialWait := defaultInitialIdleWait
if err = doc.WaitForNetworkIdle(&initialWait); err != nil {
// Network-idle timing out is normal on archive.ph during a fresh
// submission (the /wip/ page polls itself). Don't treat it as
// fatal — let the polling loop decide.
slog.Debug("archive: initial WaitForNetworkIdle returned", "err", err)
}
if err = ctx.Err(); err != nil {
_ = doc.Close()
return nil, ctx.Err()
case <-time.After(5 * time.Second):
}
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
// the way we can tell this is happening is by checking the url of the page periodically
// if the page path starts with /wip/ then we are still waiting
// also periodically refresh the page just in case
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
keepGoing := true
for keepGoing {
select {
case <-ctx.Done():
slog.Info("context done")
keepGoing = false
case <-ticker.C:
archivedUrl, err := url.Parse(doc.URL())
if err != nil {
continue
return nil, err
}
slog.Debug("checking url", "url", archivedUrl.String())
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
keepGoing = false
break
}
}
// Poll until either (a) the page transitions to a finished snapshot,
// (b) the context is cancelled, or (c) the timeout fires (which also
// cancels ctx).
if err = pollUntilArchived(ctx, doc, endpoint); err != nil {
_ = doc.Close()
return nil, err
}
return doc, doc.WaitForNetworkIdle(nil)
// Final settle: best-effort wait for in-flight asset loads on the
// snapshot itself so a downstream Readability call sees stable DOM.
settle := 10 * time.Second
if err = doc.WaitForNetworkIdle(&settle); err != nil {
slog.Debug("archive: final WaitForNetworkIdle returned", "err", err)
}
return doc, nil
}
// Archive is a convenience wrapper around DefaultConfig.Archive.
// It submits target to archive.ph with the default endpoint and timeout;
// see Config.Archive for the polling/error contract.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}
// openArchiveFrontPage opens the archive.ph front page, retrying up to
// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a
// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately
// along with the open document, mirroring the IsArchived contract so a
// caller can promote it to an InteractiveBrowser.
//
// Why: archive.ph's own infrastructure occasionally serves 5xx during
// load spikes; a single retry generally clears it.
// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx.
// Test: not unit-tested (would require a fake browser that produces 5xx
// then 200); behaviour anchored by the retry count + backoff constants.
func openArchiveFrontPage(ctx context.Context, b extractor.Browser, endpoint string) (extractor.Document, error) {
	var lastErr error
	for attempt := 0; attempt <= frontPageRetries; attempt++ {
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		doc, err := b.Open(ctx, endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
		if err == nil {
			return doc, nil
		}
		// Transient 5xx: close whatever page we got (some browser
		// implementations keep it open under AllowNonOKStatus, some return
		// nil), back off, retry. Note this deliberately does NOT require
		// doc != nil — a 5xx with a nil doc is just as retryable.
		if isTransientStatus(err) && attempt < frontPageRetries {
			if doc != nil {
				_ = doc.Close()
			}
			lastErr = err
			slog.Warn("archive: archive.ph returned transient status, retrying", "attempt", attempt+1, "err", err)
			if !sleepOrCancel(ctx, frontPageBackoffs[attempt]) {
				return nil, ctx.Err()
			}
			continue
		}
		// ErrInvalidStatusCode with the doc kept open means the caller can
		// promote it to interactive (captcha). We don't retry these — the
		// underlying page is what the caller wants.
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			return doc, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
		}
		if doc != nil {
			_ = doc.Close()
		}
		// Non-status errors (browser-level failures) are not retried.
		return nil, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
	}
	// Defensive: reachable only if the retry accounting above changes.
	return nil, fmt.Errorf("failed to open archive endpoint %q after %d retries: %w", endpoint, frontPageRetries, lastErr)
}
// isTransientStatus reports whether err wraps an HTTP 5xx status from
// ErrInvalidStatusCode that we should retry.
//
// The wrapped error message has the form "invalid status code: <n>"; the
// trailing token is required to be exactly three digits starting with '5'.
// (Checking only the first byte would misclassify a malformed tail such as
// "5xx" as a retryable status.)
func isTransientStatus(err error) bool {
	if !errors.Is(err, extractor.ErrInvalidStatusCode) {
		return false
	}
	msg := err.Error()
	// Find the last space and inspect what follows.
	idx := strings.LastIndex(msg, " ")
	if idx < 0 || idx == len(msg)-1 {
		return false
	}
	tail := msg[idx+1:]
	if len(tail) != 3 || tail[0] != '5' {
		return false
	}
	// Require the remaining characters to be digits so non-numeric noise
	// never triggers a retry.
	for i := 1; i < len(tail); i++ {
		if tail[i] < '0' || tail[i] > '9' {
			return false
		}
	}
	return true
}
// sleepOrCancel blocks for d, returning true if it slept the full duration
// and false if ctx was cancelled first.
func sleepOrCancel(ctx context.Context, d time.Duration) bool {
timer := time.NewTimer(d)
defer timer.Stop()
select {
case <-ctx.Done():
return false
case <-timer.C:
return true
}
}
// findURLInput walks the urlInputSelectors cascade in order and returns
// the first matching Node together with the selector that produced it.
// A (nil, "") return means every selector missed.
func findURLInput(doc extractor.Document) (extractor.Node, string) {
	for _, selector := range urlInputSelectors {
		node := doc.SelectFirst(selector)
		if node == nil {
			continue
		}
		return node, selector
	}
	return nil, ""
}
// findSubmitButton walks the submitButtonSelectors cascade in order and
// returns the first matching Node together with the selector that produced
// it. A (nil, "") return means every selector missed.
func findSubmitButton(doc extractor.Document) (extractor.Node, string) {
	for _, selector := range submitButtonSelectors {
		node := doc.SelectFirst(selector)
		if node == nil {
			continue
		}
		return node, selector
	}
	return nil, ""
}
// pollUntilArchived watches doc until its URL transitions to a final
// archive.ph snapshot URL AND a known completion DOM marker is present.
// It returns ErrArchiveIncomplete when the deadline fires while the page
// is still pending, and the wrapped ctx error when the caller cancelled
// for any other reason (see classifyPollError for the mapping).
func pollUntilArchived(ctx context.Context, doc extractor.Document, endpoint *url.URL) error {
	poll := time.NewTicker(defaultPollInterval)
	defer poll.Stop()
	progress := time.NewTicker(defaultProgressLogInterval)
	defer progress.Stop()
	for {
		// An already-cancelled context should surface as a classified
		// error rather than a spurious "incomplete", so check up front.
		if err := ctx.Err(); err != nil {
			return classifyPollError(err, doc)
		}
		if isArchiveComplete(doc, endpoint) {
			slog.Info("archive: snapshot complete", "url", doc.URL())
			return nil
		}
		select {
		case <-ctx.Done():
			return classifyPollError(ctx.Err(), doc)
		case <-progress.C:
			// Periodic heartbeat so production logs surface stuck flows.
			slog.Info("archive: still waiting for archive.ph", "url", doc.URL())
		case <-poll.C:
			// Loop back around to the top-of-loop completion check.
		}
	}
}
// classifyPollError maps a context error into either ErrArchiveIncomplete
// (deadline fired while the doc was still pending — almost always the
// configured Timeout) or a wrapped form of the underlying ctx error (the
// caller cancelled for some other reason). Returns nil for a nil input.
func classifyPollError(ctxErr error, doc extractor.Document) error {
	if ctxErr == nil {
		return nil
	}
	last := doc.URL()
	if !errors.Is(ctxErr, context.DeadlineExceeded) {
		return fmt.Errorf("archive: cancelled while polling (last url: %s): %w", last, ctxErr)
	}
	return fmt.Errorf("%w (last url: %s): %w", ErrArchiveIncomplete, last, ctxErr)
}
// isArchiveComplete reports whether doc's URL and DOM indicate a finished
// archive.ph snapshot. Both signals must agree: a URL transition alone is
// not enough (the /wip/ page can occasionally redirect to a stub before
// content lands), and a DOM marker alone is not enough (the front page's
// own markup overlaps slightly).
func isArchiveComplete(doc extractor.Document, endpoint *url.URL) bool {
	current, err := url.Parse(doc.URL())
	switch {
	case err != nil:
		return false
	case !isFinalSnapshotURL(current, endpoint):
		return false
	default:
		return hasCompletionMarker(doc)
	}
}
// isFinalSnapshotURL reports whether u looks like a finished archive.ph
// snapshot URL. On the configured endpoint host, the path must match
// archivedIDPattern and must not be the front page or any pending path
// (/wip/, /submit, /newest/). Any non-empty off-endpoint host is accepted:
// a redirect off-host is unusual for archive.ph but would only happen once
// a snapshot was produced and we landed somewhere useful.
func isFinalSnapshotURL(u, endpoint *url.URL) bool {
	if u == nil || endpoint == nil {
		return false
	}
	if u.Hostname() != endpoint.Hostname() {
		return u.Hostname() != ""
	}
	path := u.Path
	// The bare front page ("" or "/") is never a snapshot.
	if path == "" || path == "/" {
		return false
	}
	for _, pending := range pendingPathPatterns {
		if strings.HasPrefix(path, pending) {
			return false
		}
	}
	return archivedIDPattern.MatchString(path)
}
// hasCompletionMarker reports whether doc contains at least one of the
// known archive.ph completion DOM markers (see completionSelectors).
func hasCompletionMarker(doc extractor.Node) bool {
	for _, marker := range completionSelectors {
		if doc.SelectFirst(marker) != nil {
			return true
		}
	}
	return false
}
+451 -9
View File
@@ -1,13 +1,23 @@
package archive
import (
"context"
"errors"
"fmt"
"net/url"
"sync"
"sync/atomic"
"testing"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
)
// --- Config validation ---------------------------------------------------
func TestConfig_Validate_Defaults(t *testing.T) {
c := Config{}
c = c.validate()
c := Config{}.validate()
if c.Endpoint != "https://archive.ph" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
@@ -15,23 +25,455 @@ func TestConfig_Validate_Defaults(t *testing.T) {
if c.Timeout == nil {
t.Fatal("Timeout should not be nil after validate")
}
if *c.Timeout != 1*time.Hour {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour)
if *c.Timeout != DefaultTimeout {
t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout)
}
if DefaultTimeout != 5*time.Minute {
t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute)
}
}
func TestConfig_Validate_Preserves(t *testing.T) {
timeout := 5 * time.Minute
timeout := 30 * time.Second
c := Config{
Endpoint: "https://archive.org",
Timeout: &timeout,
}
c = c.validate()
}.validate()
if c.Endpoint != "https://archive.org" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
}
if *c.Timeout != 5*time.Minute {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute)
if *c.Timeout != 30*time.Second {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second)
}
}
// --- URL validation ------------------------------------------------------
// TestIsFinalSnapshotURL table-tests the snapshot-URL validator against
// the front page, pending paths (/wip/, /submit, /newest/), too-short IDs,
// plain and /o/-prefixed snapshot IDs (with and without the original URL
// appended), and an off-endpoint host.
func TestIsFinalSnapshotURL(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	cases := []struct {
		name string
		raw  string // URL under test; parsed inside the subtest
		want bool
	}{
		{"front-page-empty", "https://archive.ph/", false},
		{"front-page-bare", "https://archive.ph", false},
		{"wip", "https://archive.ph/wip/abc12", false},
		{"submit-trailing", "https://archive.ph/submit/?url=foo", false},
		{"submit-bare", "https://archive.ph/submit", false},
		{"short-id-too-short", "https://archive.ph/ab", false},
		{"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id
		{"short-id-5chars", "https://archive.ph/i9KU2", true},
		{"short-id-7chars", "https://archive.ph/aBcD9E2", true},
		{"o-prefix", "https://archive.ph/o/i9KU2", true},
		{"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true},
		{"id-with-source", "https://archive.ph/i9KU2/https://example.com", true},
		{"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			u, err := url.Parse(tc.raw)
			if err != nil {
				t.Fatalf("parse %q: %v", tc.raw, err)
			}
			got := isFinalSnapshotURL(u, endpoint)
			if got != tc.want {
				t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want)
			}
		})
	}
}
// --- DOM completion marker -----------------------------------------------
// TestHasCompletionMarker verifies that an empty document reports no
// completion marker and that each selector in completionSelectors is
// individually sufficient to flip the detector.
func TestHasCompletionMarker(t *testing.T) {
	t.Run("no markers", func(t *testing.T) {
		empty := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{},
			},
		}
		if hasCompletionMarker(empty) {
			t.Error("expected no completion marker on empty doc")
		}
	})
	for _, marker := range completionSelectors {
		marker := marker
		t.Run("marker "+marker, func(t *testing.T) {
			withMarker := &extractortest.MockDocument{
				MockNode: extractortest.MockNode{
					Children: map[string]extractor.Nodes{
						marker: {&extractortest.MockNode{}},
					},
				},
			}
			if !hasCompletionMarker(withMarker) {
				t.Errorf("expected completion marker via %q", marker)
			}
		})
	}
}
// --- Selector cascade ----------------------------------------------------
// TestFindURLInput_Cascade covers the url-input selector cascade:
// the first matching selector wins even when later ones also match,
// the search falls all the way through to the last selector, and a
// total miss yields (nil, "").
func TestFindURLInput_Cascade(t *testing.T) {
	t.Run("first selector wins", func(t *testing.T) {
		// Both the first and second selectors match; the first must win.
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					urlInputSelectors[0]: {&extractortest.MockNode{}},
					urlInputSelectors[1]: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findURLInput(doc)
		if n == nil {
			t.Fatal("expected node")
		}
		if sel != urlInputSelectors[0] {
			t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0])
		}
	})
	t.Run("falls back through cascade", func(t *testing.T) {
		// Only the LAST selector matches.
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findURLInput(doc)
		if n == nil {
			t.Fatal("expected node from last fallback")
		}
		if sel != urlInputSelectors[len(urlInputSelectors)-1] {
			t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1])
		}
	})
	t.Run("all selectors miss", func(t *testing.T) {
		// Empty document: every selector misses, so both returns are zero.
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{},
			},
		}
		n, sel := findURLInput(doc)
		if n != nil {
			t.Error("expected nil node")
		}
		if sel != "" {
			t.Errorf("selector = %q, want empty", sel)
		}
	})
}
// TestFindSubmitButton_Cascade mirrors TestFindURLInput_Cascade for the
// submit-button selector list: first match wins, the search falls through
// to the last selector, and a total miss yields a nil node.
func TestFindSubmitButton_Cascade(t *testing.T) {
	t.Run("first selector wins", func(t *testing.T) {
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					submitButtonSelectors[0]: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findSubmitButton(doc)
		if n == nil {
			t.Fatal("expected node")
		}
		if sel != submitButtonSelectors[0] {
			t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0])
		}
	})
	t.Run("falls back to button[type='submit']", func(t *testing.T) {
		// Use a known later-in-list selector.
		target := submitButtonSelectors[len(submitButtonSelectors)-1]
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					target: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findSubmitButton(doc)
		if n == nil {
			t.Fatal("expected node from last fallback")
		}
		if sel != target {
			t.Errorf("selector = %q, want %q", sel, target)
		}
	})
	t.Run("all selectors miss", func(t *testing.T) {
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{},
			},
		}
		n, _ := findSubmitButton(doc)
		if n != nil {
			t.Error("expected nil node")
		}
	})
}
// --- Transient status detection -----------------------------------------
// TestIsTransientStatus table-tests the retryable-status classifier:
// 5xx wrapped in ErrInvalidStatusCode is transient; 4xx, plain errors,
// and nil are not.
func TestIsTransientStatus(t *testing.T) {
	cases := []struct {
		name string
		err  error
		want bool
	}{
		{"nil", nil, false},
		{"plain error", errors.New("oops"), false},
		{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
		{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
		{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
		{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
		{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
		{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := isTransientStatus(tc.err); got != tc.want {
				t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
			}
		})
	}
}
// --- mutDoc: a Document whose URL + Children can be swapped under load --
// mutDoc is a minimal extractor.Document test double whose URL and child
// nodes can be swapped from another goroutine while pollUntilArchived
// reads them. All reads go through atomic.Value, so readers always see a
// fully-published value. NOTE(review): despite the mutex field, setURL is
// NOT mutex-guarded — mu only serializes concurrent setChildren writers,
// which atomic.Value already makes safe; the mutex appears redundant but
// is kept as written.
type mutDoc struct {
	mu sync.Mutex
	urlValue atomic.Value // holds a string (the current URL)
	children atomic.Value // holds a map[string]extractor.Nodes (selector → nodes)
}
// Compile-time assertion that mutDoc satisfies extractor.Document.
var _ extractor.Document = (*mutDoc)(nil)
// newMutDoc returns a mutDoc reporting initialURL and no child nodes.
func newMutDoc(initialURL string) *mutDoc {
	d := &mutDoc{}
	d.urlValue.Store(initialURL)
	d.children.Store(map[string]extractor.Nodes{})
	return d
}
// setURL atomically swaps the URL the document reports (no mutex; the
// atomic store alone publishes safely).
func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) }
// setChildren atomically swaps the selector→nodes map used by Select.
func (d *mutDoc) setChildren(c map[string]extractor.Nodes) {
	d.mu.Lock()
	defer d.mu.Unlock()
	d.children.Store(c)
}
// The remaining methods are no-op stubs that satisfy extractor.Document;
// only URL/Select behavior matters to the polling tests.
func (d *mutDoc) URL() string { return d.urlValue.Load().(string) }
func (d *mutDoc) Refresh() error { return nil }
func (d *mutDoc) Close() error { return nil }
func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil }
func (d *mutDoc) Content() (string, error) { return "", nil }
func (d *mutDoc) Text() (string, error) { return "", nil }
func (d *mutDoc) Attr(_ string) (string, error) { return "", nil }
func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil }
func (d *mutDoc) Type(_ string) error { return nil }
func (d *mutDoc) Click() error { return nil }
func (d *mutDoc) SetHidden(_ bool) error { return nil }
func (d *mutDoc) SetAttribute(_, _ string) error { return nil }
// Select returns the nodes registered for selector in the current
// children snapshot (nil when the selector was never registered).
func (d *mutDoc) Select(selector string) extractor.Nodes {
	c := d.children.Load().(map[string]extractor.Nodes)
	return c[selector]
}
// SelectFirst returns the first registered node for selector, or nil.
func (d *mutDoc) SelectFirst(selector string) extractor.Node {
	return d.Select(selector).First()
}
// ForEach applies fn to each node matching selector, stopping at the
// first error.
func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error {
	for _, n := range d.Select(selector) {
		if err := fn(n); err != nil {
			return err
		}
	}
	return nil
}
// --- pollUntilArchived ---------------------------------------------------
// TestPollUntilArchived_ContextCancelled_NeverCompletes pins the deadline
// path: a doc stuck on /wip/ with no completion markers must surface
// ErrArchiveIncomplete wrapping context.DeadlineExceeded.
func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	// URL stays on /wip/ and no DOM markers ever appear.
	stuck := newMutDoc("https://archive.ph/wip/abc12")
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	err := pollUntilArchived(ctx, stuck, endpoint)
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	if !errors.Is(err, ErrArchiveIncomplete) {
		t.Errorf("expected ErrArchiveIncomplete, got %v", err)
	}
	if !errors.Is(err, context.DeadlineExceeded) {
		t.Errorf("expected wrapped DeadlineExceeded, got %v", err)
	}
}
// Caller cancellation (not a deadline) must surface context.Canceled and
// must NOT be classified as an incomplete archive.
func TestPollUntilArchived_CallerCancelled(t *testing.T) {
	ep, _ := url.Parse("https://archive.ph")
	d := newMutDoc("https://archive.ph/wip/abc12")
	ctx, cancel := context.WithCancel(context.Background())
	// Cancel after a brief delay so the polling loop is already inside its
	// select.
	time.AfterFunc(20*time.Millisecond, cancel)
	err := pollUntilArchived(ctx, d, ep)
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	if errors.Is(err, ErrArchiveIncomplete) {
		t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err)
	}
	if !errors.Is(err, context.Canceled) {
		t.Errorf("expected wrapped context.Canceled, got %v", err)
	}
}
// Completion requires a final URL AND a DOM marker; the poll must keep
// waiting through a URL-only transition and succeed once both hold.
func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) {
	ep, _ := url.Parse("https://archive.ph")
	d := newMutDoc("https://archive.ph/wip/abc12")
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	// After a short delay, transition to a final URL but WITHOUT a DOM
	// marker. Poll should keep waiting. Then add the marker.
	go func() {
		time.Sleep(40 * time.Millisecond)
		d.setURL("https://archive.ph/i9KU2")
		// No marker yet — poll should still wait.
		time.Sleep(60 * time.Millisecond)
		d.setChildren(map[string]extractor.Nodes{
			"div#HEADER": {&extractortest.MockNode{}},
		})
	}()
	if err := pollUntilArchived(ctx, d, ep); err != nil {
		t.Fatalf("expected nil after URL+marker transition, got %v", err)
	}
	if !isFinalSnapshotURL(mustParse(t, d.URL()), ep) {
		t.Errorf("final URL %q does not look like a snapshot", d.URL())
	}
}
// A final-looking URL with no DOM marker must never satisfy the poll;
// the deadline should expire with ErrArchiveIncomplete.
func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
	// URL transitions to a final-looking path but the DOM never grows a
	// completion marker. Poll should hit the deadline.
	ep, _ := url.Parse("https://archive.ph")
	d := newMutDoc("https://archive.ph/wip/abc12")
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	time.AfterFunc(10*time.Millisecond, func() {
		d.setURL("https://archive.ph/i9KU2") // looks final but no marker
	})
	if err := pollUntilArchived(ctx, d, ep); !errors.Is(err, ErrArchiveIncomplete) {
		t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
	}
}
// --- isArchiveComplete combination ---------------------------------------
// Table test for the URL/marker conjunction: only a snapshot-shaped URL
// combined with a DOM marker counts as complete.
func TestIsArchiveComplete(t *testing.T) {
	ep, _ := url.Parse("https://archive.ph")
	tests := []struct {
		name   string
		raw    string
		marker bool
		want   bool
	}{
		{"both ok", "https://archive.ph/i9KU2", true, true},
		{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
		{"final url no marker", "https://archive.ph/i9KU2", false, false},
		{"front page with marker", "https://archive.ph/", true, false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			children := make(map[string]extractor.Nodes)
			if tt.marker {
				children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
			}
			doc := &extractortest.MockDocument{
				URLValue: tt.raw,
				MockNode: extractortest.MockNode{Children: children},
			}
			if got := isArchiveComplete(doc, ep); got != tt.want {
				t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tt.raw, tt.marker, got, tt.want)
			}
		})
	}
}
// --- Archive: selector cascade failure path ------------------------------
// Note: the full Archive() flow drives a live browser. We can still cover
// the "form selectors all missing" branch via a stub Browser that returns
// a document with no matching children — the URL/typing path doesn't run
// because the selector lookup fails first.
// emptyFormBrowser is a stub browser whose Open always hands back the
// pre-seeded document, regardless of URL or options.
type emptyFormBrowser struct {
	doc extractor.Document
}
// Close is a no-op; there is no real browser to tear down.
func (b *emptyFormBrowser) Close() error { return nil }
// Open ignores its arguments and returns the canned document with a nil
// error.
func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) {
	return b.doc, nil
}
// When none of the cascading form selectors match, Archive must fail
// with the ErrArchiveSelectorMissing sentinel.
func TestArchive_SelectorMissing(t *testing.T) {
	doc := &extractortest.MockDocument{
		URLValue: "https://archive.ph/",
		MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}},
	}
	timeout := 200 * time.Millisecond
	cfg := Config{Timeout: &timeout}
	_, err := cfg.Archive(context.Background(), &emptyFormBrowser{doc: doc}, "https://example.com")
	if err == nil {
		t.Fatal("expected error when form selectors are missing")
	}
	if !errors.Is(err, ErrArchiveSelectorMissing) {
		t.Errorf("expected ErrArchiveSelectorMissing, got %v", err)
	}
}
// --- helpers -------------------------------------------------------------
// mustParse parses raw as a URL, failing the calling test immediately if
// it does not parse.
func mustParse(t *testing.T, raw string) *url.URL {
	t.Helper()
	parsed, err := url.Parse(raw)
	if err != nil {
		t.Fatalf("parse %q: %v", raw, err)
	}
	return parsed
}