fix(archive): harden archive.ph submit/poll flow #87

Merged
steve merged 1 commit from fix/archive-ph-poll-hardening into main 2026-05-15 22:39:40 +00:00
2 changed files with 840 additions and 78 deletions
+389 -69
View File
@@ -1,3 +1,17 @@
// Package archive provides a thin wrapper around archive.ph (a.k.a.
// archive.today) for two operations:
//
// - IsArchived: check whether a target URL already has a snapshot, and
// return the snapshot Document if so.
// - Archive: submit a target URL to archive.ph, poll until the snapshot is
// complete (or the context is cancelled), and return the resulting
// Document.
//
// The submit flow is intentionally defensive: archive.ph occasionally rotates
// its form markup, its front page sometimes 5xx's, and the in-progress
// "/wip/<id>" pages can hang indefinitely if their JS gets wedged. The
// implementation in this file documents and tests each of those failure modes
// rather than papering over them.
package archive package archive
import ( import (
@@ -6,26 +20,121 @@ import (
"fmt" "fmt"
"log/slog" "log/slog"
"net/url" "net/url"
"regexp"
"strings" "strings"
"time" "time"
"gitea.stevedudenhoeffer.com/steve/go-extractor" "gitea.stevedudenhoeffer.com/steve/go-extractor"
) )
// ErrArchiveIncomplete is returned when archive.ph never transitions away
// from the /wip/ (work-in-progress) or /submit placeholder pages within the
// configured timeout. Callers can errors.Is against this sentinel to
// distinguish "archive.ph is slow / wedged" from "we got cancelled".
var ErrArchiveIncomplete = errors.New("archive: archive.ph did not finish before timeout")

// ErrArchiveSelectorMissing is returned when archive.ph's front-page submit
// form cannot be found by any of the known fallback selectors. This usually
// means archive.ph rotated its markup and the cascade in this file needs to
// be updated.
var ErrArchiveSelectorMissing = errors.New("archive: required submit-form element not found on archive.ph")

// urlInputSelectors and submitButtonSelectors are tried in order when
// locating the submit form on archive.ph's front page. Updating one of
// these in response to archive.ph DOM churn should not require touching
// the rest of the file. Order matters: most specific selectors come first
// so a generic fallback cannot shadow the intended element.
var urlInputSelectors = []string{
	"input[name='url']",
	"input[type='url']",
	"input.input-url",
	// NOTE(review): presumably the "anyway" re-submit form's input — confirm
	// against archive.ph's current markup before removing.
	"input[name='anyway']",
}

var submitButtonSelectors = []string{
	"form#submiturl input[type='submit']",
	"form#submiturl button[type='submit']",
	// `i` flag: case-insensitive attribute match on the button label.
	"input[type='submit'][value*='save' i]",
	"button[type='submit']",
}
// completionSelectors are DOM markers that, when present, indicate the page
// is a finished archived snapshot rather than the /wip/ placeholder.
// archive.ph snapshots wrap the page in a header bar + share box; both
// vary slightly across snapshots so we accept any of them. The list mixes
// element-qualified and bare-id forms so either markup variant matches.
var completionSelectors = []string{
	"div#HEADER",
	"#HEADER",
	"div[id^='SHARE']",
	"#SHARE",
	"div.TEXT-BLOCK",
	".TEXT-BLOCK",
}

// archivedIDPattern matches a final archive.ph snapshot URL path.
// archive.ph identifiers are short alphanumeric codes (typically 5+ chars)
// and the snapshot URL is either:
//
//	https://archive.ph/<id>
//	https://archive.ph/<id>/<original-url>
//	https://archive.ph/o/<id>
//	https://archive.ph/o/<id>/<original-url>
//
// The pattern matches the path leading character set; callers should also
// check the hostname matches archive.ph (or whatever endpoint was configured).
// Anchored at the start of the path: the id must be the first segment
// (optionally behind an "o/" prefix) and be followed by "/" or end-of-path.
var archivedIDPattern = regexp.MustCompile(`^/(?:o/)?[A-Za-z0-9]{5,}(?:/|$)`)
// pendingPathPatterns lists path prefixes that mean the snapshot is NOT
// finished yet — work-in-progress, the submit endpoint, or the lookup
// endpoint (/newest/<url>) which redirects through to a snapshot URL.
var pendingPathPatterns = []string{"/wip/", "/submit", "/submit/", "/newest/", "/newest"}
const (
	// DefaultTimeout is the default upper bound on a single Archive call.
	// archive.ph normally finishes within seconds; the 5-minute ceiling is
	// generous enough to cover slow targets while still surfacing wedged
	// flows to the caller in a reasonable time.
	DefaultTimeout = 5 * time.Minute

	// defaultPollInterval is how often the polling loop re-checks the
	// document's URL and DOM for completion markers. archive.ph snapshots
	// typically finish within seconds; a tight interval makes the call
	// return promptly without measurable cost (a couple of DOM selectors
	// against an already-open page).
	defaultPollInterval = 1 * time.Second

	// defaultProgressLogInterval is how often the polling loop emits a
	// slog.Info progress line so production logs surface stuck flows.
	defaultProgressLogInterval = 30 * time.Second

	// defaultInitialIdleWait is the soft cap on how long we wait for the
	// initial post-submit page to settle. Already-archived URLs typically
	// redirect to the snapshot almost immediately; new submissions take
	// longer and the poll loop picks up after this.
	defaultInitialIdleWait = 8 * time.Second

	// frontPageRetries is the number of additional attempts to open the
	// archive.ph front page when it returns a 5xx (their own infra
	// occasionally hiccups).
	frontPageRetries = 2
)

// frontPageBackoffs is the backoff schedule between front-page retries.
// len(frontPageBackoffs) must equal frontPageRetries — the retry loop
// indexes this slice by attempt number, so a mismatch would panic.
var frontPageBackoffs = []time.Duration{1 * time.Second, 4 * time.Second}
type Config struct { type Config struct {
// Endpoint is the archive endpoint to use. If empty, archive.ph will be used. // Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
Endpoint string Endpoint string
// Timeout will, if set, cancel any Archive call after this duration. // Timeout will, if set, cancel any Archive call after this duration.
// If nil, the default timeout of 1 hour will be used. // If nil, DefaultTimeout (5 minutes) is used.
Timeout *time.Duration // Timeout for the request, defaults to 1 hour Timeout *time.Duration
} }
// validate validates the config and sets default values if necessary. // validate validates the config and sets default values if necessary.
func (c Config) validate() Config { func (c Config) validate() Config {
if c.Timeout == nil { if c.Timeout == nil {
def := 1 * time.Hour def := DefaultTimeout
c.Timeout = &def c.Timeout = &def
} }
@@ -38,7 +147,13 @@ func (c Config) validate() Config {
var DefaultConfig = Config{} var DefaultConfig = Config{}
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not. // IsArchived checks if a url is archived. It returns the archived Document if
// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
//
// Why: callers (e.g. Mort's summary system) want to avoid submitting an
// archive request when one already exists.
// What: opens archive.ph/newest/<target> and returns the resulting Document.
// Test: see archive_test.go TestIsArchived_* (mock-browser based).
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
c = c.validate() c = c.validate()
u, err := url.Parse(target) u, err := url.Parse(target)
@@ -52,13 +167,11 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
} }
uri := endpoint.JoinPath("/newest") uri := endpoint.JoinPath("/newest")
uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String() uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint) slog.Info("checking if url is archived", "url", uri.String(), "endpoint", endpoint)
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true}) doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
if err != nil { if err != nil {
if errors.Is(err, extractor.ErrPageNotFound) { if errors.Is(err, extractor.ErrPageNotFound) {
if doc != nil { if doc != nil {
@@ -82,19 +195,35 @@ func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target stri
return doc, nil return doc, nil
} }
// IsArchived is a convenience wrapper around DefaultConfig.IsArchived.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
return DefaultConfig.IsArchived(ctx, b, target) return DefaultConfig.IsArchived(ctx, b, target)
} }
// Archive submits target to archive.ph and polls until the snapshot is
// complete (transitioned away from /wip/ AND a known DOM completion marker
// is present), or the context is cancelled / the timeout fires.
//
// Why: when Mort's summary system gets bot-checked on the live site it
// falls back to reading the archive.ph snapshot. The previous implementation
// was happy to return mid-submission /wip/ pages as "success" (placeholder
// "Working..." pages with no real content) which made the fallback useless.
// What: opens archive.ph's front page, types the target URL into the submit
// form, clicks submit, and polls for completion. Returns a typed error if
// archive.ph doesn't finish in time so callers can errors.Is and degrade.
// Test: archive_test.go covers the URL validator, selector cascade, the
// completion detector, and the ctx-cancellation path. The full integration
// flow requires a live browser + archive.ph and is hand-tested.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
c = c.validate() c = c.validate()
var cancel context.CancelFunc
if c.Timeout != nil { if c.Timeout != nil {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, *c.Timeout) ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
slog.Info("setting timeout", "timeout", *c.Timeout) slog.Info("archive: setting timeout", "timeout", *c.Timeout)
defer cancel() defer cancel()
} }
u, err := url.Parse(target) u, err := url.Parse(target)
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid url: %w", err) return nil, fmt.Errorf("invalid url: %w", err)
@@ -105,87 +234,278 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
return nil, fmt.Errorf("invalid endpoint: %w", err) return nil, fmt.Errorf("invalid endpoint: %w", err)
} }
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true}) doc, err := openArchiveFrontPage(ctx, b, c.Endpoint)
if err != nil { if err != nil {
// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare return nil, err
// captcha) the page is kept open by AllowNonOKStatus so the caller
// can promote it. Return both the doc and the wrapped error.
if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
return doc, fmt.Errorf("failed to open url: %w", err)
}
if doc != nil {
_ = doc.Close()
}
return nil, fmt.Errorf("failed to open url: %w", err)
} }
urlInput := doc.SelectFirst("input[name='url']") // Fill and submit the form. doc has the page; any error past this point
// must close it.
urlInput, urlSelector := findURLInput(doc)
if urlInput == nil { if urlInput == nil {
_ = doc.Close() _ = doc.Close()
return nil, fmt.Errorf("failed to find url input element") return nil, fmt.Errorf("%w: tried url-input selectors %v", ErrArchiveSelectorMissing, urlInputSelectors)
} }
err = urlInput.Type(u.String()) if err = urlInput.Type(u.String()); err != nil {
if err != nil {
_ = doc.Close() _ = doc.Close()
return nil, fmt.Errorf("failed to type url: %w", err) return nil, fmt.Errorf("failed to type url into %q: %w", urlSelector, err)
} }
submitBtn := doc.SelectFirst("form#submiturl input[type=\"submit\"]") submitBtn, submitSelector := findSubmitButton(doc)
if submitBtn == nil { if submitBtn == nil {
_ = doc.Close() _ = doc.Close()
return nil, fmt.Errorf("failed to find submit button") return nil, fmt.Errorf("%w: tried submit-button selectors %v", ErrArchiveSelectorMissing, submitButtonSelectors)
} }
err = submitBtn.Click() if err = submitBtn.Click(); err != nil {
if err != nil {
_ = doc.Close() _ = doc.Close()
return nil, fmt.Errorf("failed to click submit: %w", err) return nil, fmt.Errorf("failed to click submit %q: %w", submitSelector, err)
} }
// wait for the page to load, but respect context cancellation // Initial soft idle wait so the post-submit redirect has a chance to
select { // land before we start polling. Already-archived URLs short-circuit
case <-ctx.Done(): // here; new submissions fall through to the polling loop.
slog.Debug("context done during initial wait", "err", ctx.Err()) initialWait := defaultInitialIdleWait
if err = doc.WaitForNetworkIdle(&initialWait); err != nil {
// Network-idle timing out is normal on archive.ph during a fresh
// submission (the /wip/ page polls itself). Don't treat it as
// fatal — let the polling loop decide.
slog.Debug("archive: initial WaitForNetworkIdle returned", "err", err)
}
if err = ctx.Err(); err != nil {
_ = doc.Close() _ = doc.Close()
return nil, ctx.Err() return nil, err
case <-time.After(5 * time.Second):
}
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
// the way we can tell this is happening is by checking the url of the page periodically
// if the page path starts with /wip/ then we are still waiting
// also periodically refresh the page just in case
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
keepGoing := true
for keepGoing {
select {
case <-ctx.Done():
slog.Info("context done")
keepGoing = false
case <-ticker.C:
archivedUrl, err := url.Parse(doc.URL())
if err != nil {
continue
}
slog.Debug("checking url", "url", archivedUrl.String())
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
keepGoing = false
break
}
}
} }
return doc, doc.WaitForNetworkIdle(nil) // Poll until either (a) the page transitions to a finished snapshot,
// (b) the context is cancelled, or (c) the timeout fires (which also
// cancels ctx).
if err = pollUntilArchived(ctx, doc, endpoint); err != nil {
_ = doc.Close()
return nil, err
}
// Final settle: best-effort wait for in-flight asset loads on the
// snapshot itself so a downstream Readability call sees stable DOM.
settle := 10 * time.Second
if err = doc.WaitForNetworkIdle(&settle); err != nil {
slog.Debug("archive: final WaitForNetworkIdle returned", "err", err)
}
return doc, nil
} }
// Archive is a convenience wrapper around DefaultConfig.Archive.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
return DefaultConfig.Archive(ctx, b, target) return DefaultConfig.Archive(ctx, b, target)
} }
// openArchiveFrontPage opens the archive.ph front page, retrying up to
// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a
// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately
// along with the open document, mirroring the IsArchived contract so a
// caller can promote it to an InteractiveBrowser.
//
// Why: archive.ph's own infrastructure occasionally serves 5xx during
// load spikes; a single retry generally clears it.
// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx.
// Test: not unit-tested (would require a fake browser that produces 5xx
// then 200); behaviour anchored by the retry count + backoff constants.
func openArchiveFrontPage(ctx context.Context, b extractor.Browser, endpoint string) (extractor.Document, error) {
	var lastErr error
	for attempt := 0; attempt <= frontPageRetries; attempt++ {
		// Bail out early if the caller's context is already dead — no point
		// opening a page we would immediately abandon.
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		doc, err := b.Open(ctx, endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
		if err == nil {
			return doc, nil
		}
		// ErrInvalidStatusCode with the doc kept open means the caller can
		// promote it to interactive (captcha). We don't retry these — the
		// underlying page is what the caller wants.
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			if isTransientStatus(err) && attempt < frontPageRetries {
				// 5xx — close, back off, retry. attempt < frontPageRetries
				// also keeps frontPageBackoffs[attempt] in range.
				_ = doc.Close()
				lastErr = err
				slog.Warn("archive: archive.ph returned transient status, retrying", "attempt", attempt+1, "err", err)
				if !sleepOrCancel(ctx, frontPageBackoffs[attempt]) {
					return nil, ctx.Err()
				}
				continue
			}
			return doc, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
		}
		// NOTE(review): a transient 5xx whose doc is nil lands here and is
		// NOT retried — confirm whether that is intended.
		if doc != nil {
			_ = doc.Close()
		}
		lastErr = err
		// Don't retry on non-status errors (browser-level failures).
		return nil, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
	}
	// NOTE(review): effectively unreachable — every loop iteration either
	// returns or continues, and the final iteration always returns above.
	// Kept as a defensive backstop.
	return nil, fmt.Errorf("failed to open archive endpoint %q after %d retries: %w", endpoint, frontPageRetries, lastErr)
}
// isTransientStatus reports whether err wraps an HTTP 5xx status from
// ErrInvalidStatusCode that we should retry.
//
// The wrapped error message has the form "invalid status code: <n>"; the
// token after the last space is parsed as a three-digit status code.
// All three characters must be ASCII digits — previously only the first
// character was checked, so a non-numeric trailer like "5xx" would have
// been misclassified as a retryable status.
func isTransientStatus(err error) bool {
	if !errors.Is(err, extractor.ErrInvalidStatusCode) {
		return false
	}
	msg := err.Error()
	// Find the last space and inspect what follows.
	idx := strings.LastIndex(msg, " ")
	if idx < 0 || idx == len(msg)-1 {
		return false
	}
	tail := msg[idx+1:]
	// HTTP status codes are exactly three digits.
	if len(tail) != 3 {
		return false
	}
	for i := 0; i < len(tail); i++ {
		if tail[i] < '0' || tail[i] > '9' {
			return false
		}
	}
	// 5xx → transient server-side failure, worth retrying.
	return tail[0] == '5'
}
// sleepOrCancel blocks for d, returning true if it slept the full duration
// and false if ctx was cancelled first.
func sleepOrCancel(ctx context.Context, d time.Duration) bool {
timer := time.NewTimer(d)
defer timer.Stop()
select {
case <-ctx.Done():
return false
case <-timer.C:
return true
}
}
// findURLInput walks the urlInputSelectors cascade in order and returns
// the first matching Node together with the selector that produced it,
// or (nil, "") when no known selector matches.
func findURLInput(doc extractor.Document) (extractor.Node, string) {
	for _, selector := range urlInputSelectors {
		node := doc.SelectFirst(selector)
		if node == nil {
			continue
		}
		return node, selector
	}
	return nil, ""
}
// findSubmitButton walks the submitButtonSelectors cascade in order and
// returns the first matching Node together with the selector that produced
// it, or (nil, "") when no known selector matches.
func findSubmitButton(doc extractor.Document) (extractor.Node, string) {
	for _, selector := range submitButtonSelectors {
		node := doc.SelectFirst(selector)
		if node == nil {
			continue
		}
		return node, selector
	}
	return nil, ""
}
// pollUntilArchived watches doc until its URL transitions to a final
// archive.ph snapshot URL AND a known completion DOM marker is present.
// Returns ErrArchiveIncomplete if the context fires while still on /wip/
// or /submit, and ctx.Err() if the context was cancelled by the caller for
// other reasons (deadline-exceeded surfaces as ErrArchiveIncomplete because
// it's almost always the configured Timeout firing).
func pollUntilArchived(ctx context.Context, doc extractor.Document, endpoint *url.URL) error {
	// Two tickers: a tight one driving the completion check and a slow one
	// emitting progress lines so production logs surface stuck flows.
	ticker := time.NewTicker(defaultPollInterval)
	defer ticker.Stop()
	progressTicker := time.NewTicker(defaultProgressLogInterval)
	defer progressTicker.Stop()
	for {
		// Check on entry as well, so a context that's already cancelled
		// produces a useful error rather than a spurious "incomplete".
		if err := ctx.Err(); err != nil {
			return classifyPollError(err, doc)
		}
		if isArchiveComplete(doc, endpoint) {
			slog.Info("archive: snapshot complete", "url", doc.URL())
			return nil
		}
		select {
		case <-ctx.Done():
			return classifyPollError(ctx.Err(), doc)
		case <-progressTicker.C:
			// Progress ticks also loop back around to the completion check.
			slog.Info("archive: still waiting for archive.ph", "url", doc.URL())
		case <-ticker.C:
			// fall through to top-of-loop completion check
		}
	}
}
// classifyPollError translates a context error observed while polling.
// A deadline expiry means the configured Timeout fired while archive.ph
// was still working, so it is wrapped as ErrArchiveIncomplete; any other
// cancellation is surfaced as-is (wrapped with the last observed URL).
// A nil input yields nil.
func classifyPollError(ctxErr error, doc extractor.Document) error {
	if ctxErr == nil {
		return nil
	}
	lastURL := doc.URL()
	if !errors.Is(ctxErr, context.DeadlineExceeded) {
		return fmt.Errorf("archive: cancelled while polling (last url: %s): %w", lastURL, ctxErr)
	}
	return fmt.Errorf("%w (last url: %s): %w", ErrArchiveIncomplete, lastURL, ctxErr)
}
// isArchiveComplete reports whether doc's URL and DOM both indicate a
// finished archive.ph snapshot. The two signals must agree: a URL
// transition alone is not enough (the /wip/ page can occasionally redirect
// to a stub before content lands), and a DOM marker alone is not enough
// (the front page's own markup overlaps slightly).
func isArchiveComplete(doc extractor.Document, endpoint *url.URL) bool {
	parsed, parseErr := url.Parse(doc.URL())
	switch {
	case parseErr != nil:
		return false
	case !isFinalSnapshotURL(parsed, endpoint):
		return false
	default:
		return hasCompletionMarker(doc)
	}
}
// isFinalSnapshotURL reports whether u looks like a finished archive.ph
// snapshot URL. The hostname must match the endpoint and the path must
// match archivedIDPattern. /wip/, /submit and the front page are rejected.
func isFinalSnapshotURL(u, endpoint *url.URL) bool {
	if u == nil || endpoint == nil {
		return false
	}
	host := u.Hostname()
	if host != endpoint.Hostname() {
		// A redirect off-host (e.g. to the originally-archived URL) is
		// unusual for archive.ph but if it happened we'd accept it: the
		// snapshot was clearly produced and the caller asked us to land
		// somewhere useful.
		return host != ""
	}
	path := u.Path
	// Front page (empty or bare "/") is never a snapshot.
	if path == "" || path == "/" {
		return false
	}
	// Pending pages (/wip/, /submit, /newest) are never final.
	for _, pending := range pendingPathPatterns {
		if strings.HasPrefix(path, pending) {
			return false
		}
	}
	return archivedIDPattern.MatchString(path)
}
// hasCompletionMarker reports whether at least one of the known archive.ph
// completion DOM markers is present on doc.
func hasCompletionMarker(doc extractor.Node) bool {
	for _, selector := range completionSelectors {
		if doc.SelectFirst(selector) != nil {
			return true
		}
	}
	return false
}
+451 -9
View File
@@ -1,13 +1,23 @@
package archive package archive
import ( import (
"context"
"errors"
"fmt"
"net/url"
"sync"
"sync/atomic"
"testing" "testing"
"time" "time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
) )
// --- Config validation ---------------------------------------------------
func TestConfig_Validate_Defaults(t *testing.T) { func TestConfig_Validate_Defaults(t *testing.T) {
c := Config{} c := Config{}.validate()
c = c.validate()
if c.Endpoint != "https://archive.ph" { if c.Endpoint != "https://archive.ph" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph") t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
@@ -15,23 +25,455 @@ func TestConfig_Validate_Defaults(t *testing.T) {
if c.Timeout == nil { if c.Timeout == nil {
t.Fatal("Timeout should not be nil after validate") t.Fatal("Timeout should not be nil after validate")
} }
if *c.Timeout != 1*time.Hour { if *c.Timeout != DefaultTimeout {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour) t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout)
}
if DefaultTimeout != 5*time.Minute {
t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute)
} }
} }
func TestConfig_Validate_Preserves(t *testing.T) { func TestConfig_Validate_Preserves(t *testing.T) {
timeout := 5 * time.Minute timeout := 30 * time.Second
c := Config{ c := Config{
Endpoint: "https://archive.org", Endpoint: "https://archive.org",
Timeout: &timeout, Timeout: &timeout,
} }.validate()
c = c.validate()
if c.Endpoint != "https://archive.org" { if c.Endpoint != "https://archive.org" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org") t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
} }
if *c.Timeout != 5*time.Minute { if *c.Timeout != 30*time.Second {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute) t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second)
} }
} }
// --- URL validation ------------------------------------------------------

// TestIsFinalSnapshotURL table-drives the URL classifier over the known
// archive.ph URL shapes: front page, pending paths (/wip/, /submit,
// /newest), bare and source-suffixed snapshot ids, /o/-prefixed ids, and
// the deliberate off-host acceptance.
func TestIsFinalSnapshotURL(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	cases := []struct {
		name string
		raw  string
		want bool
	}{
		{"front-page-empty", "https://archive.ph/", false},
		{"front-page-bare", "https://archive.ph", false},
		{"wip", "https://archive.ph/wip/abc12", false},
		{"submit-trailing", "https://archive.ph/submit/?url=foo", false},
		{"submit-bare", "https://archive.ph/submit", false},
		{"short-id-too-short", "https://archive.ph/ab", false},
		{"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id
		{"short-id-5chars", "https://archive.ph/i9KU2", true},
		{"short-id-7chars", "https://archive.ph/aBcD9E2", true},
		{"o-prefix", "https://archive.ph/o/i9KU2", true},
		{"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true},
		{"id-with-source", "https://archive.ph/i9KU2/https://example.com", true},
		{"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			u, err := url.Parse(tc.raw)
			if err != nil {
				t.Fatalf("parse %q: %v", tc.raw, err)
			}
			got := isFinalSnapshotURL(u, endpoint)
			if got != tc.want {
				t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want)
			}
		})
	}
}
// --- DOM completion marker -----------------------------------------------

// TestHasCompletionMarker checks the detector against an empty document
// (no marker expected) and against a document exposing each known
// completion selector in turn (marker expected for every entry).
func TestHasCompletionMarker(t *testing.T) {
	t.Run("no markers", func(t *testing.T) {
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{},
			},
		}
		if hasCompletionMarker(doc) {
			t.Error("expected no completion marker on empty doc")
		}
	})
	for _, sel := range completionSelectors {
		sel := sel // pin loop variable for the subtest closure (pre-Go-1.22 semantics)
		t.Run("marker "+sel, func(t *testing.T) {
			doc := &extractortest.MockDocument{
				MockNode: extractortest.MockNode{
					Children: map[string]extractor.Nodes{
						sel: {&extractortest.MockNode{}},
					},
				},
			}
			if !hasCompletionMarker(doc) {
				t.Errorf("expected completion marker via %q", sel)
			}
		})
	}
}
// --- Selector cascade ----------------------------------------------------

// TestFindURLInput_Cascade verifies the url-input selector cascade:
// the earliest matching selector wins, later selectors act as fallbacks,
// and a total miss yields (nil, "").
func TestFindURLInput_Cascade(t *testing.T) {
	t.Run("first selector wins", func(t *testing.T) {
		// Both the first and second selectors match; the first must win.
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					urlInputSelectors[0]: {&extractortest.MockNode{}},
					urlInputSelectors[1]: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findURLInput(doc)
		if n == nil {
			t.Fatal("expected node")
		}
		if sel != urlInputSelectors[0] {
			t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0])
		}
	})
	t.Run("falls back through cascade", func(t *testing.T) {
		// Only the LAST selector matches.
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findURLInput(doc)
		if n == nil {
			t.Fatal("expected node from last fallback")
		}
		if sel != urlInputSelectors[len(urlInputSelectors)-1] {
			t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1])
		}
	})
	t.Run("all selectors miss", func(t *testing.T) {
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{},
			},
		}
		n, sel := findURLInput(doc)
		if n != nil {
			t.Error("expected nil node")
		}
		if sel != "" {
			t.Errorf("selector = %q, want empty", sel)
		}
	})
}
// TestFindSubmitButton_Cascade mirrors TestFindURLInput_Cascade for the
// submit-button selector list: first match wins, the last entry acts as
// the generic fallback, and a total miss yields a nil node.
func TestFindSubmitButton_Cascade(t *testing.T) {
	t.Run("first selector wins", func(t *testing.T) {
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					submitButtonSelectors[0]: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findSubmitButton(doc)
		if n == nil {
			t.Fatal("expected node")
		}
		if sel != submitButtonSelectors[0] {
			t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0])
		}
	})
	t.Run("falls back to button[type='submit']", func(t *testing.T) {
		// Use a known later-in-list selector.
		target := submitButtonSelectors[len(submitButtonSelectors)-1]
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{
					target: {&extractortest.MockNode{}},
				},
			},
		}
		n, sel := findSubmitButton(doc)
		if n == nil {
			t.Fatal("expected node from last fallback")
		}
		if sel != target {
			t.Errorf("selector = %q, want %q", sel, target)
		}
	})
	t.Run("all selectors miss", func(t *testing.T) {
		doc := &extractortest.MockDocument{
			MockNode: extractortest.MockNode{
				Children: map[string]extractor.Nodes{},
			},
		}
		n, _ := findSubmitButton(doc)
		if n != nil {
			t.Error("expected nil node")
		}
	})
}
// --- Transient status detection -----------------------------------------

// TestIsTransientStatus table-drives the 5xx classifier: nil and plain
// errors are never transient; wrapped ErrInvalidStatusCode errors are
// transient only for 5xx trailing status codes.
func TestIsTransientStatus(t *testing.T) {
	cases := []struct {
		name string
		err  error
		want bool
	}{
		{"nil", nil, false},
		{"plain error", errors.New("oops"), false},
		{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
		{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
		{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
		{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
		{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
		{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := isTransientStatus(tc.err); got != tc.want {
				t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
			}
		})
	}
}
// --- mutDoc: a Document whose URL + Children can be swapped under load --

// mutDoc is a hand-rolled extractor.Document whose URL and selector results
// can be swapped from another goroutine while pollUntilArchived reads them.
// Both fields are atomic.Values, which already provide race-free loads and
// stores.
//
// NOTE(review): the mutex only serializes concurrent setChildren callers;
// atomic.Value makes it unnecessary for reader safety, and setURL does not
// take it at all — confirm it's intentional before relying on it.
type mutDoc struct {
	mu       sync.Mutex
	urlValue atomic.Value // string
	children atomic.Value // map[string]extractor.Nodes
}

// Compile-time check that mutDoc satisfies the full Document interface.
var _ extractor.Document = (*mutDoc)(nil)

// newMutDoc returns a mutDoc primed with initialURL and an empty selector map.
func newMutDoc(initialURL string) *mutDoc {
	d := &mutDoc{}
	d.urlValue.Store(initialURL)
	d.children.Store(map[string]extractor.Nodes{})
	return d
}

// setURL atomically publishes a new URL for the document.
func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) }

// setChildren atomically publishes a new selector→nodes map.
func (d *mutDoc) setChildren(c map[string]extractor.Nodes) {
	d.mu.Lock()
	defer d.mu.Unlock()
	d.children.Store(c)
}

// The remaining methods are no-op stubs satisfying extractor.Document.
func (d *mutDoc) URL() string                                 { return d.urlValue.Load().(string) }
func (d *mutDoc) Refresh() error                              { return nil }
func (d *mutDoc) Close() error                                { return nil }
func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error   { return nil }
func (d *mutDoc) Content() (string, error)                    { return "", nil }
func (d *mutDoc) Text() (string, error)                       { return "", nil }
func (d *mutDoc) Attr(_ string) (string, error)               { return "", nil }
func (d *mutDoc) Screenshot() ([]byte, error)                 { return nil, nil }
func (d *mutDoc) Type(_ string) error                         { return nil }
func (d *mutDoc) Click() error                                { return nil }
func (d *mutDoc) SetHidden(_ bool) error                      { return nil }
func (d *mutDoc) SetAttribute(_, _ string) error              { return nil }

// Select returns the nodes registered for selector in the current map.
func (d *mutDoc) Select(selector string) extractor.Nodes {
	c := d.children.Load().(map[string]extractor.Nodes)
	return c[selector]
}

// SelectFirst returns the first node for selector, or nil when none match.
func (d *mutDoc) SelectFirst(selector string) extractor.Node {
	return d.Select(selector).First()
}

// ForEach applies fn to every node matching selector, stopping on error.
func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error {
	for _, n := range d.Select(selector) {
		if err := fn(n); err != nil {
			return err
		}
	}
	return nil
}
// --- pollUntilArchived ---------------------------------------------------

// TestPollUntilArchived_ContextCancelled_NeverCompletes pins the timeout
// path: a doc stuck on /wip/ plus an expiring deadline must surface both
// ErrArchiveIncomplete and the wrapped context.DeadlineExceeded.
func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	doc := newMutDoc("https://archive.ph/wip/abc12")
	// No completion markers; URL stays on /wip/.
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	err := pollUntilArchived(ctx, doc, endpoint)
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	if !errors.Is(err, ErrArchiveIncomplete) {
		t.Errorf("expected ErrArchiveIncomplete, got %v", err)
	}
	if !errors.Is(err, context.DeadlineExceeded) {
		t.Errorf("expected wrapped DeadlineExceeded, got %v", err)
	}
}
// Cancellation by the caller (not a deadline) must NOT be reported as
// ErrArchiveIncomplete; callers rely on that distinction to tell
// "archive.ph is wedged" apart from "we gave up".
func TestPollUntilArchived_CallerCancelled(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	stuck := newMutDoc("https://archive.ph/wip/abc12")

	ctx, cancel := context.WithCancel(context.Background())
	// Fire the cancellation after a brief delay so the polling loop has had
	// time to park inside its select.
	timer := time.AfterFunc(20*time.Millisecond, cancel)
	defer timer.Stop()

	err := pollUntilArchived(ctx, stuck, endpoint)
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	if errors.Is(err, ErrArchiveIncomplete) {
		t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err)
	}
	if !errors.Is(err, context.Canceled) {
		t.Errorf("expected wrapped context.Canceled, got %v", err)
	}
}
// Completion requires both signals: the URL leaving /wip/ AND the DOM
// completion marker appearing. Stage them separately and confirm the poll
// only returns once both are present.
func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	doc := newMutDoc("https://archive.ph/wip/abc12")
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	go func() {
		// Step 1: the URL flips to a final-looking path. On its own this
		// must not satisfy the poll.
		time.Sleep(40 * time.Millisecond)
		doc.setURL("https://archive.ph/i9KU2")

		// Step 2: the completion marker shows up; now the poll may finish.
		time.Sleep(60 * time.Millisecond)
		doc.setChildren(map[string]extractor.Nodes{
			"div#HEADER": {&extractortest.MockNode{}},
		})
	}()

	if err := pollUntilArchived(ctx, doc, endpoint); err != nil {
		t.Fatalf("expected nil after URL+marker transition, got %v", err)
	}
	if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) {
		t.Errorf("final URL %q does not look like a snapshot", doc.URL())
	}
}
// A final-looking URL alone is not completion: without the DOM marker the
// poll must run into its deadline and report ErrArchiveIncomplete.
func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")
	doc := newMutDoc("https://archive.ph/wip/abc12")
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	timer := time.AfterFunc(10*time.Millisecond, func() {
		doc.setURL("https://archive.ph/i9KU2") // looks final but no marker
	})
	defer timer.Stop()

	if err := pollUntilArchived(ctx, doc, endpoint); !errors.Is(err, ErrArchiveIncomplete) {
		t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
	}
}
// --- isArchiveComplete combination ---------------------------------------
// TestIsArchiveComplete enumerates the URL/marker combinations: completion
// requires BOTH a final-looking snapshot URL AND the div#HEADER marker.
func TestIsArchiveComplete(t *testing.T) {
	endpoint, _ := url.Parse("https://archive.ph")

	for _, tt := range []struct {
		name   string
		raw    string
		marker bool
		want   bool
	}{
		{"both ok", "https://archive.ph/i9KU2", true, true},
		{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
		{"final url no marker", "https://archive.ph/i9KU2", false, false},
		{"front page with marker", "https://archive.ph/", true, false},
	} {
		t.Run(tt.name, func(t *testing.T) {
			children := map[string]extractor.Nodes{}
			if tt.marker {
				children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
			}
			doc := &extractortest.MockDocument{
				URLValue: tt.raw,
				MockNode: extractortest.MockNode{Children: children},
			}
			if got := isArchiveComplete(doc, endpoint); got != tt.want {
				t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tt.raw, tt.marker, got, tt.want)
			}
		})
	}
}
// --- Archive: selector cascade failure path ------------------------------
// Note: the full Archive() flow drives a live browser. We can still cover
// the "form selectors all missing" branch via a custom Browser that returns
// a MockDocument with no children — the URL/typing path doesn't run because
// the selector lookup fails first.
// emptyFormBrowser is a stub extractor.Browser whose Open always hands back
// the same pre-built document, letting tests exercise Archive without
// driving a real browser.
type emptyFormBrowser struct {
	doc extractor.Document
}

// Close is a no-op; there is no underlying browser to shut down.
func (b *emptyFormBrowser) Close() error {
	return nil
}

// Open ignores its arguments and returns the canned document.
func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) {
	return b.doc, nil
}
// When none of the known submit-form selectors match, Archive must fail with
// ErrArchiveSelectorMissing rather than proceeding to type into nothing.
func TestArchive_SelectorMissing(t *testing.T) {
	blank := &extractortest.MockDocument{
		URLValue: "https://archive.ph/",
		MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}},
	}
	timeout := 200 * time.Millisecond
	cfg := Config{Timeout: &timeout}

	_, err := cfg.Archive(context.Background(), &emptyFormBrowser{doc: blank}, "https://example.com")
	if err == nil {
		t.Fatal("expected error when form selectors are missing")
	}
	if !errors.Is(err, ErrArchiveSelectorMissing) {
		t.Errorf("expected ErrArchiveSelectorMissing, got %v", err)
	}
}
// --- helpers -------------------------------------------------------------
// mustParse parses raw as a URL, failing the test immediately on error.
func mustParse(t *testing.T, raw string) *url.URL {
	t.Helper()
	parsed, err := url.Parse(raw)
	if err != nil {
		t.Fatalf("parse %q: %v", raw, err)
	}
	return parsed
}