From 45fa7c4e8f6de6d3c3f49afd77e0ecc7ffb11b03 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Fri, 15 May 2026 17:23:24 -0400 Subject: [PATCH] fix(archive): harden archive.ph submit/poll flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The archive.ph submission flow had several defects that caused Mort's summary fallback to return placeholder "Working..." pages instead of real archived content, or hang for the full timeout: - Context cancellation in the poll loop fell through to a final WaitForNetworkIdle and returned the doc as success. The function now returns a typed error (ErrArchiveIncomplete on deadline, wrapped ctx.Err() on caller cancel). - The poll only checked doc.URL() — if archive.ph's JS got wedged on /wip/, the loop spun until timeout. Completion now also requires a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only transitions don't satisfy the check. - The final URL is now validated against an alphanumeric ID pattern, rejecting /wip/, /submit, /newest/ and the front page. - 5-second blind sleep before polling replaced with a bounded WaitForNetworkIdle that short-circuits when already archived. - Form selectors now use a cascade (input[name='url'] → input[type='url'] → input.input-url → input[name='anyway'], and similar for the submit button) so a single archive.ph markup change doesn't kill the flow. Errors name which selectors were tried. - Default timeout lowered from 1 hour to 5 minutes (still overridable via context deadline). Exposed as DefaultTimeout. - Poll progress is now logged at slog.Info every 30s so production logs surface stuck flows. - Front-page 5xx now retries twice with 1s/4s backoff before failing. - New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing. - Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix), selector cascade, DOM completion detector, transient status classification, and ctx cancellation paths via a thread-safe mutating mock document. Full integration with a live browser remains hand-tested. Co-Authored-By: Claude Opus 4.7 (1M context) --- sites/archive/archive.go | 458 ++++++++++++++++++++++++++++----- sites/archive/archive_test.go | 460 +++++++++++++++++++++++++++++++++- 2 files changed, 840 insertions(+), 78 deletions(-) diff --git a/sites/archive/archive.go b/sites/archive/archive.go index 94e9fcc..bd8d15b 100644 --- a/sites/archive/archive.go +++ b/sites/archive/archive.go @@ -1,3 +1,17 @@ +// Package archive provides a thin wrapper around archive.ph (a.k.a. +// archive.today) for two operations: +// +// - IsArchived: check whether a target URL already has a snapshot, and +// return the snapshot Document if so. +// - Archive: submit a target URL to archive.ph, poll until the snapshot is +// complete (or the context is cancelled), and return the resulting +// Document. +// +// The submit flow is intentionally defensive: archive.ph occasionally rotates +// its form markup, its front page sometimes 5xx's, and the in-progress +// "/wip/" pages can hang indefinitely if their JS gets wedged. The +// implementation in this file documents and tests each of those failure modes +// rather than papering over them. package archive import ( @@ -6,26 +20,121 @@ import ( "fmt" "log/slog" "net/url" + "regexp" "strings" "time" "gitea.stevedudenhoeffer.com/steve/go-extractor" ) +// ErrArchiveIncomplete is returned when archive.ph never transitions away +// from the /wip/ (work-in-progress) or /submit placeholder pages within the +// configured timeout. 
Callers can errors.Is against this sentinel to
+// distinguish "archive.ph is slow / wedged" from "we got cancelled".
+var ErrArchiveIncomplete = errors.New("archive: archive.ph did not finish before timeout")
+
+// ErrArchiveSelectorMissing is returned when archive.ph's front-page submit
+// form cannot be found by any of the known fallback selectors. This usually
+// means archive.ph rotated its markup and the cascade in this file needs to
+// be updated.
+var ErrArchiveSelectorMissing = errors.New("archive: required submit-form element not found on archive.ph")
+
+// urlInputSelectors and submitButtonSelectors are tried in order when
+// locating the submit form on archive.ph's front page. Updating one of
+// these in response to archive.ph DOM churn should not require touching
+// the rest of the file.
+var urlInputSelectors = []string{
+	"input[name='url']",
+	"input[type='url']",
+	"input.input-url",
+	"input[name='anyway']",
+}
+
+var submitButtonSelectors = []string{
+	"form#submiturl input[type='submit']",
+	"form#submiturl button[type='submit']",
+	"input[type='submit'][value*='save' i]",
+	"button[type='submit']",
+}
+
+// completionSelectors are DOM markers that, when present, indicate the page
+// is a finished archived snapshot rather than the /wip/ placeholder.
+// archive.ph snapshots wrap the page in a header bar + share box; both
+// vary slightly across snapshots so we accept any of them.
+var completionSelectors = []string{
+	"div#HEADER",
+	"#HEADER",
+	"div[id^='SHARE']",
+	"#SHARE",
+	"div.TEXT-BLOCK",
+	".TEXT-BLOCK",
+}
+
+// archivedIDPattern matches a final archive.ph snapshot URL path.
+// archive.ph identifiers are short alphanumeric codes (typically 5+ chars)
+// and the snapshot URL is either:
+//
+//	https://archive.ph/<id>
+//	https://archive.ph/<id>/<original-url>
+//	https://archive.ph/o/<id>
+//	https://archive.ph/o/<id>/<original-url>
+//
+// The pattern matches only the leading <id> path segment; callers should also
+// check the hostname matches archive.ph (or whatever endpoint was configured).
+var archivedIDPattern = regexp.MustCompile(`^/(?:o/)?[A-Za-z0-9]{5,}(?:/|$)`)
+
+// pendingPathPatterns lists path prefixes that mean the snapshot is NOT
+// finished yet — work-in-progress, the submit endpoint, or the lookup
+// endpoint (/newest/) which redirects through to a snapshot URL.
+var pendingPathPatterns = []string{"/wip/", "/submit", "/submit/", "/newest/", "/newest"}
+
+const (
+	// DefaultTimeout is the default upper bound on a single Archive call.
+	// archive.ph normally finishes within seconds; the 5-minute ceiling is
+	// generous enough to cover slow targets while still surfacing wedged
+	// flows to the caller in a reasonable time.
+	DefaultTimeout = 5 * time.Minute
+
+	// defaultPollInterval is how often the polling loop re-checks the
+	// document's URL and DOM for completion markers. archive.ph snapshots
+	// typically finish within seconds; a tight interval makes the call
+	// return promptly without measurable cost (a couple of DOM selectors
+	// against an already-open page).
+	defaultPollInterval = 1 * time.Second
+
+	// defaultProgressLogInterval is how often the polling loop emits a
+	// slog.Info progress line so production logs surface stuck flows.
+	defaultProgressLogInterval = 30 * time.Second
+
+	// defaultInitialIdleWait is the soft cap on how long we wait for the
+	// initial post-submit page to settle. Already-archived URLs typically
+	// redirect to the snapshot almost immediately; new submissions take
+	// longer and the poll loop picks up after this.
+	defaultInitialIdleWait = 8 * time.Second
+
+	// frontPageRetries is the number of additional attempts to open the
+	// archive.ph front page when it returns a 5xx (their own infra
+	// occasionally hiccups).
+	frontPageRetries = 2
+)
+
+// frontPageBackoffs is the backoff schedule between front-page retries.
+// len(frontPageBackoffs) must equal frontPageRetries.
+var frontPageBackoffs = []time.Duration{1 * time.Second, 4 * time.Second}
+
 type Config struct {
 	// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
 	Endpoint string
 
 	// Timeout will, if set, cancel any Archive call after this duration.
-	// If nil, the default timeout of 1 hour will be used.
-	Timeout *time.Duration // Timeout for the request, defaults to 1 hour
+	// If nil, DefaultTimeout (5 minutes) is used.
+	Timeout *time.Duration
 }
 
 // validate validates the config and sets default values if necessary.
 func (c Config) validate() Config {
 	if c.Timeout == nil {
-		def := 1 * time.Hour
+		def := DefaultTimeout
 		c.Timeout = &def
 	}
 
@@ -38,7 +147,13 @@
 
 var DefaultConfig = Config{}
 
-// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
+// IsArchived checks if a url is archived. It returns the archived Document if
+// it is archived, or (nil, nil) if archive.ph has no snapshot for it.
+//
+// Why: callers (e.g. Mort's summary system) want to avoid submitting an
+// archive request when one already exists.
+// What: opens archive.ph/newest/<target> and returns the resulting Document.
+// Test: see archive_test.go TestIsArchived_* (mock-browser based).
 func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
 	c = c.validate()
 	u, err := url.Parse(target)
@@ -52,13 +167,11 @@
 	}
 
 	uri := endpoint.JoinPath("/newest")
-
 	uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
 
-	slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
+	slog.Info("checking if url is archived", "url", uri.String(), "endpoint", endpoint)
 
 	doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
-
 	if err != nil {
 		if errors.Is(err, extractor.ErrPageNotFound) {
 			if doc != nil {
@@ -82,19 +195,35 @@
 	return doc, nil
 }
 
+// IsArchived is a convenience wrapper around DefaultConfig.IsArchived.
 func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
 	return DefaultConfig.IsArchived(ctx, b, target)
 }
 
+// Archive submits target to archive.ph and polls until the snapshot is
+// complete (transitioned away from /wip/ AND a known DOM completion marker
+// is present), or the context is cancelled / the timeout fires.
+//
+// Why: when Mort's summary system gets bot-checked on the live site it
+// falls back to reading the archive.ph snapshot. The previous implementation
+// was happy to return mid-submission /wip/ pages as "success" (placeholder
+// "Working..." pages with no real content) which made the fallback useless.
+// What: opens archive.ph's front page, types the target URL into the submit
+// form, clicks submit, and polls for completion. Returns a typed error if
+// archive.ph doesn't finish in time so callers can errors.Is and degrade.
+// Test: archive_test.go covers the URL validator, selector cascade, the
+// completion detector, and the ctx-cancellation path. The full integration
+// flow requires a live browser + archive.ph and is hand-tested.
 func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
 	c = c.validate()
-	var cancel context.CancelFunc
 	if c.Timeout != nil {
+		var cancel context.CancelFunc
 		ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
-		slog.Info("setting timeout", "timeout", *c.Timeout)
+		slog.Info("archive: setting timeout", "timeout", *c.Timeout)
 		defer cancel()
 	}
+
 	u, err := url.Parse(target)
 	if err != nil {
 		return nil, fmt.Errorf("invalid url: %w", err)
 	}
@@ -105,87 +234,278 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
 		return nil, fmt.Errorf("invalid endpoint: %w", err)
 	}
 
-	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
-
+	doc, err := openArchiveFrontPage(ctx, b, c.Endpoint)
 	if err != nil {
-		// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
-		// captcha) the page is kept open by AllowNonOKStatus so the caller
-		// can promote it. Return both the doc and the wrapped error.
-		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
-			return doc, fmt.Errorf("failed to open url: %w", err)
-		}
-		if doc != nil {
-			_ = doc.Close()
-		}
-		return nil, fmt.Errorf("failed to open url: %w", err)
+		// openArchiveFrontPage can return both doc and err (a non-5xx bad
+		// status such as a captcha page is kept open so the caller can
+		// promote it); pass both through rather than leaking the document.
+		return doc, err
 	}
 
-	urlInput := doc.SelectFirst("input[name='url']")
+	// Fill and submit the form. doc has the page; any error past this point
+	// must close it.
+	urlInput, urlSelector := findURLInput(doc)
 	if urlInput == nil {
 		_ = doc.Close()
-		return nil, fmt.Errorf("failed to find url input element")
+		return nil, fmt.Errorf("%w: tried url-input selectors %v", ErrArchiveSelectorMissing, urlInputSelectors)
 	}
 
-	err = urlInput.Type(u.String())
-	if err != nil {
+	if err = urlInput.Type(u.String()); err != nil {
 		_ = doc.Close()
-		return nil, fmt.Errorf("failed to type url: %w", err)
+		return nil, fmt.Errorf("failed to type url into %q: %w", urlSelector, err)
 	}
 
-	submitBtn := doc.SelectFirst("form#submiturl input[type=\"submit\"]")
+	submitBtn, submitSelector := findSubmitButton(doc)
 	if submitBtn == nil {
 		_ = doc.Close()
-		return nil, fmt.Errorf("failed to find submit button")
+		return nil, fmt.Errorf("%w: tried submit-button selectors %v", ErrArchiveSelectorMissing, submitButtonSelectors)
 	}
 
-	err = submitBtn.Click()
-	if err != nil {
+	if err = submitBtn.Click(); err != nil {
 		_ = doc.Close()
-		return nil, fmt.Errorf("failed to click submit: %w", err)
+		return nil, fmt.Errorf("failed to click submit %q: %w", submitSelector, err)
 	}
 
-	// wait for the page to load, but respect context cancellation
-	select {
-	case <-ctx.Done():
-		slog.Debug("context done during initial wait", "err", ctx.Err())
+	// Initial soft idle wait so the post-submit redirect has a chance to
+	// land before we start polling. Already-archived URLs short-circuit
+	// here; new submissions fall through to the polling loop.
+	initialWait := defaultInitialIdleWait
+	if err = doc.WaitForNetworkIdle(&initialWait); err != nil {
+		// Network-idle timing out is normal on archive.ph during a fresh
+		// submission (the /wip/ page polls itself). Don't treat it as
+		// fatal — let the polling loop decide.
+ slog.Debug("archive: initial WaitForNetworkIdle returned", "err", err) + } + + if err = ctx.Err(); err != nil { _ = doc.Close() - return nil, ctx.Err() - case <-time.After(5 * time.Second): - } - // now we are waiting for archive.ph to archive the page and redirect us to the archived page - // the way we can tell this is happening is by checking the url of the page periodically - // if the page path starts with /wip/ then we are still waiting - // also periodically refresh the page just in case - - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - keepGoing := true - for keepGoing { - select { - case <-ctx.Done(): - slog.Info("context done") - keepGoing = false - - case <-ticker.C: - archivedUrl, err := url.Parse(doc.URL()) - - if err != nil { - continue - } - - slog.Debug("checking url", "url", archivedUrl.String()) - // if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done - if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) { - keepGoing = false - break - } - } + return nil, err } - return doc, doc.WaitForNetworkIdle(nil) + // Poll until either (a) the page transitions to a finished snapshot, + // (b) the context is cancelled, or (c) the timeout fires (which also + // cancels ctx). + if err = pollUntilArchived(ctx, doc, endpoint); err != nil { + _ = doc.Close() + return nil, err + } + + // Final settle: best-effort wait for in-flight asset loads on the + // snapshot itself so a downstream Readability call sees stable DOM. + settle := 10 * time.Second + if err = doc.WaitForNetworkIdle(&settle); err != nil { + slog.Debug("archive: final WaitForNetworkIdle returned", "err", err) + } + + return doc, nil } +// Archive is a convenience wrapper around DefaultConfig.Archive. func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { return DefaultConfig.Archive(ctx, b, target) } + +// openArchiveFrontPage opens the archive.ph front page, retrying up to +// frontPageRetries times on 5xx responses. ErrInvalidStatusCode with a +// non-5xx status (e.g. 403 + Cloudflare captcha) is returned immediately +// along with the open document, mirroring the IsArchived contract so a +// caller can promote it to an InteractiveBrowser. +// +// Why: archive.ph's own infrastructure occasionally serves 5xx during +// load spikes; a single retry generally clears it. +// What: calls Browser.Open with AllowNonOKStatus, retrying transient 5xx. +// Test: not unit-tested (would require a fake browser that produces 5xx +// then 200); behaviour anchored by the retry count + backoff constants. +func openArchiveFrontPage(ctx context.Context, b extractor.Browser, endpoint string) (extractor.Document, error) { + var lastErr error + for attempt := 0; attempt <= frontPageRetries; attempt++ { + if err := ctx.Err(); err != nil { + return nil, err + } + + doc, err := b.Open(ctx, endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true}) + if err == nil { + return doc, nil + } + + // ErrInvalidStatusCode with the doc kept open means the caller can + // promote it to interactive (captcha). We don't retry these — the + // underlying page is what the caller wants. + if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil { + if isTransientStatus(err) && attempt < frontPageRetries { + // 5xx — close, back off, retry. 
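+				// The schedule comes from frontPageBackoffs: 1s after the
+				// first failure, 4s after the second.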
+				_ = doc.Close()
+				lastErr = err
+				slog.Warn("archive: archive.ph returned transient status, retrying", "attempt", attempt+1, "err", err)
+				if !sleepOrCancel(ctx, frontPageBackoffs[attempt]) {
+					return nil, ctx.Err()
+				}
+				continue
+			}
+			return doc, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
+		}
+
+		if doc != nil {
+			_ = doc.Close()
+		}
+		lastErr = err
+		// Don't retry on non-status errors (browser-level failures).
+		return nil, fmt.Errorf("failed to open archive endpoint %q: %w", endpoint, err)
+	}
+	return nil, fmt.Errorf("failed to open archive endpoint %q after %d retries: %w", endpoint, frontPageRetries, lastErr)
+}
+
+// isTransientStatus reports whether err wraps an HTTP 5xx status from
+// ErrInvalidStatusCode that we should retry.
+func isTransientStatus(err error) bool {
+	if !errors.Is(err, extractor.ErrInvalidStatusCode) {
+		return false
+	}
+	// The wrapped error message has the form "invalid status code: <code>".
+	// Parse the trailing integer for the 5xx check.
+	msg := err.Error()
+	// Find the last space and parse what follows.
+	idx := strings.LastIndex(msg, " ")
+	if idx < 0 || idx == len(msg)-1 {
+		return false
+	}
+	tail := msg[idx+1:]
+	if len(tail) != 3 {
+		return false
+	}
+	return tail[0] == '5'
+}
+
+// sleepOrCancel blocks for d, returning true if it slept the full duration
+// and false if ctx was cancelled first.
+func sleepOrCancel(ctx context.Context, d time.Duration) bool {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+	select {
+	case <-ctx.Done():
+		return false
+	case <-timer.C:
+		return true
+	}
+}
+
+// findURLInput tries each selector in urlInputSelectors until one matches,
+// returning the Node and the selector that produced it.
+func findURLInput(doc extractor.Document) (extractor.Node, string) {
+	for _, sel := range urlInputSelectors {
+		if n := doc.SelectFirst(sel); n != nil {
+			return n, sel
+		}
+	}
+	return nil, ""
+}
+
+// findSubmitButton tries each selector in submitButtonSelectors until one
+// matches, returning the Node and the selector that produced it.
+func findSubmitButton(doc extractor.Document) (extractor.Node, string) {
+	for _, sel := range submitButtonSelectors {
+		if n := doc.SelectFirst(sel); n != nil {
+			return n, sel
+		}
+	}
+	return nil, ""
+}
+
+// pollUntilArchived watches doc until its URL transitions to a final
+// archive.ph snapshot URL AND a known completion DOM marker is present.
+// Returns ErrArchiveIncomplete if the context fires while still on /wip/
+// or /submit, and ctx.Err() if the context was cancelled by the caller for
+// other reasons (deadline-exceeded surfaces as ErrArchiveIncomplete because
+// it's almost always the configured Timeout firing).
+func pollUntilArchived(ctx context.Context, doc extractor.Document, endpoint *url.URL) error {
+	ticker := time.NewTicker(defaultPollInterval)
+	defer ticker.Stop()
+
+	progressTicker := time.NewTicker(defaultProgressLogInterval)
+	defer progressTicker.Stop()
+
+	for {
+		// Check on entry as well, so a context that's already cancelled
+		// produces a useful error rather than a spurious "incomplete".
+ if err := ctx.Err(); err != nil { + return classifyPollError(err, doc) + } + + if isArchiveComplete(doc, endpoint) { + slog.Info("archive: snapshot complete", "url", doc.URL()) + return nil + } + + select { + case <-ctx.Done(): + return classifyPollError(ctx.Err(), doc) + case <-progressTicker.C: + slog.Info("archive: still waiting for archive.ph", "url", doc.URL()) + case <-ticker.C: + // fall through to top-of-loop completion check + } + } +} + +// classifyPollError maps a context error into either ErrArchiveIncomplete +// (when the doc is still on a /wip/ or /submit page and the timeout fired) +// or the underlying ctx error (when the caller cancelled for other reasons). +func classifyPollError(ctxErr error, doc extractor.Document) error { + if ctxErr == nil { + return nil + } + currentURL := doc.URL() + if errors.Is(ctxErr, context.DeadlineExceeded) { + return fmt.Errorf("%w (last url: %s): %w", ErrArchiveIncomplete, currentURL, ctxErr) + } + return fmt.Errorf("archive: cancelled while polling (last url: %s): %w", currentURL, ctxErr) +} + +// isArchiveComplete reports whether doc's URL and DOM indicate a finished +// archive.ph snapshot. Both signals must agree: a URL transition alone is +// not enough (the /wip/ page can occasionally redirect to a stub before +// content lands), and a DOM marker alone is not enough (the front page's +// own markup overlaps slightly). +func isArchiveComplete(doc extractor.Document, endpoint *url.URL) bool { + current, err := url.Parse(doc.URL()) + if err != nil { + return false + } + if !isFinalSnapshotURL(current, endpoint) { + return false + } + return hasCompletionMarker(doc) +} + +// isFinalSnapshotURL reports whether u looks like a finished archive.ph +// snapshot URL. The hostname must match the endpoint and the path must +// match archivedIDPattern. /wip/, /submit and the front page are rejected. +func isFinalSnapshotURL(u, endpoint *url.URL) bool { + if u == nil || endpoint == nil { + return false + } + if u.Hostname() != endpoint.Hostname() { + // A redirect off-host (e.g. to the originally-archived URL) is + // unusual for archive.ph but if it happened we'd accept it: the + // snapshot was clearly produced and the caller asked us to land + // somewhere useful. + return u.Hostname() != "" + } + for _, prefix := range pendingPathPatterns { + if strings.HasPrefix(u.Path, prefix) { + return false + } + } + if u.Path == "" || u.Path == "/" { + return false + } + return archivedIDPattern.MatchString(u.Path) +} + +// hasCompletionMarker reports whether doc has at least one of the known +// archive.ph completion DOM markers. 
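+// It takes the narrower extractor.Node interface (rather than Document) so
+// tests can pass lightweight mocks as well as full documents.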
+func hasCompletionMarker(doc extractor.Node) bool { + for _, sel := range completionSelectors { + if n := doc.SelectFirst(sel); n != nil { + return true + } + } + return false +} diff --git a/sites/archive/archive_test.go b/sites/archive/archive_test.go index d219530..4b42f0c 100644 --- a/sites/archive/archive_test.go +++ b/sites/archive/archive_test.go @@ -1,13 +1,23 @@ package archive import ( + "context" + "errors" + "fmt" + "net/url" + "sync" + "sync/atomic" "testing" "time" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" + "gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest" ) +// --- Config validation --------------------------------------------------- + func TestConfig_Validate_Defaults(t *testing.T) { - c := Config{} - c = c.validate() + c := Config{}.validate() if c.Endpoint != "https://archive.ph" { t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph") @@ -15,23 +25,455 @@ func TestConfig_Validate_Defaults(t *testing.T) { if c.Timeout == nil { t.Fatal("Timeout should not be nil after validate") } - if *c.Timeout != 1*time.Hour { - t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour) + if *c.Timeout != DefaultTimeout { + t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout) + } + if DefaultTimeout != 5*time.Minute { + t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute) } } func TestConfig_Validate_Preserves(t *testing.T) { - timeout := 5 * time.Minute + timeout := 30 * time.Second c := Config{ Endpoint: "https://archive.org", Timeout: &timeout, - } - c = c.validate() + }.validate() if c.Endpoint != "https://archive.org" { t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org") } - if *c.Timeout != 5*time.Minute { - t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute) + if *c.Timeout != 30*time.Second { + t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second) } } + +// --- URL validation ------------------------------------------------------ + +func TestIsFinalSnapshotURL(t *testing.T) { + endpoint, _ := url.Parse("https://archive.ph") + + cases := []struct { + name string + raw string + want bool + }{ + {"front-page-empty", "https://archive.ph/", false}, + {"front-page-bare", "https://archive.ph", false}, + {"wip", "https://archive.ph/wip/abc12", false}, + {"submit-trailing", "https://archive.ph/submit/?url=foo", false}, + {"submit-bare", "https://archive.ph/submit", false}, + {"short-id-too-short", "https://archive.ph/ab", false}, + {"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id + {"short-id-5chars", "https://archive.ph/i9KU2", true}, + {"short-id-7chars", "https://archive.ph/aBcD9E2", true}, + {"o-prefix", "https://archive.ph/o/i9KU2", true}, + {"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true}, + {"id-with-source", "https://archive.ph/i9KU2/https://example.com", true}, + {"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + u, err := url.Parse(tc.raw) + if err != nil { + t.Fatalf("parse %q: %v", tc.raw, err) + } + got := isFinalSnapshotURL(u, endpoint) + if got != tc.want { + t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want) + } + }) + } +} + +// --- DOM completion marker ----------------------------------------------- + +func TestHasCompletionMarker(t *testing.T) { + t.Run("no markers", func(t *testing.T) { + doc := 
&extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{}, + }, + } + if hasCompletionMarker(doc) { + t.Error("expected no completion marker on empty doc") + } + }) + + for _, sel := range completionSelectors { + sel := sel + t.Run("marker "+sel, func(t *testing.T) { + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + sel: {&extractortest.MockNode{}}, + }, + }, + } + if !hasCompletionMarker(doc) { + t.Errorf("expected completion marker via %q", sel) + } + }) + } +} + +// --- Selector cascade ---------------------------------------------------- + +func TestFindURLInput_Cascade(t *testing.T) { + t.Run("first selector wins", func(t *testing.T) { + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + urlInputSelectors[0]: {&extractortest.MockNode{}}, + urlInputSelectors[1]: {&extractortest.MockNode{}}, + }, + }, + } + n, sel := findURLInput(doc) + if n == nil { + t.Fatal("expected node") + } + if sel != urlInputSelectors[0] { + t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0]) + } + }) + + t.Run("falls back through cascade", func(t *testing.T) { + // Only the LAST selector matches. + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}}, + }, + }, + } + n, sel := findURLInput(doc) + if n == nil { + t.Fatal("expected node from last fallback") + } + if sel != urlInputSelectors[len(urlInputSelectors)-1] { + t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1]) + } + }) + + t.Run("all selectors miss", func(t *testing.T) { + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{}, + }, + } + n, sel := findURLInput(doc) + if n != nil { + t.Error("expected nil node") + } + if sel != "" { + t.Errorf("selector = %q, want empty", sel) + } + }) +} + +func TestFindSubmitButton_Cascade(t *testing.T) { + t.Run("first selector wins", func(t *testing.T) { + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + submitButtonSelectors[0]: {&extractortest.MockNode{}}, + }, + }, + } + n, sel := findSubmitButton(doc) + if n == nil { + t.Fatal("expected node") + } + if sel != submitButtonSelectors[0] { + t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0]) + } + }) + + t.Run("falls back to button[type='submit']", func(t *testing.T) { + // Use a known later-in-list selector. 
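+		// (The last entry is button[type='submit'], the broadest fallback.)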
+		target := submitButtonSelectors[len(submitButtonSelectors)-1]
+		doc := &extractortest.MockDocument{
+			MockNode: extractortest.MockNode{
+				Children: map[string]extractor.Nodes{
+					target: {&extractortest.MockNode{}},
+				},
+			},
+		}
+		n, sel := findSubmitButton(doc)
+		if n == nil {
+			t.Fatal("expected node from last fallback")
+		}
+		if sel != target {
+			t.Errorf("selector = %q, want %q", sel, target)
+		}
+	})
+
+	t.Run("all selectors miss", func(t *testing.T) {
+		doc := &extractortest.MockDocument{
+			MockNode: extractortest.MockNode{
+				Children: map[string]extractor.Nodes{},
+			},
+		}
+		n, _ := findSubmitButton(doc)
+		if n != nil {
+			t.Error("expected nil node")
+		}
+	})
+}
+
+// --- Transient status detection -----------------------------------------
+
+func TestIsTransientStatus(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"nil", nil, false},
+		{"plain error", errors.New("oops"), false},
+		{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
+		{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
+		{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
+		{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
+		{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
+		{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := isTransientStatus(tc.err); got != tc.want {
+				t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
+			}
+		})
+	}
+}
+
+// --- mutDoc: a Document whose URL + Children can be swapped under load --
+
+// mutDoc is a minimal extractor.Document whose URL and selectable children
+// are held in atomic.Values, so a test goroutine can swap them while the
+// polling loop reads consistent values; a mutex serializes setChildren writers.
+type mutDoc struct { + mu sync.Mutex + urlValue atomic.Value // string + children atomic.Value // map[string]extractor.Nodes +} + +var _ extractor.Document = (*mutDoc)(nil) + +func newMutDoc(initialURL string) *mutDoc { + d := &mutDoc{} + d.urlValue.Store(initialURL) + d.children.Store(map[string]extractor.Nodes{}) + return d +} + +func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) } +func (d *mutDoc) setChildren(c map[string]extractor.Nodes) { + d.mu.Lock() + defer d.mu.Unlock() + d.children.Store(c) +} + +func (d *mutDoc) URL() string { return d.urlValue.Load().(string) } +func (d *mutDoc) Refresh() error { return nil } +func (d *mutDoc) Close() error { return nil } +func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil } +func (d *mutDoc) Content() (string, error) { return "", nil } +func (d *mutDoc) Text() (string, error) { return "", nil } +func (d *mutDoc) Attr(_ string) (string, error) { return "", nil } +func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil } +func (d *mutDoc) Type(_ string) error { return nil } +func (d *mutDoc) Click() error { return nil } +func (d *mutDoc) SetHidden(_ bool) error { return nil } +func (d *mutDoc) SetAttribute(_, _ string) error { return nil } + +func (d *mutDoc) Select(selector string) extractor.Nodes { + c := d.children.Load().(map[string]extractor.Nodes) + return c[selector] +} + +func (d *mutDoc) SelectFirst(selector string) extractor.Node { + return d.Select(selector).First() +} + +func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error { + for _, n := range d.Select(selector) { + if err := fn(n); err != nil { + return err + } + } + return nil +} + +// --- pollUntilArchived --------------------------------------------------- + +func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) { + endpoint, _ := url.Parse("https://archive.ph") + doc := newMutDoc("https://archive.ph/wip/abc12") + // No completion markers; URL stays on /wip/. + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + + err := pollUntilArchived(ctx, doc, endpoint) + if err == nil { + t.Fatal("expected error, got nil") + } + if !errors.Is(err, ErrArchiveIncomplete) { + t.Errorf("expected ErrArchiveIncomplete, got %v", err) + } + if !errors.Is(err, context.DeadlineExceeded) { + t.Errorf("expected wrapped DeadlineExceeded, got %v", err) + } +} + +func TestPollUntilArchived_CallerCancelled(t *testing.T) { + endpoint, _ := url.Parse("https://archive.ph") + doc := newMutDoc("https://archive.ph/wip/abc12") + + ctx, cancel := context.WithCancel(context.Background()) + // Cancel after a brief delay so the polling loop is already inside its + // select. + go func() { + time.Sleep(20 * time.Millisecond) + cancel() + }() + + err := pollUntilArchived(ctx, doc, endpoint) + if err == nil { + t.Fatal("expected error, got nil") + } + if errors.Is(err, ErrArchiveIncomplete) { + t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err) + } + if !errors.Is(err, context.Canceled) { + t.Errorf("expected wrapped context.Canceled, got %v", err) + } +} + +func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) { + endpoint, _ := url.Parse("https://archive.ph") + doc := newMutDoc("https://archive.ph/wip/abc12") + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + // After a short delay, transition to a final URL but WITHOUT a DOM + // marker. Poll should keep waiting. Then add the marker. 
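+	// Timing: the poll loop only re-checks every defaultPollInterval (1s),
+	// so the 40ms/100ms mutations below are both seen at the first tick,
+	// well inside the 2s deadline.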
go func() {
+		time.Sleep(40 * time.Millisecond)
+		doc.setURL("https://archive.ph/i9KU2")
+		// No marker yet — poll should still wait.
+		time.Sleep(60 * time.Millisecond)
+		doc.setChildren(map[string]extractor.Nodes{
+			"div#HEADER": {&extractortest.MockNode{}},
+		})
+	}()
+
+	err := pollUntilArchived(ctx, doc, endpoint)
+	if err != nil {
+		t.Fatalf("expected nil after URL+marker transition, got %v", err)
+	}
+	if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) {
+		t.Errorf("final URL %q does not look like a snapshot", doc.URL())
+	}
}

+func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
+	// URL transitions to a final-looking path but the DOM never grows a
+	// completion marker. Poll should hit the deadline.
+	endpoint, _ := url.Parse("https://archive.ph")
+	doc := newMutDoc("https://archive.ph/wip/abc12")
+
+	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
+	defer cancel()
+
+	go func() {
+		time.Sleep(10 * time.Millisecond)
+		doc.setURL("https://archive.ph/i9KU2") // looks final but no marker
+	}()
+
+	err := pollUntilArchived(ctx, doc, endpoint)
+	if !errors.Is(err, ErrArchiveIncomplete) {
+		t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
+	}
+}
+
+// --- isArchiveComplete combination ---------------------------------------
+
+func TestIsArchiveComplete(t *testing.T) {
+	endpoint, _ := url.Parse("https://archive.ph")
+
+	cases := []struct {
+		name   string
+		raw    string
+		marker bool
+		want   bool
+	}{
+		{"both ok", "https://archive.ph/i9KU2", true, true},
+		{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
+		{"final url no marker", "https://archive.ph/i9KU2", false, false},
+		{"front page with marker", "https://archive.ph/", true, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			children := map[string]extractor.Nodes{}
+			if tc.marker {
+				children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
+			}
+			doc := &extractortest.MockDocument{
+				URLValue: tc.raw,
+				MockNode: extractortest.MockNode{Children: children},
+			}
+			got := isArchiveComplete(doc, endpoint)
+			if got != tc.want {
+				t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tc.raw, tc.marker, got, tc.want)
+			}
+		})
+	}
+}
+
+// --- Archive: selector cascade failure path ------------------------------
+
+// Note: the full Archive() flow drives a live browser. We can still cover
+// the "form selectors all missing" branch via a custom Browser that returns
+// a MockDocument with no children — the URL/typing path doesn't run because
+// the selector lookup fails first.
+ +type emptyFormBrowser struct { + doc extractor.Document +} + +func (b *emptyFormBrowser) Close() error { return nil } +func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) { + return b.doc, nil +} + +func TestArchive_SelectorMissing(t *testing.T) { + doc := &extractortest.MockDocument{ + URLValue: "https://archive.ph/", + MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}}, + } + b := &emptyFormBrowser{doc: doc} + + timeout := 200 * time.Millisecond + _, err := (Config{Timeout: &timeout}).Archive(context.Background(), b, "https://example.com") + if err == nil { + t.Fatal("expected error when form selectors are missing") + } + if !errors.Is(err, ErrArchiveSelectorMissing) { + t.Errorf("expected ErrArchiveSelectorMissing, got %v", err) + } +} + +// --- helpers ------------------------------------------------------------- + +func mustParse(t *testing.T, raw string) *url.URL { + t.Helper() + u, err := url.Parse(raw) + if err != nil { + t.Fatalf("parse %q: %v", raw, err) + } + return u +} -- 2.52.0
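
Reviewer note, placed after the signature so it is not part of the commit
message or the diff: a minimal sketch of how a caller is expected to consume
the new sentinels. The sites/archive import path, the summaries package name,
and fetchArchived are assumptions for illustration only.

	package summaries

	import (
		"context"
		"errors"
		"log/slog"

		extractor "gitea.stevedudenhoeffer.com/steve/go-extractor"
		"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
	)

	// fetchArchived returns the archive.ph snapshot for target, degrading
	// gracefully when archive.ph is wedged or has rotated its markup.
	func fetchArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
		doc, err := archive.Archive(ctx, b, target)
		switch {
		case errors.Is(err, archive.ErrArchiveIncomplete):
			// archive.ph never left /wip/ before the (default 5m) timeout;
			// fall back to the live page instead of a "Working..." stub.
			slog.Warn("snapshot incomplete", "target", target, "err", err)
			return nil, err
		case errors.Is(err, archive.ErrArchiveSelectorMissing):
			// archive.ph rotated its form markup; the selector cascades in
			// sites/archive/archive.go need updating.
			return nil, err
		case err != nil:
			// Browser-level failure or caller cancellation. doc can be
			// non-nil here (a bad-status page kept open for promotion), so
			// hand it back to the caller untouched.
			return doc, err
		}
		// Success: a finished snapshot. Caller owns doc and must Close it.
		return doc, nil
	}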