Files
go-extractor/sites/archive/archive_test.go
T
steve 45fa7c4e8f
CI / build (pull_request) Successful in 1m5s
CI / vet (pull_request) Successful in 1m26s
CI / test (pull_request) Successful in 1m27s
fix(archive): harden archive.ph submit/poll flow
The archive.ph submission flow had several defects that caused Mort's
summary fallback to return placeholder "Working..." pages instead of
real archived content, or hang for the full timeout:

- Context cancellation in the poll loop fell through to a final
  WaitForNetworkIdle and returned the doc as success. The function now
  returns a typed error (ErrArchiveIncomplete on deadline, wrapped
  ctx.Err() on caller cancel).
- The poll only checked doc.URL() — if archive.ph's JS got wedged on
  /wip/<id>, the loop spun until timeout. Completion now also requires
  a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only
  transitions don't satisfy the check.
- The final URL is now validated against an alphanumeric ID pattern,
  rejecting /wip/, /submit, /newest/ and the front page.
- 5-second blind sleep before polling replaced with a bounded
  WaitForNetworkIdle that short-circuits when already archived.
- Form selectors now use a cascade (input[name='url'] →
  input[type='url'] → input.input-url → input[name='anyway'], and
  similar for the submit button) so a single archive.ph markup change
  doesn't kill the flow. Errors name which selectors were tried.
- Default timeout lowered from 1 hour to 5 minutes (still overridable
  via context deadline). Exposed as DefaultTimeout.
- Poll progress is now logged at slog.Info every 30s so production logs
  surface stuck flows.
- Front-page 5xx now retries twice with 1s/4s backoff before failing.
- New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing.
- Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix),
  selector cascade, DOM completion detector, transient status
  classification, and ctx cancellation paths via a thread-safe mutating
  mock document. Full integration with a live browser remains hand-tested.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 17:23:24 -04:00

480 lines
14 KiB
Go

package archive
import (
"context"
"errors"
"fmt"
"net/url"
"sync"
"sync/atomic"
"testing"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
)
// --- Config validation ---------------------------------------------------
func TestConfig_Validate_Defaults(t *testing.T) {
c := Config{}.validate()
if c.Endpoint != "https://archive.ph" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
}
if c.Timeout == nil {
t.Fatal("Timeout should not be nil after validate")
}
if *c.Timeout != DefaultTimeout {
t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout)
}
if DefaultTimeout != 5*time.Minute {
t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute)
}
}
func TestConfig_Validate_Preserves(t *testing.T) {
timeout := 30 * time.Second
c := Config{
Endpoint: "https://archive.org",
Timeout: &timeout,
}.validate()
if c.Endpoint != "https://archive.org" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
}
if *c.Timeout != 30*time.Second {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second)
}
}
// --- URL validation ------------------------------------------------------
func TestIsFinalSnapshotURL(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
cases := []struct {
name string
raw string
want bool
}{
{"front-page-empty", "https://archive.ph/", false},
{"front-page-bare", "https://archive.ph", false},
{"wip", "https://archive.ph/wip/abc12", false},
{"submit-trailing", "https://archive.ph/submit/?url=foo", false},
{"submit-bare", "https://archive.ph/submit", false},
{"short-id-too-short", "https://archive.ph/ab", false},
{"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id
{"short-id-5chars", "https://archive.ph/i9KU2", true},
{"short-id-7chars", "https://archive.ph/aBcD9E2", true},
{"o-prefix", "https://archive.ph/o/i9KU2", true},
{"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true},
{"id-with-source", "https://archive.ph/i9KU2/https://example.com", true},
{"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
u, err := url.Parse(tc.raw)
if err != nil {
t.Fatalf("parse %q: %v", tc.raw, err)
}
got := isFinalSnapshotURL(u, endpoint)
if got != tc.want {
t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want)
}
})
}
}
// --- DOM completion marker -----------------------------------------------
func TestHasCompletionMarker(t *testing.T) {
t.Run("no markers", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
if hasCompletionMarker(doc) {
t.Error("expected no completion marker on empty doc")
}
})
for _, sel := range completionSelectors {
sel := sel
t.Run("marker "+sel, func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
sel: {&extractortest.MockNode{}},
},
},
}
if !hasCompletionMarker(doc) {
t.Errorf("expected completion marker via %q", sel)
}
})
}
}
// --- Selector cascade ----------------------------------------------------
func TestFindURLInput_Cascade(t *testing.T) {
t.Run("first selector wins", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
urlInputSelectors[0]: {&extractortest.MockNode{}},
urlInputSelectors[1]: {&extractortest.MockNode{}},
},
},
}
n, sel := findURLInput(doc)
if n == nil {
t.Fatal("expected node")
}
if sel != urlInputSelectors[0] {
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0])
}
})
t.Run("falls back through cascade", func(t *testing.T) {
// Only the LAST selector matches.
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}},
},
},
}
n, sel := findURLInput(doc)
if n == nil {
t.Fatal("expected node from last fallback")
}
if sel != urlInputSelectors[len(urlInputSelectors)-1] {
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1])
}
})
t.Run("all selectors miss", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
n, sel := findURLInput(doc)
if n != nil {
t.Error("expected nil node")
}
if sel != "" {
t.Errorf("selector = %q, want empty", sel)
}
})
}
func TestFindSubmitButton_Cascade(t *testing.T) {
t.Run("first selector wins", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
submitButtonSelectors[0]: {&extractortest.MockNode{}},
},
},
}
n, sel := findSubmitButton(doc)
if n == nil {
t.Fatal("expected node")
}
if sel != submitButtonSelectors[0] {
t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0])
}
})
t.Run("falls back to button[type='submit']", func(t *testing.T) {
// Use a known later-in-list selector.
target := submitButtonSelectors[len(submitButtonSelectors)-1]
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
target: {&extractortest.MockNode{}},
},
},
}
n, sel := findSubmitButton(doc)
if n == nil {
t.Fatal("expected node from last fallback")
}
if sel != target {
t.Errorf("selector = %q, want %q", sel, target)
}
})
t.Run("all selectors miss", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
n, _ := findSubmitButton(doc)
if n != nil {
t.Error("expected nil node")
}
})
}
// --- Transient status detection -----------------------------------------
func TestIsTransientStatus(t *testing.T) {
cases := []struct {
name string
err error
want bool
}{
{"nil", nil, false},
{"plain error", errors.New("oops"), false},
{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := isTransientStatus(tc.err); got != tc.want {
t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
}
})
}
}
// --- mutDoc: a Document whose URL + Children can be swapped under load --
// mutDoc embeds MockDocument and protects URL/Children swaps with a mutex
// so the polling loop sees consistent values from another goroutine.
type mutDoc struct {
mu sync.Mutex
urlValue atomic.Value // string
children atomic.Value // map[string]extractor.Nodes
}
var _ extractor.Document = (*mutDoc)(nil)
func newMutDoc(initialURL string) *mutDoc {
d := &mutDoc{}
d.urlValue.Store(initialURL)
d.children.Store(map[string]extractor.Nodes{})
return d
}
func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) }
func (d *mutDoc) setChildren(c map[string]extractor.Nodes) {
d.mu.Lock()
defer d.mu.Unlock()
d.children.Store(c)
}
func (d *mutDoc) URL() string { return d.urlValue.Load().(string) }
func (d *mutDoc) Refresh() error { return nil }
func (d *mutDoc) Close() error { return nil }
func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil }
func (d *mutDoc) Content() (string, error) { return "", nil }
func (d *mutDoc) Text() (string, error) { return "", nil }
func (d *mutDoc) Attr(_ string) (string, error) { return "", nil }
func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil }
func (d *mutDoc) Type(_ string) error { return nil }
func (d *mutDoc) Click() error { return nil }
func (d *mutDoc) SetHidden(_ bool) error { return nil }
func (d *mutDoc) SetAttribute(_, _ string) error { return nil }
func (d *mutDoc) Select(selector string) extractor.Nodes {
c := d.children.Load().(map[string]extractor.Nodes)
return c[selector]
}
func (d *mutDoc) SelectFirst(selector string) extractor.Node {
return d.Select(selector).First()
}
func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error {
for _, n := range d.Select(selector) {
if err := fn(n); err != nil {
return err
}
}
return nil
}
// --- pollUntilArchived ---------------------------------------------------
func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
// No completion markers; URL stays on /wip/.
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
defer cancel()
err := pollUntilArchived(ctx, doc, endpoint)
if err == nil {
t.Fatal("expected error, got nil")
}
if !errors.Is(err, ErrArchiveIncomplete) {
t.Errorf("expected ErrArchiveIncomplete, got %v", err)
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Errorf("expected wrapped DeadlineExceeded, got %v", err)
}
}
func TestPollUntilArchived_CallerCancelled(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
ctx, cancel := context.WithCancel(context.Background())
// Cancel after a brief delay so the polling loop is already inside its
// select.
go func() {
time.Sleep(20 * time.Millisecond)
cancel()
}()
err := pollUntilArchived(ctx, doc, endpoint)
if err == nil {
t.Fatal("expected error, got nil")
}
if errors.Is(err, ErrArchiveIncomplete) {
t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err)
}
if !errors.Is(err, context.Canceled) {
t.Errorf("expected wrapped context.Canceled, got %v", err)
}
}
func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
// After a short delay, transition to a final URL but WITHOUT a DOM
// marker. Poll should keep waiting. Then add the marker.
go func() {
time.Sleep(40 * time.Millisecond)
doc.setURL("https://archive.ph/i9KU2")
// No marker yet — poll should still wait.
time.Sleep(60 * time.Millisecond)
doc.setChildren(map[string]extractor.Nodes{
"div#HEADER": {&extractortest.MockNode{}},
})
}()
err := pollUntilArchived(ctx, doc, endpoint)
if err != nil {
t.Fatalf("expected nil after URL+marker transition, got %v", err)
}
if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) {
t.Errorf("final URL %q does not look like a snapshot", doc.URL())
}
}
func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
// URL transitions to a final-looking path but the DOM never grows a
// completion marker. Poll should hit the deadline.
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
defer cancel()
go func() {
time.Sleep(10 * time.Millisecond)
doc.setURL("https://archive.ph/i9KU2") // looks final but no marker
}()
err := pollUntilArchived(ctx, doc, endpoint)
if !errors.Is(err, ErrArchiveIncomplete) {
t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
}
}
// --- isArchiveComplete combination ---------------------------------------
func TestIsArchiveComplete(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
cases := []struct {
name string
raw string
marker bool
want bool
}{
{"both ok", "https://archive.ph/i9KU2", true, true},
{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
{"final url no marker", "https://archive.ph/i9KU2", false, false},
{"front page with marker", "https://archive.ph/", true, false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
children := map[string]extractor.Nodes{}
if tc.marker {
children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
}
doc := &extractortest.MockDocument{
URLValue: tc.raw,
MockNode: extractortest.MockNode{Children: children},
}
got := isArchiveComplete(doc, endpoint)
if got != tc.want {
t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tc.raw, tc.marker, got, tc.want)
}
})
}
}
// --- Archive: selector cascade failure path ------------------------------
// Note: the full Archive() flow drives a live browser. We can still cover
// the "form selectors all missing" branch via a custom Browser that returns
// a mutDoc with no children — the URL/typing path doesn't run because the
// selector lookup fails first.
type emptyFormBrowser struct {
doc extractor.Document
}
func (b *emptyFormBrowser) Close() error { return nil }
func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) {
return b.doc, nil
}
func TestArchive_SelectorMissing(t *testing.T) {
doc := &extractortest.MockDocument{
URLValue: "https://archive.ph/",
MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}},
}
b := &emptyFormBrowser{doc: doc}
timeout := 200 * time.Millisecond
_, err := (Config{Timeout: &timeout}).Archive(context.Background(), b, "https://example.com")
if err == nil {
t.Fatal("expected error when form selectors are missing")
}
if !errors.Is(err, ErrArchiveSelectorMissing) {
t.Errorf("expected ErrArchiveSelectorMissing, got %v", err)
}
}
// --- helpers -------------------------------------------------------------
func mustParse(t *testing.T, raw string) *url.URL {
t.Helper()
u, err := url.Parse(raw)
if err != nil {
t.Fatalf("parse %q: %v", raw, err)
}
return u
}