fix(archive): harden archive.ph submit/poll flow
CI / build (pull_request) Successful in 1m5s
CI / vet (pull_request) Successful in 1m26s
CI / test (pull_request) Successful in 1m27s

The archive.ph submission flow had several defects that caused Mort's
summary fallback to return placeholder "Working..." pages instead of
real archived content, or hang for the full timeout:

- Context cancellation in the poll loop fell through to a final
  WaitForNetworkIdle and returned the doc as success. The function now
  returns a typed error (ErrArchiveIncomplete on deadline, wrapped
  ctx.Err() on caller cancel).
- The poll only checked doc.URL() — if archive.ph's JS got wedged on
  /wip/<id>, the loop spun until timeout. Completion now also requires
  a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only
  transitions don't satisfy the check.
- The final URL is now validated against an alphanumeric ID pattern,
  rejecting /wip/, /submit, /newest/ and the front page.
- 5-second blind sleep before polling replaced with a bounded
  WaitForNetworkIdle that short-circuits when already archived.
- Form selectors now use a cascade (input[name='url'] →
  input[type='url'] → input.input-url → input[name='anyway'], and
  similar for the submit button) so a single archive.ph markup change
  doesn't kill the flow. Errors name which selectors were tried.
- Default timeout lowered from 1 hour to 5 minutes (still overridable
  via context deadline). Exposed as DefaultTimeout.
- Poll progress is now logged at slog.Info every 30s so production logs
  surface stuck flows.
- Front-page 5xx now retries twice with 1s/4s backoff before failing.
- New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing.
- Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix),
  selector cascade, DOM completion detector, transient status
  classification, and ctx cancellation paths via a thread-safe mutating
  mock document. Full integration with a live browser remains hand-tested.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-15 17:23:24 -04:00
parent 3b38637e56
commit 45fa7c4e8f
2 changed files with 840 additions and 78 deletions
+451 -9
View File
@@ -1,13 +1,23 @@
package archive
import (
"context"
"errors"
"fmt"
"net/url"
"sync"
"sync/atomic"
"testing"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
)
// --- Config validation ---------------------------------------------------
func TestConfig_Validate_Defaults(t *testing.T) {
c := Config{}
c = c.validate()
c := Config{}.validate()
if c.Endpoint != "https://archive.ph" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
@@ -15,23 +25,455 @@ func TestConfig_Validate_Defaults(t *testing.T) {
if c.Timeout == nil {
t.Fatal("Timeout should not be nil after validate")
}
if *c.Timeout != 1*time.Hour {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour)
if *c.Timeout != DefaultTimeout {
t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout)
}
if DefaultTimeout != 5*time.Minute {
t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute)
}
}
func TestConfig_Validate_Preserves(t *testing.T) {
timeout := 5 * time.Minute
timeout := 30 * time.Second
c := Config{
Endpoint: "https://archive.org",
Timeout: &timeout,
}
c = c.validate()
}.validate()
if c.Endpoint != "https://archive.org" {
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
}
if *c.Timeout != 5*time.Minute {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute)
if *c.Timeout != 30*time.Second {
t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second)
}
}
// --- URL validation ------------------------------------------------------
func TestIsFinalSnapshotURL(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
cases := []struct {
name string
raw string
want bool
}{
{"front-page-empty", "https://archive.ph/", false},
{"front-page-bare", "https://archive.ph", false},
{"wip", "https://archive.ph/wip/abc12", false},
{"submit-trailing", "https://archive.ph/submit/?url=foo", false},
{"submit-bare", "https://archive.ph/submit", false},
{"short-id-too-short", "https://archive.ph/ab", false},
{"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id
{"short-id-5chars", "https://archive.ph/i9KU2", true},
{"short-id-7chars", "https://archive.ph/aBcD9E2", true},
{"o-prefix", "https://archive.ph/o/i9KU2", true},
{"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true},
{"id-with-source", "https://archive.ph/i9KU2/https://example.com", true},
{"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
u, err := url.Parse(tc.raw)
if err != nil {
t.Fatalf("parse %q: %v", tc.raw, err)
}
got := isFinalSnapshotURL(u, endpoint)
if got != tc.want {
t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want)
}
})
}
}
// --- DOM completion marker -----------------------------------------------
func TestHasCompletionMarker(t *testing.T) {
t.Run("no markers", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
if hasCompletionMarker(doc) {
t.Error("expected no completion marker on empty doc")
}
})
for _, sel := range completionSelectors {
sel := sel
t.Run("marker "+sel, func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
sel: {&extractortest.MockNode{}},
},
},
}
if !hasCompletionMarker(doc) {
t.Errorf("expected completion marker via %q", sel)
}
})
}
}
// --- Selector cascade ----------------------------------------------------
func TestFindURLInput_Cascade(t *testing.T) {
t.Run("first selector wins", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
urlInputSelectors[0]: {&extractortest.MockNode{}},
urlInputSelectors[1]: {&extractortest.MockNode{}},
},
},
}
n, sel := findURLInput(doc)
if n == nil {
t.Fatal("expected node")
}
if sel != urlInputSelectors[0] {
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0])
}
})
t.Run("falls back through cascade", func(t *testing.T) {
// Only the LAST selector matches.
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}},
},
},
}
n, sel := findURLInput(doc)
if n == nil {
t.Fatal("expected node from last fallback")
}
if sel != urlInputSelectors[len(urlInputSelectors)-1] {
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1])
}
})
t.Run("all selectors miss", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
n, sel := findURLInput(doc)
if n != nil {
t.Error("expected nil node")
}
if sel != "" {
t.Errorf("selector = %q, want empty", sel)
}
})
}
func TestFindSubmitButton_Cascade(t *testing.T) {
t.Run("first selector wins", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
submitButtonSelectors[0]: {&extractortest.MockNode{}},
},
},
}
n, sel := findSubmitButton(doc)
if n == nil {
t.Fatal("expected node")
}
if sel != submitButtonSelectors[0] {
t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0])
}
})
t.Run("falls back to button[type='submit']", func(t *testing.T) {
// Use a known later-in-list selector.
target := submitButtonSelectors[len(submitButtonSelectors)-1]
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
target: {&extractortest.MockNode{}},
},
},
}
n, sel := findSubmitButton(doc)
if n == nil {
t.Fatal("expected node from last fallback")
}
if sel != target {
t.Errorf("selector = %q, want %q", sel, target)
}
})
t.Run("all selectors miss", func(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
n, _ := findSubmitButton(doc)
if n != nil {
t.Error("expected nil node")
}
})
}
// --- Transient status detection -----------------------------------------
func TestIsTransientStatus(t *testing.T) {
cases := []struct {
name string
err error
want bool
}{
{"nil", nil, false},
{"plain error", errors.New("oops"), false},
{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := isTransientStatus(tc.err); got != tc.want {
t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
}
})
}
}
// --- mutDoc: a Document whose URL + Children can be swapped under load --
// mutDoc embeds MockDocument and protects URL/Children swaps with a mutex
// so the polling loop sees consistent values from another goroutine.
type mutDoc struct {
mu sync.Mutex
urlValue atomic.Value // string
children atomic.Value // map[string]extractor.Nodes
}
var _ extractor.Document = (*mutDoc)(nil)
func newMutDoc(initialURL string) *mutDoc {
d := &mutDoc{}
d.urlValue.Store(initialURL)
d.children.Store(map[string]extractor.Nodes{})
return d
}
func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) }
func (d *mutDoc) setChildren(c map[string]extractor.Nodes) {
d.mu.Lock()
defer d.mu.Unlock()
d.children.Store(c)
}
func (d *mutDoc) URL() string { return d.urlValue.Load().(string) }
func (d *mutDoc) Refresh() error { return nil }
func (d *mutDoc) Close() error { return nil }
func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil }
func (d *mutDoc) Content() (string, error) { return "", nil }
func (d *mutDoc) Text() (string, error) { return "", nil }
func (d *mutDoc) Attr(_ string) (string, error) { return "", nil }
func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil }
func (d *mutDoc) Type(_ string) error { return nil }
func (d *mutDoc) Click() error { return nil }
func (d *mutDoc) SetHidden(_ bool) error { return nil }
func (d *mutDoc) SetAttribute(_, _ string) error { return nil }
func (d *mutDoc) Select(selector string) extractor.Nodes {
c := d.children.Load().(map[string]extractor.Nodes)
return c[selector]
}
func (d *mutDoc) SelectFirst(selector string) extractor.Node {
return d.Select(selector).First()
}
func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error {
for _, n := range d.Select(selector) {
if err := fn(n); err != nil {
return err
}
}
return nil
}
// --- pollUntilArchived ---------------------------------------------------
func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
// No completion markers; URL stays on /wip/.
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
defer cancel()
err := pollUntilArchived(ctx, doc, endpoint)
if err == nil {
t.Fatal("expected error, got nil")
}
if !errors.Is(err, ErrArchiveIncomplete) {
t.Errorf("expected ErrArchiveIncomplete, got %v", err)
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Errorf("expected wrapped DeadlineExceeded, got %v", err)
}
}
func TestPollUntilArchived_CallerCancelled(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
ctx, cancel := context.WithCancel(context.Background())
// Cancel after a brief delay so the polling loop is already inside its
// select.
go func() {
time.Sleep(20 * time.Millisecond)
cancel()
}()
err := pollUntilArchived(ctx, doc, endpoint)
if err == nil {
t.Fatal("expected error, got nil")
}
if errors.Is(err, ErrArchiveIncomplete) {
t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err)
}
if !errors.Is(err, context.Canceled) {
t.Errorf("expected wrapped context.Canceled, got %v", err)
}
}
func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
// After a short delay, transition to a final URL but WITHOUT a DOM
// marker. Poll should keep waiting. Then add the marker.
go func() {
time.Sleep(40 * time.Millisecond)
doc.setURL("https://archive.ph/i9KU2")
// No marker yet — poll should still wait.
time.Sleep(60 * time.Millisecond)
doc.setChildren(map[string]extractor.Nodes{
"div#HEADER": {&extractortest.MockNode{}},
})
}()
err := pollUntilArchived(ctx, doc, endpoint)
if err != nil {
t.Fatalf("expected nil after URL+marker transition, got %v", err)
}
if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) {
t.Errorf("final URL %q does not look like a snapshot", doc.URL())
}
}
func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
// URL transitions to a final-looking path but the DOM never grows a
// completion marker. Poll should hit the deadline.
endpoint, _ := url.Parse("https://archive.ph")
doc := newMutDoc("https://archive.ph/wip/abc12")
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
defer cancel()
go func() {
time.Sleep(10 * time.Millisecond)
doc.setURL("https://archive.ph/i9KU2") // looks final but no marker
}()
err := pollUntilArchived(ctx, doc, endpoint)
if !errors.Is(err, ErrArchiveIncomplete) {
t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
}
}
// --- isArchiveComplete combination ---------------------------------------
func TestIsArchiveComplete(t *testing.T) {
endpoint, _ := url.Parse("https://archive.ph")
cases := []struct {
name string
raw string
marker bool
want bool
}{
{"both ok", "https://archive.ph/i9KU2", true, true},
{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
{"final url no marker", "https://archive.ph/i9KU2", false, false},
{"front page with marker", "https://archive.ph/", true, false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
children := map[string]extractor.Nodes{}
if tc.marker {
children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
}
doc := &extractortest.MockDocument{
URLValue: tc.raw,
MockNode: extractortest.MockNode{Children: children},
}
got := isArchiveComplete(doc, endpoint)
if got != tc.want {
t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tc.raw, tc.marker, got, tc.want)
}
})
}
}
// --- Archive: selector cascade failure path ------------------------------
// Note: the full Archive() flow drives a live browser. We can still cover
// the "form selectors all missing" branch via a custom Browser that returns
// a mutDoc with no children — the URL/typing path doesn't run because the
// selector lookup fails first.
type emptyFormBrowser struct {
doc extractor.Document
}
func (b *emptyFormBrowser) Close() error { return nil }
func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) {
return b.doc, nil
}
func TestArchive_SelectorMissing(t *testing.T) {
doc := &extractortest.MockDocument{
URLValue: "https://archive.ph/",
MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}},
}
b := &emptyFormBrowser{doc: doc}
timeout := 200 * time.Millisecond
_, err := (Config{Timeout: &timeout}).Archive(context.Background(), b, "https://example.com")
if err == nil {
t.Fatal("expected error when form selectors are missing")
}
if !errors.Is(err, ErrArchiveSelectorMissing) {
t.Errorf("expected ErrArchiveSelectorMissing, got %v", err)
}
}
// --- helpers -------------------------------------------------------------
func mustParse(t *testing.T, raw string) *url.URL {
t.Helper()
u, err := url.Parse(raw)
if err != nil {
t.Fatalf("parse %q: %v", raw, err)
}
return u
}