fix(archive): harden archive.ph submit/poll flow
The archive.ph submission flow had several defects that caused Mort's summary fallback to return placeholder "Working..." pages instead of real archived content, or hang for the full timeout: - Context cancellation in the poll loop fell through to a final WaitForNetworkIdle and returned the doc as success. The function now returns a typed error (ErrArchiveIncomplete on deadline, wrapped ctx.Err() on caller cancel). - The poll only checked doc.URL() — if archive.ph's JS got wedged on /wip/<id>, the loop spun until timeout. Completion now also requires a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only transitions don't satisfy the check. - The final URL is now validated against an alphanumeric ID pattern, rejecting /wip/, /submit, /newest/ and the front page. - 5-second blind sleep before polling replaced with a bounded WaitForNetworkIdle that short-circuits when already archived. - Form selectors now use a cascade (input[name='url'] → input[type='url'] → input.input-url → input[name='anyway'], and similar for the submit button) so a single archive.ph markup change doesn't kill the flow. Errors name which selectors were tried. - Default timeout lowered from 1 hour to 5 minutes (still overridable via context deadline). Exposed as DefaultTimeout. - Poll progress is now logged at slog.Info every 30s so production logs surface stuck flows. - Front-page 5xx now retries twice with 1s/4s backoff before failing. - New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing. - Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix), selector cascade, DOM completion detector, transient status classification, and ctx cancellation paths via a thread-safe mutating mock document. Full integration with a live browser remains hand-tested. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,13 +1,23 @@
|
||||
package archive
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
|
||||
)
|
||||
|
||||
// --- Config validation ---------------------------------------------------
|
||||
|
||||
func TestConfig_Validate_Defaults(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
c := Config{}.validate()
|
||||
|
||||
if c.Endpoint != "https://archive.ph" {
|
||||
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
|
||||
@@ -15,23 +25,455 @@ func TestConfig_Validate_Defaults(t *testing.T) {
|
||||
if c.Timeout == nil {
|
||||
t.Fatal("Timeout should not be nil after validate")
|
||||
}
|
||||
if *c.Timeout != 1*time.Hour {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour)
|
||||
if *c.Timeout != DefaultTimeout {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout)
|
||||
}
|
||||
if DefaultTimeout != 5*time.Minute {
|
||||
t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate_Preserves(t *testing.T) {
|
||||
timeout := 5 * time.Minute
|
||||
timeout := 30 * time.Second
|
||||
c := Config{
|
||||
Endpoint: "https://archive.org",
|
||||
Timeout: &timeout,
|
||||
}
|
||||
c = c.validate()
|
||||
}.validate()
|
||||
|
||||
if c.Endpoint != "https://archive.org" {
|
||||
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
|
||||
}
|
||||
if *c.Timeout != 5*time.Minute {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute)
|
||||
if *c.Timeout != 30*time.Second {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// --- URL validation ------------------------------------------------------
|
||||
|
||||
func TestIsFinalSnapshotURL(t *testing.T) {
|
||||
endpoint, _ := url.Parse("https://archive.ph")
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
want bool
|
||||
}{
|
||||
{"front-page-empty", "https://archive.ph/", false},
|
||||
{"front-page-bare", "https://archive.ph", false},
|
||||
{"wip", "https://archive.ph/wip/abc12", false},
|
||||
{"submit-trailing", "https://archive.ph/submit/?url=foo", false},
|
||||
{"submit-bare", "https://archive.ph/submit", false},
|
||||
{"short-id-too-short", "https://archive.ph/ab", false},
|
||||
{"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id
|
||||
{"short-id-5chars", "https://archive.ph/i9KU2", true},
|
||||
{"short-id-7chars", "https://archive.ph/aBcD9E2", true},
|
||||
{"o-prefix", "https://archive.ph/o/i9KU2", true},
|
||||
{"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true},
|
||||
{"id-with-source", "https://archive.ph/i9KU2/https://example.com", true},
|
||||
{"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
u, err := url.Parse(tc.raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse %q: %v", tc.raw, err)
|
||||
}
|
||||
got := isFinalSnapshotURL(u, endpoint)
|
||||
if got != tc.want {
|
||||
t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- DOM completion marker -----------------------------------------------
|
||||
|
||||
func TestHasCompletionMarker(t *testing.T) {
|
||||
t.Run("no markers", func(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{},
|
||||
},
|
||||
}
|
||||
if hasCompletionMarker(doc) {
|
||||
t.Error("expected no completion marker on empty doc")
|
||||
}
|
||||
})
|
||||
|
||||
for _, sel := range completionSelectors {
|
||||
sel := sel
|
||||
t.Run("marker "+sel, func(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
sel: {&extractortest.MockNode{}},
|
||||
},
|
||||
},
|
||||
}
|
||||
if !hasCompletionMarker(doc) {
|
||||
t.Errorf("expected completion marker via %q", sel)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- Selector cascade ----------------------------------------------------
|
||||
|
||||
func TestFindURLInput_Cascade(t *testing.T) {
|
||||
t.Run("first selector wins", func(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
urlInputSelectors[0]: {&extractortest.MockNode{}},
|
||||
urlInputSelectors[1]: {&extractortest.MockNode{}},
|
||||
},
|
||||
},
|
||||
}
|
||||
n, sel := findURLInput(doc)
|
||||
if n == nil {
|
||||
t.Fatal("expected node")
|
||||
}
|
||||
if sel != urlInputSelectors[0] {
|
||||
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("falls back through cascade", func(t *testing.T) {
|
||||
// Only the LAST selector matches.
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}},
|
||||
},
|
||||
},
|
||||
}
|
||||
n, sel := findURLInput(doc)
|
||||
if n == nil {
|
||||
t.Fatal("expected node from last fallback")
|
||||
}
|
||||
if sel != urlInputSelectors[len(urlInputSelectors)-1] {
|
||||
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("all selectors miss", func(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{},
|
||||
},
|
||||
}
|
||||
n, sel := findURLInput(doc)
|
||||
if n != nil {
|
||||
t.Error("expected nil node")
|
||||
}
|
||||
if sel != "" {
|
||||
t.Errorf("selector = %q, want empty", sel)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestFindSubmitButton_Cascade(t *testing.T) {
|
||||
t.Run("first selector wins", func(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
submitButtonSelectors[0]: {&extractortest.MockNode{}},
|
||||
},
|
||||
},
|
||||
}
|
||||
n, sel := findSubmitButton(doc)
|
||||
if n == nil {
|
||||
t.Fatal("expected node")
|
||||
}
|
||||
if sel != submitButtonSelectors[0] {
|
||||
t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("falls back to button[type='submit']", func(t *testing.T) {
|
||||
// Use a known later-in-list selector.
|
||||
target := submitButtonSelectors[len(submitButtonSelectors)-1]
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
target: {&extractortest.MockNode{}},
|
||||
},
|
||||
},
|
||||
}
|
||||
n, sel := findSubmitButton(doc)
|
||||
if n == nil {
|
||||
t.Fatal("expected node from last fallback")
|
||||
}
|
||||
if sel != target {
|
||||
t.Errorf("selector = %q, want %q", sel, target)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("all selectors miss", func(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{},
|
||||
},
|
||||
}
|
||||
n, _ := findSubmitButton(doc)
|
||||
if n != nil {
|
||||
t.Error("expected nil node")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// --- Transient status detection -----------------------------------------
|
||||
|
||||
func TestIsTransientStatus(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{"nil", nil, false},
|
||||
{"plain error", errors.New("oops"), false},
|
||||
{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
|
||||
{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
|
||||
{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
|
||||
{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
|
||||
{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
|
||||
{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := isTransientStatus(tc.err); got != tc.want {
|
||||
t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- mutDoc: a Document whose URL + Children can be swapped under load --
|
||||
|
||||
// mutDoc embeds MockDocument and protects URL/Children swaps with a mutex
|
||||
// so the polling loop sees consistent values from another goroutine.
|
||||
type mutDoc struct {
|
||||
mu sync.Mutex
|
||||
urlValue atomic.Value // string
|
||||
children atomic.Value // map[string]extractor.Nodes
|
||||
}
|
||||
|
||||
var _ extractor.Document = (*mutDoc)(nil)
|
||||
|
||||
func newMutDoc(initialURL string) *mutDoc {
|
||||
d := &mutDoc{}
|
||||
d.urlValue.Store(initialURL)
|
||||
d.children.Store(map[string]extractor.Nodes{})
|
||||
return d
|
||||
}
|
||||
|
||||
func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) }
|
||||
func (d *mutDoc) setChildren(c map[string]extractor.Nodes) {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
d.children.Store(c)
|
||||
}
|
||||
|
||||
func (d *mutDoc) URL() string { return d.urlValue.Load().(string) }
|
||||
func (d *mutDoc) Refresh() error { return nil }
|
||||
func (d *mutDoc) Close() error { return nil }
|
||||
func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil }
|
||||
func (d *mutDoc) Content() (string, error) { return "", nil }
|
||||
func (d *mutDoc) Text() (string, error) { return "", nil }
|
||||
func (d *mutDoc) Attr(_ string) (string, error) { return "", nil }
|
||||
func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil }
|
||||
func (d *mutDoc) Type(_ string) error { return nil }
|
||||
func (d *mutDoc) Click() error { return nil }
|
||||
func (d *mutDoc) SetHidden(_ bool) error { return nil }
|
||||
func (d *mutDoc) SetAttribute(_, _ string) error { return nil }
|
||||
|
||||
func (d *mutDoc) Select(selector string) extractor.Nodes {
|
||||
c := d.children.Load().(map[string]extractor.Nodes)
|
||||
return c[selector]
|
||||
}
|
||||
|
||||
func (d *mutDoc) SelectFirst(selector string) extractor.Node {
|
||||
return d.Select(selector).First()
|
||||
}
|
||||
|
||||
func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error {
|
||||
for _, n := range d.Select(selector) {
|
||||
if err := fn(n); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- pollUntilArchived ---------------------------------------------------
|
||||
|
||||
func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) {
|
||||
endpoint, _ := url.Parse("https://archive.ph")
|
||||
doc := newMutDoc("https://archive.ph/wip/abc12")
|
||||
// No completion markers; URL stays on /wip/.
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
err := pollUntilArchived(ctx, doc, endpoint)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if !errors.Is(err, ErrArchiveIncomplete) {
|
||||
t.Errorf("expected ErrArchiveIncomplete, got %v", err)
|
||||
}
|
||||
if !errors.Is(err, context.DeadlineExceeded) {
|
||||
t.Errorf("expected wrapped DeadlineExceeded, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPollUntilArchived_CallerCancelled(t *testing.T) {
|
||||
endpoint, _ := url.Parse("https://archive.ph")
|
||||
doc := newMutDoc("https://archive.ph/wip/abc12")
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
// Cancel after a brief delay so the polling loop is already inside its
|
||||
// select.
|
||||
go func() {
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
err := pollUntilArchived(ctx, doc, endpoint)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if errors.Is(err, ErrArchiveIncomplete) {
|
||||
t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err)
|
||||
}
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Errorf("expected wrapped context.Canceled, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) {
|
||||
endpoint, _ := url.Parse("https://archive.ph")
|
||||
doc := newMutDoc("https://archive.ph/wip/abc12")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// After a short delay, transition to a final URL but WITHOUT a DOM
|
||||
// marker. Poll should keep waiting. Then add the marker.
|
||||
go func() {
|
||||
time.Sleep(40 * time.Millisecond)
|
||||
doc.setURL("https://archive.ph/i9KU2")
|
||||
// No marker yet — poll should still wait.
|
||||
time.Sleep(60 * time.Millisecond)
|
||||
doc.setChildren(map[string]extractor.Nodes{
|
||||
"div#HEADER": {&extractortest.MockNode{}},
|
||||
})
|
||||
}()
|
||||
|
||||
err := pollUntilArchived(ctx, doc, endpoint)
|
||||
if err != nil {
|
||||
t.Fatalf("expected nil after URL+marker transition, got %v", err)
|
||||
}
|
||||
if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) {
|
||||
t.Errorf("final URL %q does not look like a snapshot", doc.URL())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
|
||||
// URL transitions to a final-looking path but the DOM never grows a
|
||||
// completion marker. Poll should hit the deadline.
|
||||
endpoint, _ := url.Parse("https://archive.ph")
|
||||
doc := newMutDoc("https://archive.ph/wip/abc12")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
go func() {
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
doc.setURL("https://archive.ph/i9KU2") // looks final but no marker
|
||||
}()
|
||||
|
||||
err := pollUntilArchived(ctx, doc, endpoint)
|
||||
if !errors.Is(err, ErrArchiveIncomplete) {
|
||||
t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- isArchiveComplete combination ---------------------------------------
|
||||
|
||||
func TestIsArchiveComplete(t *testing.T) {
|
||||
endpoint, _ := url.Parse("https://archive.ph")
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
marker bool
|
||||
want bool
|
||||
}{
|
||||
{"both ok", "https://archive.ph/i9KU2", true, true},
|
||||
{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
|
||||
{"final url no marker", "https://archive.ph/i9KU2", false, false},
|
||||
{"front page with marker", "https://archive.ph/", true, false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
children := map[string]extractor.Nodes{}
|
||||
if tc.marker {
|
||||
children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
|
||||
}
|
||||
doc := &extractortest.MockDocument{
|
||||
URLValue: tc.raw,
|
||||
MockNode: extractortest.MockNode{Children: children},
|
||||
}
|
||||
got := isArchiveComplete(doc, endpoint)
|
||||
if got != tc.want {
|
||||
t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tc.raw, tc.marker, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- Archive: selector cascade failure path ------------------------------
|
||||
|
||||
// Note: the full Archive() flow drives a live browser. We can still cover
|
||||
// the "form selectors all missing" branch via a custom Browser that returns
|
||||
// a mutDoc with no children — the URL/typing path doesn't run because the
|
||||
// selector lookup fails first.
|
||||
|
||||
type emptyFormBrowser struct {
|
||||
doc extractor.Document
|
||||
}
|
||||
|
||||
func (b *emptyFormBrowser) Close() error { return nil }
|
||||
func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) {
|
||||
return b.doc, nil
|
||||
}
|
||||
|
||||
func TestArchive_SelectorMissing(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
URLValue: "https://archive.ph/",
|
||||
MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}},
|
||||
}
|
||||
b := &emptyFormBrowser{doc: doc}
|
||||
|
||||
timeout := 200 * time.Millisecond
|
||||
_, err := (Config{Timeout: &timeout}).Archive(context.Background(), b, "https://example.com")
|
||||
if err == nil {
|
||||
t.Fatal("expected error when form selectors are missing")
|
||||
}
|
||||
if !errors.Is(err, ErrArchiveSelectorMissing) {
|
||||
t.Errorf("expected ErrArchiveSelectorMissing, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- helpers -------------------------------------------------------------
|
||||
|
||||
func mustParse(t *testing.T, raw string) *url.URL {
|
||||
t.Helper()
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse %q: %v", raw, err)
|
||||
}
|
||||
return u
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user