45fa7c4e8f
The archive.ph submission flow had several defects that caused Mort's summary fallback to return placeholder "Working..." pages instead of real archived content, or hang for the full timeout: - Context cancellation in the poll loop fell through to a final WaitForNetworkIdle and returned the doc as success. The function now returns a typed error (ErrArchiveIncomplete on deadline, wrapped ctx.Err() on caller cancel). - The poll only checked doc.URL() — if archive.ph's JS got wedged on /wip/<id>, the loop spun until timeout. Completion now also requires a DOM marker (#HEADER, [id^="SHARE"], .TEXT-BLOCK) so URL-only transitions don't satisfy the check. - The final URL is now validated against an alphanumeric ID pattern, rejecting /wip/, /submit, /newest/ and the front page. - 5-second blind sleep before polling replaced with a bounded WaitForNetworkIdle that short-circuits when already archived. - Form selectors now use a cascade (input[name='url'] → input[type='url'] → input.input-url → input[name='anyway'], and similar for the submit button) so a single archive.ph markup change doesn't kill the flow. Errors name which selectors were tried. - Default timeout lowered from 1 hour to 5 minutes (still overridable via context deadline). Exposed as DefaultTimeout. - Poll progress is now logged at slog.Info every 30s so production logs surface stuck flows. - Front-page 5xx now retries twice with 1s/4s backoff before failing. - New exported sentinels: ErrArchiveIncomplete, ErrArchiveSelectorMissing. - Tests cover URL validator (incl. /wip/, /newest/, short IDs, o-prefix), selector cascade, DOM completion detector, transient status classification, and ctx cancellation paths via a thread-safe mutating mock document. Full integration with a live browser remains hand-tested. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
480 lines
14 KiB
Go
480 lines
14 KiB
Go
package archive
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"net/url"
|
|
"sync"
|
|
"sync/atomic"
|
|
"testing"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
|
|
)
|
|
|
|
// --- Config validation ---------------------------------------------------
|
|
|
|
func TestConfig_Validate_Defaults(t *testing.T) {
|
|
c := Config{}.validate()
|
|
|
|
if c.Endpoint != "https://archive.ph" {
|
|
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
|
|
}
|
|
if c.Timeout == nil {
|
|
t.Fatal("Timeout should not be nil after validate")
|
|
}
|
|
if *c.Timeout != DefaultTimeout {
|
|
t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout)
|
|
}
|
|
if DefaultTimeout != 5*time.Minute {
|
|
t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute)
|
|
}
|
|
}
|
|
|
|
func TestConfig_Validate_Preserves(t *testing.T) {
|
|
timeout := 30 * time.Second
|
|
c := Config{
|
|
Endpoint: "https://archive.org",
|
|
Timeout: &timeout,
|
|
}.validate()
|
|
|
|
if c.Endpoint != "https://archive.org" {
|
|
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
|
|
}
|
|
if *c.Timeout != 30*time.Second {
|
|
t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second)
|
|
}
|
|
}
|
|
|
|
// --- URL validation ------------------------------------------------------
|
|
|
|
func TestIsFinalSnapshotURL(t *testing.T) {
|
|
endpoint, _ := url.Parse("https://archive.ph")
|
|
|
|
cases := []struct {
|
|
name string
|
|
raw string
|
|
want bool
|
|
}{
|
|
{"front-page-empty", "https://archive.ph/", false},
|
|
{"front-page-bare", "https://archive.ph", false},
|
|
{"wip", "https://archive.ph/wip/abc12", false},
|
|
{"submit-trailing", "https://archive.ph/submit/?url=foo", false},
|
|
{"submit-bare", "https://archive.ph/submit", false},
|
|
{"short-id-too-short", "https://archive.ph/ab", false},
|
|
{"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id
|
|
{"short-id-5chars", "https://archive.ph/i9KU2", true},
|
|
{"short-id-7chars", "https://archive.ph/aBcD9E2", true},
|
|
{"o-prefix", "https://archive.ph/o/i9KU2", true},
|
|
{"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true},
|
|
{"id-with-source", "https://archive.ph/i9KU2/https://example.com", true},
|
|
{"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success
|
|
}
|
|
|
|
for _, tc := range cases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
u, err := url.Parse(tc.raw)
|
|
if err != nil {
|
|
t.Fatalf("parse %q: %v", tc.raw, err)
|
|
}
|
|
got := isFinalSnapshotURL(u, endpoint)
|
|
if got != tc.want {
|
|
t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- DOM completion marker -----------------------------------------------
|
|
|
|
func TestHasCompletionMarker(t *testing.T) {
|
|
t.Run("no markers", func(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{},
|
|
},
|
|
}
|
|
if hasCompletionMarker(doc) {
|
|
t.Error("expected no completion marker on empty doc")
|
|
}
|
|
})
|
|
|
|
for _, sel := range completionSelectors {
|
|
sel := sel
|
|
t.Run("marker "+sel, func(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{
|
|
sel: {&extractortest.MockNode{}},
|
|
},
|
|
},
|
|
}
|
|
if !hasCompletionMarker(doc) {
|
|
t.Errorf("expected completion marker via %q", sel)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- Selector cascade ----------------------------------------------------
|
|
|
|
func TestFindURLInput_Cascade(t *testing.T) {
|
|
t.Run("first selector wins", func(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{
|
|
urlInputSelectors[0]: {&extractortest.MockNode{}},
|
|
urlInputSelectors[1]: {&extractortest.MockNode{}},
|
|
},
|
|
},
|
|
}
|
|
n, sel := findURLInput(doc)
|
|
if n == nil {
|
|
t.Fatal("expected node")
|
|
}
|
|
if sel != urlInputSelectors[0] {
|
|
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0])
|
|
}
|
|
})
|
|
|
|
t.Run("falls back through cascade", func(t *testing.T) {
|
|
// Only the LAST selector matches.
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{
|
|
urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}},
|
|
},
|
|
},
|
|
}
|
|
n, sel := findURLInput(doc)
|
|
if n == nil {
|
|
t.Fatal("expected node from last fallback")
|
|
}
|
|
if sel != urlInputSelectors[len(urlInputSelectors)-1] {
|
|
t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1])
|
|
}
|
|
})
|
|
|
|
t.Run("all selectors miss", func(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{},
|
|
},
|
|
}
|
|
n, sel := findURLInput(doc)
|
|
if n != nil {
|
|
t.Error("expected nil node")
|
|
}
|
|
if sel != "" {
|
|
t.Errorf("selector = %q, want empty", sel)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestFindSubmitButton_Cascade(t *testing.T) {
|
|
t.Run("first selector wins", func(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{
|
|
submitButtonSelectors[0]: {&extractortest.MockNode{}},
|
|
},
|
|
},
|
|
}
|
|
n, sel := findSubmitButton(doc)
|
|
if n == nil {
|
|
t.Fatal("expected node")
|
|
}
|
|
if sel != submitButtonSelectors[0] {
|
|
t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0])
|
|
}
|
|
})
|
|
|
|
t.Run("falls back to button[type='submit']", func(t *testing.T) {
|
|
// Use a known later-in-list selector.
|
|
target := submitButtonSelectors[len(submitButtonSelectors)-1]
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{
|
|
target: {&extractortest.MockNode{}},
|
|
},
|
|
},
|
|
}
|
|
n, sel := findSubmitButton(doc)
|
|
if n == nil {
|
|
t.Fatal("expected node from last fallback")
|
|
}
|
|
if sel != target {
|
|
t.Errorf("selector = %q, want %q", sel, target)
|
|
}
|
|
})
|
|
|
|
t.Run("all selectors miss", func(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
MockNode: extractortest.MockNode{
|
|
Children: map[string]extractor.Nodes{},
|
|
},
|
|
}
|
|
n, _ := findSubmitButton(doc)
|
|
if n != nil {
|
|
t.Error("expected nil node")
|
|
}
|
|
})
|
|
}
|
|
|
|
// --- Transient status detection -----------------------------------------
|
|
|
|
func TestIsTransientStatus(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
err error
|
|
want bool
|
|
}{
|
|
{"nil", nil, false},
|
|
{"plain error", errors.New("oops"), false},
|
|
{"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true},
|
|
{"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true},
|
|
{"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true},
|
|
{"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false},
|
|
{"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false},
|
|
{"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false},
|
|
}
|
|
for _, tc := range cases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
if got := isTransientStatus(tc.err); got != tc.want {
|
|
t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- mutDoc: a Document whose URL + Children can be swapped under load --
|
|
|
|
// mutDoc embeds MockDocument and protects URL/Children swaps with a mutex
|
|
// so the polling loop sees consistent values from another goroutine.
|
|
type mutDoc struct {
|
|
mu sync.Mutex
|
|
urlValue atomic.Value // string
|
|
children atomic.Value // map[string]extractor.Nodes
|
|
}
|
|
|
|
var _ extractor.Document = (*mutDoc)(nil)
|
|
|
|
func newMutDoc(initialURL string) *mutDoc {
|
|
d := &mutDoc{}
|
|
d.urlValue.Store(initialURL)
|
|
d.children.Store(map[string]extractor.Nodes{})
|
|
return d
|
|
}
|
|
|
|
func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) }
|
|
func (d *mutDoc) setChildren(c map[string]extractor.Nodes) {
|
|
d.mu.Lock()
|
|
defer d.mu.Unlock()
|
|
d.children.Store(c)
|
|
}
|
|
|
|
func (d *mutDoc) URL() string { return d.urlValue.Load().(string) }
|
|
func (d *mutDoc) Refresh() error { return nil }
|
|
func (d *mutDoc) Close() error { return nil }
|
|
func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil }
|
|
func (d *mutDoc) Content() (string, error) { return "", nil }
|
|
func (d *mutDoc) Text() (string, error) { return "", nil }
|
|
func (d *mutDoc) Attr(_ string) (string, error) { return "", nil }
|
|
func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil }
|
|
func (d *mutDoc) Type(_ string) error { return nil }
|
|
func (d *mutDoc) Click() error { return nil }
|
|
func (d *mutDoc) SetHidden(_ bool) error { return nil }
|
|
func (d *mutDoc) SetAttribute(_, _ string) error { return nil }
|
|
|
|
func (d *mutDoc) Select(selector string) extractor.Nodes {
|
|
c := d.children.Load().(map[string]extractor.Nodes)
|
|
return c[selector]
|
|
}
|
|
|
|
func (d *mutDoc) SelectFirst(selector string) extractor.Node {
|
|
return d.Select(selector).First()
|
|
}
|
|
|
|
func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error {
|
|
for _, n := range d.Select(selector) {
|
|
if err := fn(n); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// --- pollUntilArchived ---------------------------------------------------
|
|
|
|
func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) {
|
|
endpoint, _ := url.Parse("https://archive.ph")
|
|
doc := newMutDoc("https://archive.ph/wip/abc12")
|
|
// No completion markers; URL stays on /wip/.
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
|
defer cancel()
|
|
|
|
err := pollUntilArchived(ctx, doc, endpoint)
|
|
if err == nil {
|
|
t.Fatal("expected error, got nil")
|
|
}
|
|
if !errors.Is(err, ErrArchiveIncomplete) {
|
|
t.Errorf("expected ErrArchiveIncomplete, got %v", err)
|
|
}
|
|
if !errors.Is(err, context.DeadlineExceeded) {
|
|
t.Errorf("expected wrapped DeadlineExceeded, got %v", err)
|
|
}
|
|
}
|
|
|
|
func TestPollUntilArchived_CallerCancelled(t *testing.T) {
|
|
endpoint, _ := url.Parse("https://archive.ph")
|
|
doc := newMutDoc("https://archive.ph/wip/abc12")
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
// Cancel after a brief delay so the polling loop is already inside its
|
|
// select.
|
|
go func() {
|
|
time.Sleep(20 * time.Millisecond)
|
|
cancel()
|
|
}()
|
|
|
|
err := pollUntilArchived(ctx, doc, endpoint)
|
|
if err == nil {
|
|
t.Fatal("expected error, got nil")
|
|
}
|
|
if errors.Is(err, ErrArchiveIncomplete) {
|
|
t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err)
|
|
}
|
|
if !errors.Is(err, context.Canceled) {
|
|
t.Errorf("expected wrapped context.Canceled, got %v", err)
|
|
}
|
|
}
|
|
|
|
func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) {
|
|
endpoint, _ := url.Parse("https://archive.ph")
|
|
doc := newMutDoc("https://archive.ph/wip/abc12")
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
|
defer cancel()
|
|
|
|
// After a short delay, transition to a final URL but WITHOUT a DOM
|
|
// marker. Poll should keep waiting. Then add the marker.
|
|
go func() {
|
|
time.Sleep(40 * time.Millisecond)
|
|
doc.setURL("https://archive.ph/i9KU2")
|
|
// No marker yet — poll should still wait.
|
|
time.Sleep(60 * time.Millisecond)
|
|
doc.setChildren(map[string]extractor.Nodes{
|
|
"div#HEADER": {&extractortest.MockNode{}},
|
|
})
|
|
}()
|
|
|
|
err := pollUntilArchived(ctx, doc, endpoint)
|
|
if err != nil {
|
|
t.Fatalf("expected nil after URL+marker transition, got %v", err)
|
|
}
|
|
if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) {
|
|
t.Errorf("final URL %q does not look like a snapshot", doc.URL())
|
|
}
|
|
}
|
|
|
|
func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) {
|
|
// URL transitions to a final-looking path but the DOM never grows a
|
|
// completion marker. Poll should hit the deadline.
|
|
endpoint, _ := url.Parse("https://archive.ph")
|
|
doc := newMutDoc("https://archive.ph/wip/abc12")
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
|
defer cancel()
|
|
|
|
go func() {
|
|
time.Sleep(10 * time.Millisecond)
|
|
doc.setURL("https://archive.ph/i9KU2") // looks final but no marker
|
|
}()
|
|
|
|
err := pollUntilArchived(ctx, doc, endpoint)
|
|
if !errors.Is(err, ErrArchiveIncomplete) {
|
|
t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err)
|
|
}
|
|
}
|
|
|
|
// --- isArchiveComplete combination ---------------------------------------
|
|
|
|
func TestIsArchiveComplete(t *testing.T) {
|
|
endpoint, _ := url.Parse("https://archive.ph")
|
|
|
|
cases := []struct {
|
|
name string
|
|
raw string
|
|
marker bool
|
|
want bool
|
|
}{
|
|
{"both ok", "https://archive.ph/i9KU2", true, true},
|
|
{"wip url with marker", "https://archive.ph/wip/abc12", true, false},
|
|
{"final url no marker", "https://archive.ph/i9KU2", false, false},
|
|
{"front page with marker", "https://archive.ph/", true, false},
|
|
}
|
|
for _, tc := range cases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
children := map[string]extractor.Nodes{}
|
|
if tc.marker {
|
|
children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}}
|
|
}
|
|
doc := &extractortest.MockDocument{
|
|
URLValue: tc.raw,
|
|
MockNode: extractortest.MockNode{Children: children},
|
|
}
|
|
got := isArchiveComplete(doc, endpoint)
|
|
if got != tc.want {
|
|
t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tc.raw, tc.marker, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- Archive: selector cascade failure path ------------------------------
|
|
|
|
// Note: the full Archive() flow drives a live browser. We can still cover
|
|
// the "form selectors all missing" branch via a custom Browser that returns
|
|
// a mutDoc with no children — the URL/typing path doesn't run because the
|
|
// selector lookup fails first.
|
|
|
|
type emptyFormBrowser struct {
|
|
doc extractor.Document
|
|
}
|
|
|
|
func (b *emptyFormBrowser) Close() error { return nil }
|
|
func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) {
|
|
return b.doc, nil
|
|
}
|
|
|
|
func TestArchive_SelectorMissing(t *testing.T) {
|
|
doc := &extractortest.MockDocument{
|
|
URLValue: "https://archive.ph/",
|
|
MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}},
|
|
}
|
|
b := &emptyFormBrowser{doc: doc}
|
|
|
|
timeout := 200 * time.Millisecond
|
|
_, err := (Config{Timeout: &timeout}).Archive(context.Background(), b, "https://example.com")
|
|
if err == nil {
|
|
t.Fatal("expected error when form selectors are missing")
|
|
}
|
|
if !errors.Is(err, ErrArchiveSelectorMissing) {
|
|
t.Errorf("expected ErrArchiveSelectorMissing, got %v", err)
|
|
}
|
|
}
|
|
|
|
// --- helpers -------------------------------------------------------------
|
|
|
|
func mustParse(t *testing.T, raw string) *url.URL {
|
|
t.Helper()
|
|
u, err := url.Parse(raw)
|
|
if err != nil {
|
|
t.Fatalf("parse %q: %v", raw, err)
|
|
}
|
|
return u
|
|
}
|