package archive import ( "context" "errors" "fmt" "net/url" "sync" "sync/atomic" "testing" "time" "gitea.stevedudenhoeffer.com/steve/go-extractor" "gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest" ) // --- Config validation --------------------------------------------------- func TestConfig_Validate_Defaults(t *testing.T) { c := Config{}.validate() if c.Endpoint != "https://archive.ph" { t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph") } if c.Timeout == nil { t.Fatal("Timeout should not be nil after validate") } if *c.Timeout != DefaultTimeout { t.Errorf("Timeout = %v, want %v", *c.Timeout, DefaultTimeout) } if DefaultTimeout != 5*time.Minute { t.Errorf("DefaultTimeout = %v, want %v", DefaultTimeout, 5*time.Minute) } } func TestConfig_Validate_Preserves(t *testing.T) { timeout := 30 * time.Second c := Config{ Endpoint: "https://archive.org", Timeout: &timeout, }.validate() if c.Endpoint != "https://archive.org" { t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org") } if *c.Timeout != 30*time.Second { t.Errorf("Timeout = %v, want %v", *c.Timeout, 30*time.Second) } } // --- URL validation ------------------------------------------------------ func TestIsFinalSnapshotURL(t *testing.T) { endpoint, _ := url.Parse("https://archive.ph") cases := []struct { name string raw string want bool }{ {"front-page-empty", "https://archive.ph/", false}, {"front-page-bare", "https://archive.ph", false}, {"wip", "https://archive.ph/wip/abc12", false}, {"submit-trailing", "https://archive.ph/submit/?url=foo", false}, {"submit-bare", "https://archive.ph/submit", false}, {"short-id-too-short", "https://archive.ph/ab", false}, {"newest-redirect-target", "https://archive.ph/newest/https://example.com", false}, // path starts with /newest/ → no leading id {"short-id-5chars", "https://archive.ph/i9KU2", true}, {"short-id-7chars", "https://archive.ph/aBcD9E2", true}, {"o-prefix", "https://archive.ph/o/i9KU2", true}, {"o-prefix-with-source", "https://archive.ph/o/i9KU2/https://example.com", true}, {"id-with-source", "https://archive.ph/i9KU2/https://example.com", true}, {"foreign-host", "https://example.com/i9KU2", true}, // off-host but resolved somewhere — treat as success } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { u, err := url.Parse(tc.raw) if err != nil { t.Fatalf("parse %q: %v", tc.raw, err) } got := isFinalSnapshotURL(u, endpoint) if got != tc.want { t.Errorf("isFinalSnapshotURL(%q) = %v, want %v", tc.raw, got, tc.want) } }) } } // --- DOM completion marker ----------------------------------------------- func TestHasCompletionMarker(t *testing.T) { t.Run("no markers", func(t *testing.T) { doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{}, }, } if hasCompletionMarker(doc) { t.Error("expected no completion marker on empty doc") } }) for _, sel := range completionSelectors { sel := sel t.Run("marker "+sel, func(t *testing.T) { doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{ sel: {&extractortest.MockNode{}}, }, }, } if !hasCompletionMarker(doc) { t.Errorf("expected completion marker via %q", sel) } }) } } // --- Selector cascade ---------------------------------------------------- func TestFindURLInput_Cascade(t *testing.T) { t.Run("first selector wins", func(t *testing.T) { doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{ urlInputSelectors[0]: {&extractortest.MockNode{}}, urlInputSelectors[1]: {&extractortest.MockNode{}}, }, }, } n, sel := findURLInput(doc) if n == nil { t.Fatal("expected node") } if sel != urlInputSelectors[0] { t.Errorf("selector = %q, want %q", sel, urlInputSelectors[0]) } }) t.Run("falls back through cascade", func(t *testing.T) { // Only the LAST selector matches. doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{ urlInputSelectors[len(urlInputSelectors)-1]: {&extractortest.MockNode{}}, }, }, } n, sel := findURLInput(doc) if n == nil { t.Fatal("expected node from last fallback") } if sel != urlInputSelectors[len(urlInputSelectors)-1] { t.Errorf("selector = %q, want %q", sel, urlInputSelectors[len(urlInputSelectors)-1]) } }) t.Run("all selectors miss", func(t *testing.T) { doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{}, }, } n, sel := findURLInput(doc) if n != nil { t.Error("expected nil node") } if sel != "" { t.Errorf("selector = %q, want empty", sel) } }) } func TestFindSubmitButton_Cascade(t *testing.T) { t.Run("first selector wins", func(t *testing.T) { doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{ submitButtonSelectors[0]: {&extractortest.MockNode{}}, }, }, } n, sel := findSubmitButton(doc) if n == nil { t.Fatal("expected node") } if sel != submitButtonSelectors[0] { t.Errorf("selector = %q, want %q", sel, submitButtonSelectors[0]) } }) t.Run("falls back to button[type='submit']", func(t *testing.T) { // Use a known later-in-list selector. target := submitButtonSelectors[len(submitButtonSelectors)-1] doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{ target: {&extractortest.MockNode{}}, }, }, } n, sel := findSubmitButton(doc) if n == nil { t.Fatal("expected node from last fallback") } if sel != target { t.Errorf("selector = %q, want %q", sel, target) } }) t.Run("all selectors miss", func(t *testing.T) { doc := &extractortest.MockDocument{ MockNode: extractortest.MockNode{ Children: map[string]extractor.Nodes{}, }, } n, _ := findSubmitButton(doc) if n != nil { t.Error("expected nil node") } }) } // --- Transient status detection ----------------------------------------- func TestIsTransientStatus(t *testing.T) { cases := []struct { name string err error want bool }{ {"nil", nil, false}, {"plain error", errors.New("oops"), false}, {"500", fmt.Errorf("%w: 500", extractor.ErrInvalidStatusCode), true}, {"502", fmt.Errorf("%w: 502", extractor.ErrInvalidStatusCode), true}, {"503", fmt.Errorf("%w: 503", extractor.ErrInvalidStatusCode), true}, {"403", fmt.Errorf("%w: 403", extractor.ErrInvalidStatusCode), false}, {"404", fmt.Errorf("%w: 404", extractor.ErrInvalidStatusCode), false}, {"401", fmt.Errorf("%w: 401", extractor.ErrInvalidStatusCode), false}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { if got := isTransientStatus(tc.err); got != tc.want { t.Errorf("isTransientStatus(%v) = %v, want %v", tc.err, got, tc.want) } }) } } // --- mutDoc: a Document whose URL + Children can be swapped under load -- // mutDoc embeds MockDocument and protects URL/Children swaps with a mutex // so the polling loop sees consistent values from another goroutine. type mutDoc struct { mu sync.Mutex urlValue atomic.Value // string children atomic.Value // map[string]extractor.Nodes } var _ extractor.Document = (*mutDoc)(nil) func newMutDoc(initialURL string) *mutDoc { d := &mutDoc{} d.urlValue.Store(initialURL) d.children.Store(map[string]extractor.Nodes{}) return d } func (d *mutDoc) setURL(u string) { d.urlValue.Store(u) } func (d *mutDoc) setChildren(c map[string]extractor.Nodes) { d.mu.Lock() defer d.mu.Unlock() d.children.Store(c) } func (d *mutDoc) URL() string { return d.urlValue.Load().(string) } func (d *mutDoc) Refresh() error { return nil } func (d *mutDoc) Close() error { return nil } func (d *mutDoc) WaitForNetworkIdle(_ *time.Duration) error { return nil } func (d *mutDoc) Content() (string, error) { return "", nil } func (d *mutDoc) Text() (string, error) { return "", nil } func (d *mutDoc) Attr(_ string) (string, error) { return "", nil } func (d *mutDoc) Screenshot() ([]byte, error) { return nil, nil } func (d *mutDoc) Type(_ string) error { return nil } func (d *mutDoc) Click() error { return nil } func (d *mutDoc) SetHidden(_ bool) error { return nil } func (d *mutDoc) SetAttribute(_, _ string) error { return nil } func (d *mutDoc) Select(selector string) extractor.Nodes { c := d.children.Load().(map[string]extractor.Nodes) return c[selector] } func (d *mutDoc) SelectFirst(selector string) extractor.Node { return d.Select(selector).First() } func (d *mutDoc) ForEach(selector string, fn func(extractor.Node) error) error { for _, n := range d.Select(selector) { if err := fn(n); err != nil { return err } } return nil } // --- pollUntilArchived --------------------------------------------------- func TestPollUntilArchived_ContextCancelled_NeverCompletes(t *testing.T) { endpoint, _ := url.Parse("https://archive.ph") doc := newMutDoc("https://archive.ph/wip/abc12") // No completion markers; URL stays on /wip/. ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) defer cancel() err := pollUntilArchived(ctx, doc, endpoint) if err == nil { t.Fatal("expected error, got nil") } if !errors.Is(err, ErrArchiveIncomplete) { t.Errorf("expected ErrArchiveIncomplete, got %v", err) } if !errors.Is(err, context.DeadlineExceeded) { t.Errorf("expected wrapped DeadlineExceeded, got %v", err) } } func TestPollUntilArchived_CallerCancelled(t *testing.T) { endpoint, _ := url.Parse("https://archive.ph") doc := newMutDoc("https://archive.ph/wip/abc12") ctx, cancel := context.WithCancel(context.Background()) // Cancel after a brief delay so the polling loop is already inside its // select. go func() { time.Sleep(20 * time.Millisecond) cancel() }() err := pollUntilArchived(ctx, doc, endpoint) if err == nil { t.Fatal("expected error, got nil") } if errors.Is(err, ErrArchiveIncomplete) { t.Errorf("non-deadline cancellation should NOT be ErrArchiveIncomplete, got %v", err) } if !errors.Is(err, context.Canceled) { t.Errorf("expected wrapped context.Canceled, got %v", err) } } func TestPollUntilArchived_SuccessRequiresBothURLAndMarker(t *testing.T) { endpoint, _ := url.Parse("https://archive.ph") doc := newMutDoc("https://archive.ph/wip/abc12") ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() // After a short delay, transition to a final URL but WITHOUT a DOM // marker. Poll should keep waiting. Then add the marker. go func() { time.Sleep(40 * time.Millisecond) doc.setURL("https://archive.ph/i9KU2") // No marker yet — poll should still wait. time.Sleep(60 * time.Millisecond) doc.setChildren(map[string]extractor.Nodes{ "div#HEADER": {&extractortest.MockNode{}}, }) }() err := pollUntilArchived(ctx, doc, endpoint) if err != nil { t.Fatalf("expected nil after URL+marker transition, got %v", err) } if !isFinalSnapshotURL(mustParse(t, doc.URL()), endpoint) { t.Errorf("final URL %q does not look like a snapshot", doc.URL()) } } func TestPollUntilArchived_URLOnly_NotEnough(t *testing.T) { // URL transitions to a final-looking path but the DOM never grows a // completion marker. Poll should hit the deadline. endpoint, _ := url.Parse("https://archive.ph") doc := newMutDoc("https://archive.ph/wip/abc12") ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) defer cancel() go func() { time.Sleep(10 * time.Millisecond) doc.setURL("https://archive.ph/i9KU2") // looks final but no marker }() err := pollUntilArchived(ctx, doc, endpoint) if !errors.Is(err, ErrArchiveIncomplete) { t.Errorf("expected ErrArchiveIncomplete when URL transitions but no marker; got %v", err) } } // --- isArchiveComplete combination --------------------------------------- func TestIsArchiveComplete(t *testing.T) { endpoint, _ := url.Parse("https://archive.ph") cases := []struct { name string raw string marker bool want bool }{ {"both ok", "https://archive.ph/i9KU2", true, true}, {"wip url with marker", "https://archive.ph/wip/abc12", true, false}, {"final url no marker", "https://archive.ph/i9KU2", false, false}, {"front page with marker", "https://archive.ph/", true, false}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { children := map[string]extractor.Nodes{} if tc.marker { children["div#HEADER"] = extractor.Nodes{&extractortest.MockNode{}} } doc := &extractortest.MockDocument{ URLValue: tc.raw, MockNode: extractortest.MockNode{Children: children}, } got := isArchiveComplete(doc, endpoint) if got != tc.want { t.Errorf("isArchiveComplete(%q, marker=%v) = %v, want %v", tc.raw, tc.marker, got, tc.want) } }) } } // --- Archive: selector cascade failure path ------------------------------ // Note: the full Archive() flow drives a live browser. We can still cover // the "form selectors all missing" branch via a custom Browser that returns // a mutDoc with no children — the URL/typing path doesn't run because the // selector lookup fails first. type emptyFormBrowser struct { doc extractor.Document } func (b *emptyFormBrowser) Close() error { return nil } func (b *emptyFormBrowser) Open(_ context.Context, _ string, _ extractor.OpenPageOptions) (extractor.Document, error) { return b.doc, nil } func TestArchive_SelectorMissing(t *testing.T) { doc := &extractortest.MockDocument{ URLValue: "https://archive.ph/", MockNode: extractortest.MockNode{Children: map[string]extractor.Nodes{}}, } b := &emptyFormBrowser{doc: doc} timeout := 200 * time.Millisecond _, err := (Config{Timeout: &timeout}).Archive(context.Background(), b, "https://example.com") if err == nil { t.Fatal("expected error when form selectors are missing") } if !errors.Is(err, ErrArchiveSelectorMissing) { t.Errorf("expected ErrArchiveSelectorMissing, got %v", err) } } // --- helpers ------------------------------------------------------------- func mustParse(t *testing.T, raw string) *url.URL { t.Helper() u, err := url.Parse(raw) if err != nil { t.Fatalf("parse %q: %v", raw, err) } return u }