diff --git a/sites/duckduckgo/duckduckgo.go b/sites/duckduckgo/duckduckgo.go
index de07d97..64ee878 100644
--- a/sites/duckduckgo/duckduckgo.go
+++ b/sites/duckduckgo/duckduckgo.go
@@ -2,6 +2,7 @@ package duckduckgo
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"log/slog"
 	"net/url"
@@ -9,6 +10,10 @@ import (
 	"gitea.stevedudenhoeffer.com/steve/go-extractor"
 )
 
+// ErrBlocked is returned when DuckDuckGo serves an anti-bot challenge
+// page instead of search results. Match it with errors.Is.
+var ErrBlocked = errors.New("duckduckgo: blocked by anti-bot challenge")
+
 type SafeSearch int
 
 const (
diff --git a/sites/duckduckgo/extract_test.go b/sites/duckduckgo/extract_test.go
index 78c6678..4fbe993 100644
--- a/sites/duckduckgo/extract_test.go
+++ b/sites/duckduckgo/extract_test.go
@@ -2,6 +2,7 @@ package duckduckgo
 
 import (
 	"context"
+	"errors"
 	"testing"
 
 	"gitea.stevedudenhoeffer.com/steve/go-extractor"
@@ -106,6 +107,26 @@ func TestExtractResults_NoLinks(t *testing.T) {
 	}
 }
 
+func TestExtractResults_Blocked(t *testing.T) {
+	doc := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				".anomaly-modal__title": {
+					&extractortest.MockNode{TextValue: "Unfortunately, bots use DuckDuckGo too."},
+				},
+			},
+		},
+	}
+
+	results, err := extractResults(doc)
+	if !errors.Is(err, ErrBlocked) {
+		t.Fatalf("expected ErrBlocked, got %v", err)
+	}
+	if results != nil {
+		t.Errorf("expected nil results when blocked, got %v", results)
+	}
+}
+
 func TestSearch_UsesMockBrowser(t *testing.T) {
 	doc := &extractortest.MockDocument{
 		URLValue: "https://duckduckgo.com/?q=test",
diff --git a/sites/duckduckgo/page.go b/sites/duckduckgo/page.go
index 88f2bae..44039b9 100644
--- a/sites/duckduckgo/page.go
+++ b/sites/duckduckgo/page.go
@@ -23,9 +23,17 @@ func (s searchPage) GetResults() ([]Result, error) {
 }
 
 // extractResults parses search results from a DuckDuckGo results page.
+//
+// If the page is an anti-bot challenge ("Unfortunately, bots use DuckDuckGo
+// too...") rather than results, it returns nil and ErrBlocked so callers can
+// distinguish "blocked" from "no matches."
 func extractResults(doc extractor.Node) ([]Result, error) {
 	var res []Result
 
+	if isBlocked(doc) {
+		return nil, ErrBlocked
+	}
+
 	err := doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
 		var r Result
 
@@ -68,6 +76,12 @@ func extractResults(doc extractor.Node) ([]Result, error) {
 	return res, err
 }
 
+// isBlocked reports whether doc contains an ".anomaly-modal__title" element,
+// which marks the DuckDuckGo anti-bot challenge modal instead of results.
+func isBlocked(doc extractor.Node) bool {
+	return len(doc.Select(".anomaly-modal__title")) > 0
+}
+
 func (s searchPage) LoadMore() error {
 	return s.doc.ForEach(`button#more-results`, func(n extractor.Node) error {
 		slog.Info("clicking load more", "node", n)