Files
go-extractor/sites/duckduckgo/page.go
T
steve 841f1ec2bf
CI / build (push) Successful in 1m0s
CI / test (push) Successful in 1m8s
CI / vet (push) Successful in 1m12s
feat(duckduckgo): detect anti-bot challenge and surface ErrBlocked
DuckDuckGo intermittently serves a CAPTCHA modal ("Unfortunately, bots
use DuckDuckGo too...") instead of search results. The result selector
matches zero elements on that page, so callers used to get
([]Result{}, nil) — silent empty results that look like "no matches."

Detect the challenge via the BEM class .anomaly-modal__title and return
a typed ErrBlocked so callers can distinguish blocked from no-results
and react (retry, fall back to another engine, surface to user).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 23:25:28 +00:00

95 lines
1.9 KiB
Go

package duckduckgo
import (
"fmt"
"io"
"log/slog"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// SearchPage is a handle on a DuckDuckGo search results page. It embeds
// io.Closer, so every implementation also exposes Close.
type SearchPage interface {
io.Closer
// GetResults returns the results parsed from the page. The searchPage
// implementation in this file delegates to extractResults.
GetResults() ([]Result, error)
// LoadMore asks the page for additional results. The searchPage
// implementation in this file does so by clicking `button#more-results`.
LoadMore() error
}
// searchPage implements SearchPage on top of an extractor.Document.
type searchPage struct {
// doc is the document that results are parsed from, that LoadMore clicks
// in, and that Close closes.
doc extractor.Document
}
// GetResults parses search results out of the page's document by delegating
// to extractResults, and returns that function's results and error unchanged
// (including ErrBlocked when the page is an anti-bot challenge).
func (s searchPage) GetResults() ([]Result, error) {
return extractResults(s.doc)
}
// extractResults reads the search results out of a DuckDuckGo results page.
//
// When isBlocked reports that the page is the anti-bot challenge
// ("Unfortunately, bots use DuckDuckGo too...") instead of results, it returns
// (nil, ErrBlocked) so callers can tell "blocked" apart from "no matches."
//
// Otherwise every `article[id^="r1-"]` node handed over by doc.ForEach is
// treated as one candidate result:
//   - an article without an `a[href][target="_self"]` link is skipped;
//   - a failure to read the link's href is returned to ForEach as a wrapped
//     error;
//   - a failure to read the title (`h2`) or description (`span > span`) is only
//     logged, and the result is still recorded.
//
// The collected results are returned together with whatever error ForEach
// reports.
func extractResults(doc extractor.Node) ([]Result, error) {
	if isBlocked(doc) {
		return nil, ErrBlocked
	}

	var results []Result

	visit := func(article extractor.Node) error {
		anchors := article.Select(`a[href][target="_self"]`)
		if len(anchors) == 0 {
			// No usable link, so there is nothing to record for this article.
			return nil
		}

		href, err := anchors[0].Attr(`href`)
		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}
		item := Result{URL: href}

		// Title and description are best-effort: a read error is logged and
		// whatever text came back is kept.
		if headings := article.Select("h2"); len(headings) != 0 {
			title, titleErr := headings[0].Text()
			if titleErr != nil {
				slog.Warn("failed to get result title", "err", titleErr)
			}
			item.Title = title
		}

		if snippets := article.Select("span > span"); len(snippets) != 0 {
			snippet, snippetErr := snippets[0].Text()
			if snippetErr != nil {
				slog.Warn("failed to get result description", "err", snippetErr)
			}
			item.Description = snippet
		}

		results = append(results, item)
		return nil
	}

	err := doc.ForEach(`article[id^="r1-"]`, visit)
	return results, err
}
// isBlocked reports whether doc contains at least one element matching
// ".anomaly-modal__title", the title of DuckDuckGo's anti-bot challenge modal,
// which is shown in place of a normal results page.
func isBlocked(doc extractor.Node) bool {
	challengeTitles := doc.Select(".anomaly-modal__title")
	return len(challengeTitles) != 0
}
// LoadMore asks the document to visit the nodes matching `button#more-results`
// and, for each node it is handed, logs the click and then clicks it. The
// error reported by the document's ForEach call is returned as-is.
func (s searchPage) LoadMore() error {
	clickMore := func(button extractor.Node) error {
		slog.Info("clicking load more", "node", button)
		return button.Click()
	}

	return s.doc.ForEach(`button#more-results`, clickMore)
}
// Close closes the underlying document and returns its error. It provides the
// Close method required by the io.Closer embedded in SearchPage.
func (s searchPage) Close() error {
return s.doc.Close()
}