841f1ec2bf
DuckDuckGo intermittently serves a CAPTCHA modal ("Unfortunately, bots
use DuckDuckGo too...") instead of search results. The result selector
matches zero elements on that page, so callers used to get
([]Result{}, nil) — silent empty results that look like "no matches."
Detect the challenge via the BEM class .anomaly-modal__title and return
a typed ErrBlocked so callers can distinguish blocked from no-results
and react (retry, fall back to another engine, surface to user).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
95 lines
1.9 KiB
Go
95 lines
1.9 KiB
Go
package duckduckgo
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
)
|
|
|
|
type SearchPage interface {
|
|
io.Closer
|
|
GetResults() ([]Result, error)
|
|
LoadMore() error
|
|
}
|
|
|
|
type searchPage struct {
|
|
doc extractor.Document
|
|
}
|
|
|
|
func (s searchPage) GetResults() ([]Result, error) {
|
|
return extractResults(s.doc)
|
|
}
|
|
|
|
// extractResults parses search results from a DuckDuckGo results page.
|
|
//
|
|
// If the page is an anti-bot challenge ("Unfortunately, bots use DuckDuckGo
|
|
// too...") rather than results, returns ErrBlocked so callers can distinguish
|
|
// "blocked" from "no matches."
|
|
func extractResults(doc extractor.Node) ([]Result, error) {
|
|
var res []Result
|
|
|
|
if isBlocked(doc) {
|
|
return nil, ErrBlocked
|
|
}
|
|
|
|
err := doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
|
|
var r Result
|
|
|
|
links := n.Select(`a[href][target="_self"]`)
|
|
|
|
if len(links) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var err error
|
|
r.URL, err = links[0].Attr(`href`)
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get link: %w", err)
|
|
}
|
|
|
|
titles := n.Select("h2")
|
|
|
|
if len(titles) != 0 {
|
|
r.Title, err = titles[0].Text()
|
|
if err != nil {
|
|
slog.Warn("failed to get result title", "err", err)
|
|
}
|
|
}
|
|
|
|
descriptions := n.Select("span > span")
|
|
|
|
if len(descriptions) != 0 {
|
|
r.Description, err = descriptions[0].Text()
|
|
if err != nil {
|
|
slog.Warn("failed to get result description", "err", err)
|
|
}
|
|
}
|
|
|
|
res = append(res, r)
|
|
|
|
return nil
|
|
})
|
|
|
|
return res, err
|
|
}
|
|
|
|
// isBlocked reports whether the page is the DuckDuckGo anti-bot challenge
|
|
// modal (".anomaly-modal__title") rather than a normal results page.
|
|
func isBlocked(doc extractor.Node) bool {
|
|
return len(doc.Select(".anomaly-modal__title")) > 0
|
|
}
|
|
|
|
func (s searchPage) LoadMore() error {
|
|
return s.doc.ForEach(`button#more-results`, func(n extractor.Node) error {
|
|
slog.Info("clicking load more", "node", n)
|
|
return n.Click()
|
|
})
|
|
}
|
|
|
|
func (s searchPage) Close() error {
|
|
return s.doc.Close()
|
|
}
|