Files
go-extractor/sites/duckduckgo/page.go
Steve Dudenhoeffer a9711ce904
All checks were successful
CI / vet (pull_request) Successful in 1m10s
CI / build (pull_request) Successful in 1m21s
CI / test (pull_request) Successful in 1m28s
fix: surface parsing errors instead of silently discarding them
Return errors for required fields (ID, price) and log warnings for
optional fields (title, description, unit price) across all site
extractors instead of silently discarding them with _ =.

Closes #24
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 16:31:56 +00:00

81 lines
1.5 KiB
Go

package duckduckgo
import (
"fmt"
"io"
"log/slog"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// SearchPage is the set of operations available on a DuckDuckGo search
// results page: reading the results, asking for more, and closing the page.
type SearchPage interface {
	// Close is provided by the embedded io.Closer.
	io.Closer
	// GetResults returns the results parsed from the page, or an error.
	GetResults() ([]Result, error)
	// LoadMore requests additional results on the page.
	LoadMore() error
}
// searchPage carries the GetResults, LoadMore and Close methods declared in
// this file; all three operate on the wrapped document.
type searchPage struct {
	// doc is the document that results are extracted from, that LoadMore
	// looks for its button in, and that Close closes.
	doc extractor.Document
}
// GetResults returns the search results found in the page's document. It
// delegates entirely to extractResults and passes its return values through.
func (s searchPage) GetResults() ([]Result, error) {
	return extractResults(s.doc)
}
// extractResults parses search results from a DuckDuckGo results page.
//
// The callback passed to doc.ForEach handles nodes matching
// `article[id^="r1-"]` and builds one Result per article:
//
//   - URL is required. An article with no `a[href][target="_self"]` link is
//     skipped; if the first such link's href cannot be read, the callback
//     returns a wrapped error.
//   - Title (first `h2`) and Description (first `span > span`) are optional.
//     A failure to read either is logged as a warning tagged with the
//     result's URL, and the result is still kept.
//
// The returned slice holds every result appended so far, and the returned
// error is whatever doc.ForEach reports.
func extractResults(doc extractor.Node) ([]Result, error) {
	var res []Result

	err := doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
		links := n.Select(`a[href][target="_self"]`)
		if len(links) == 0 {
			// Without a link there is nothing useful to report for this article.
			return nil
		}

		href, err := links[0].Attr(`href`)
		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}

		var r Result
		r.URL = href

		// Optional fields: warn and carry on. The URL is attached so the
		// warning can be traced back to the result that produced it.
		if titles := n.Select("h2"); len(titles) != 0 {
			if r.Title, err = titles[0].Text(); err != nil {
				slog.Warn("failed to get result title", "err", err, "url", r.URL)
			}
		}

		if descriptions := n.Select("span > span"); len(descriptions) != 0 {
			if r.Description, err = descriptions[0].Text(); err != nil {
				slog.Warn("failed to get result description", "err", err, "url", r.URL)
			}
		}

		res = append(res, r)
		return nil
	})

	return res, err
}
// LoadMore triggers the page's "more results" button. The callback handed to
// the document's ForEach for `button#more-results` logs the matched node and
// then clicks it; the error ForEach returns is passed back unchanged.
func (s searchPage) LoadMore() error {
	clickMore := func(btn extractor.Node) error {
		slog.Info("clicking load more", "node", btn)
		return btn.Click()
	}

	return s.doc.ForEach(`button#more-results`, clickMore)
}
// Close closes the underlying document and returns the error it reports.
func (s searchPage) Close() error {
	return s.doc.Close()
}