Steve Dudenhoeffer
e8de488d2b
Replaced the overly complex CSS selector with a simplified "h2" selector for extracting titles. This change improves maintainability and ensures accurate title extraction from the updated DOM structure.
127 lines
2.2 KiB
Go
127 lines
2.2 KiB
Go
package duckduckgo
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/url"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
)
|
|
|
|
// SafeSearch is the safe-search level applied to a DuckDuckGo query.
// The zero value means "unset"; Config.validate replaces it with SafeSearchOff.
type SafeSearch int
|
|
|
|
const (
|
|
SafeSearchOn SafeSearch = 1
|
|
SafeSearchModerate SafeSearch = -1
|
|
SafeSearchOff SafeSearch = -2
|
|
)
|
|
|
|
type Config struct {
|
|
// SafeSearch is the safe-search level to use. If empty, SafeSearchOff will be used.
|
|
SafeSearch SafeSearch
|
|
|
|
// Region is the region to use for the search engine.
|
|
// See: https://duckduckgo.com/duckduckgo-help-pages/settings/params/ for more values
|
|
Region string
|
|
}
|
|
|
|
func (c Config) validate() Config {
|
|
if c.SafeSearch == 0 {
|
|
c.SafeSearch = SafeSearchOff
|
|
}
|
|
|
|
return c
|
|
}
|
|
func (c Config) ToSearchURL(query string) *url.URL {
|
|
c = c.validate()
|
|
|
|
res, _ := url.Parse("https://duckduckgo.com/")
|
|
|
|
var vals = res.Query()
|
|
|
|
switch c.SafeSearch {
|
|
case SafeSearchOn:
|
|
vals.Set("kp", "1")
|
|
case SafeSearchModerate:
|
|
vals.Set("kp", "-1")
|
|
case SafeSearchOff:
|
|
vals.Set("kp", "-2")
|
|
}
|
|
|
|
if c.Region != "" {
|
|
vals.Set("kl", c.Region)
|
|
}
|
|
|
|
vals.Set("q", query)
|
|
|
|
res.RawQuery = vals.Encode()
|
|
|
|
return res
|
|
}
|
|
|
|
var DefaultConfig = Config{
|
|
SafeSearch: SafeSearchOff,
|
|
}
|
|
|
|
// Result is a single search hit extracted from the results page.
type Result struct {
	// URL is the href of the result's link.
	URL string
	// Title is the text of the result's heading; empty if none was found.
	Title string
	// Description is the result's snippet text; empty if none was found.
	Description string
}
|
|
|
|
// deferClose closes cl if it is non-nil and discards any error from Close.
// It exists so callers can write `defer deferClose(x)` before they know
// whether x was successfully created.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	// Best-effort cleanup: there is nothing useful to do with a Close error here.
	_ = cl.Close()
}
|
|
|
|
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
|
u := c.ToSearchURL(query)
|
|
|
|
slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
|
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
|
defer deferClose(doc)
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to open url: %w", err)
|
|
}
|
|
|
|
var res []Result
|
|
|
|
err = doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
|
|
var r Result
|
|
|
|
links := n.Select(`a[href][target="_self"]`)
|
|
|
|
if len(links) == 0 {
|
|
return nil
|
|
}
|
|
|
|
r.URL, err = links[0].Attr(`href`)
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get link: %w", err)
|
|
}
|
|
|
|
titles := n.Select("h2")
|
|
|
|
if len(titles) != 0 {
|
|
r.Title, _ = titles[0].Text()
|
|
}
|
|
|
|
descriptions := n.Select("span > span")
|
|
|
|
if len(descriptions) != 0 {
|
|
r.Description, _ = descriptions[0].Text()
|
|
}
|
|
|
|
res = append(res, r)
|
|
|
|
return nil
|
|
})
|
|
|
|
return res, nil
|
|
}
|