Add OpenSearch and SearchPage functionality for DuckDuckGo

Introduced the `OpenSearch` method and `SearchPage` interface to streamline search operations and allow for loading more results dynamically. Updated dependencies and modified the DuckDuckGo CLI to utilize these enhancements.
This commit is contained in:
Steve Dudenhoeffer 2025-03-18 02:42:50 -04:00
parent 7c0e44a22f
commit 39453288ce
4 changed files with 108 additions and 13 deletions

14
go.mod
View File

@ -3,19 +3,19 @@ module gitea.stevedudenhoeffer.com/steve/go-extractor
go 1.23.2 go 1.23.2
require ( require (
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
github.com/playwright-community/playwright-go v0.4802.0 github.com/playwright-community/playwright-go v0.5001.0
github.com/urfave/cli/v3 v3.0.0-beta1 github.com/urfave/cli/v3 v3.0.0-beta1
golang.org/x/text v0.21.0 golang.org/x/text v0.23.0
) )
require ( require (
github.com/andybalholm/cascadia v1.3.2 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
github.com/deckarep/golang-set/v2 v2.6.0 // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect github.com/go-jose/go-jose/v3 v3.0.4 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/go-stack/stack v1.8.1 // indirect github.com/go-stack/stack v1.8.1 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
golang.org/x/net v0.32.0 // indirect golang.org/x/net v0.37.0 // indirect
) )

View File

@ -3,11 +3,11 @@ package main
import ( import (
"context" "context"
"fmt" "fmt"
"github.com/urfave/cli/v3"
"io" "io"
"os" "os"
"strings" "strings"
"time"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo" "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
@ -58,6 +58,7 @@ func deferClose(cl io.Closer) {
func main() { func main() {
var flags []cli.Flag var flags []cli.Flag
flags = append(flags, browser.Flags...)
flags = append(flags, Flags...) flags = append(flags, Flags...)
cli := &cli.Command{ cli := &cli.Command{
@ -81,13 +82,24 @@ func main() {
return fmt.Errorf("failed to create browser: %w", err) return fmt.Errorf("failed to create browser: %w", err)
} }
res, err := c.Search(ctx, b, query) search, err := c.OpenSearch(ctx, b, query)
if err != nil { if err != nil {
return fmt.Errorf("failed to search: %w", err) return fmt.Errorf("failed to open search: %w", err)
} }
fmt.Println(res) defer deferClose(search)
res := search.GetResults()
fmt.Println("Results:", res)
err = search.LoadMore()
if err != nil {
return fmt.Errorf("failed to load more: %w", err)
}
time.Sleep(2 * time.Second)
res = search.GetResults()
fmt.Println("Results:", res)
return nil return nil
}, },

View File

@ -77,6 +77,21 @@ func deferClose(cl io.Closer) {
} }
} }
// OpenSearch opens the search URL built from query with browser b and returns
// the still-open page wrapped as a SearchPage. Because the page stays open,
// the caller gets a Close method on the returned value to release it.
//
// If opening fails, any document that was handed back alongside the error is
// closed here and a wrapped error is returned.
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
	target := c.ToSearchURL(query)

	slog.Info("searching", "url", target, "query", query, "config", c, "browser", b)

	page, openErr := b.Open(ctx, target.String(), extractor.OpenPageOptions{})
	if openErr == nil {
		return searchPage{doc: page}, nil
	}

	// Open may return a document together with an error; close it
	// best-effort, since the open error is the one worth reporting.
	if page != nil {
		_ = page.Close()
	}
	return nil, fmt.Errorf("failed to open url: %w", openErr)
}
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) { func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
u := c.ToSearchURL(query) u := c.ToSearchURL(query)

68
sites/duckduckgo/page.go Normal file
View File

@ -0,0 +1,68 @@
package duckduckgo
import (
"fmt"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"io"
"log/slog"
)
// SearchPage is a handle on an open search results page. It can be read for
// results, asked to load more of them, and closed.
type SearchPage interface {
// Close is provided by the embedded io.Closer.
io.Closer
// GetResults returns the results as a slice of Result; it reports no error.
GetResults() []Result
// LoadMore asks the page for additional results and reports any failure.
LoadMore() error
}
// searchPage wraps the extractor.Document that its methods query, click on,
// and close.
type searchPage struct {
// doc is the underlying document every method operates on.
doc extractor.Document
}
// GetResults scrapes the search results currently present in the document.
//
// Each element matching `article[id^="r1-"]` is treated as one result. An
// article with no `a[href][target="_self"]` link is skipped. The URL is the
// href of the first such link, the title is the text of the first `h2`, and
// the description is the text of the first `span > span`; title and
// description stay empty when the element is missing or cannot be read.
//
// The signature has no error return, so a failure while walking the document
// is logged rather than dropped, and the results gathered are still returned.
func (s searchPage) GetResults() []Result {
	var res []Result

	err := s.doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
		links := n.Select(`a[href][target="_self"]`)
		if len(links) == 0 {
			return nil
		}

		var r Result

		// Keep this error local to the callback so it cannot alias the
		// variable that receives ForEach's own return value.
		href, err := links[0].Attr(`href`)
		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}
		r.URL = href

		// Title and description are optional: a read failure leaves the
		// field at whatever Text returned instead of discarding the result.
		if titles := n.Select("h2"); len(titles) != 0 {
			r.Title, _ = titles[0].Text()
		}

		if descriptions := n.Select("span > span"); len(descriptions) != 0 {
			r.Description, _ = descriptions[0].Text()
		}

		res = append(res, r)
		return nil
	})
	if err != nil {
		// Surface the failure in the log; callers only receive the slice.
		slog.Warn("failed to collect search results", "error", err)
	}

	return res
}
// LoadMore logs and clicks each node the document yields for the
// `button#more-results` selector, returning whatever error ForEach reports.
//
// NOTE(review): nothing here waits for new results to render; presumably the
// caller must pause or poll before calling GetResults again — confirm.
func (s searchPage) LoadMore() error {
	clickMore := func(btn extractor.Node) error {
		slog.Info("clicking load more", "node", btn)
		return btn.Click()
	}

	return s.doc.ForEach(`button#more-results`, clickMore)
}
// Close closes the underlying document and passes its error through
// unchanged. Together with GetResults and LoadMore it makes searchPage
// usable as a SearchPage.
func (s searchPage) Close() error {
	closeErr := s.doc.Close()
	return closeErr
}