From 39453288ce2a1a3763b3904f834c26460004d32e Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Tue, 18 Mar 2025 02:42:50 -0400 Subject: [PATCH] Add OpenSearch and SearchPage functionality for DuckDuckGo Introduced the `OpenSearch` method and `SearchPage` interface to streamline search operations and allow for loading more results dynamically. Updated dependencies and modified the DuckDuckGo CLI to utilize these enhancements. --- go.mod | 14 ++--- sites/duckduckgo/cmd/duckduckgo/main.go | 24 ++++++--- sites/duckduckgo/duckduckgo.go | 15 ++++++ sites/duckduckgo/page.go | 68 +++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 13 deletions(-) create mode 100644 sites/duckduckgo/page.go diff --git a/go.mod b/go.mod index e2cd586..2c11dbb 100644 --- a/go.mod +++ b/go.mod @@ -3,19 +3,19 @@ module gitea.stevedudenhoeffer.com/steve/go-extractor go 1.23.2 require ( - github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f - github.com/playwright-community/playwright-go v0.4802.0 + github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 + github.com/playwright-community/playwright-go v0.5001.0 github.com/urfave/cli/v3 v3.0.0-beta1 - golang.org/x/text v0.21.0 + golang.org/x/text v0.23.0 ) require ( - github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect - github.com/deckarep/golang-set/v2 v2.6.0 // indirect - github.com/go-jose/go-jose/v3 v3.0.3 // indirect + github.com/deckarep/golang-set/v2 v2.8.0 // indirect + github.com/go-jose/go-jose/v3 v3.0.4 // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect - golang.org/x/net v0.32.0 // indirect + golang.org/x/net v0.37.0 // indirect ) diff --git a/sites/duckduckgo/cmd/duckduckgo/main.go 
b/sites/duckduckgo/cmd/duckduckgo/main.go index 5fb5d2b..196e911 100644 --- a/sites/duckduckgo/cmd/duckduckgo/main.go +++ b/sites/duckduckgo/cmd/duckduckgo/main.go @@ -3,11 +3,11 @@ package main import ( "context" "fmt" + "github.com/urfave/cli/v3" "io" "os" "strings" - - "github.com/urfave/cli/v3" + "time" "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo" @@ -58,6 +58,7 @@ func deferClose(cl io.Closer) { func main() { var flags []cli.Flag + flags = append(flags, browser.Flags...) flags = append(flags, Flags...) cli := &cli.Command{ @@ -81,13 +82,24 @@ func main() { return fmt.Errorf("failed to create browser: %w", err) } - res, err := c.Search(ctx, b, query) - + search, err := c.OpenSearch(ctx, b, query) if err != nil { - return fmt.Errorf("failed to search: %w", err) + return fmt.Errorf("failed to open search: %w", err) } - fmt.Println(res) + defer deferClose(search) + + res := search.GetResults() + fmt.Println("Results:", res) + + err = search.LoadMore() + if err != nil { + return fmt.Errorf("failed to load more: %w", err) + } + + time.Sleep(2 * time.Second) + res = search.GetResults() + fmt.Println("Results:", res) return nil }, diff --git a/sites/duckduckgo/duckduckgo.go b/sites/duckduckgo/duckduckgo.go index a9abf7d..bebe820 100644 --- a/sites/duckduckgo/duckduckgo.go +++ b/sites/duckduckgo/duckduckgo.go @@ -77,6 +77,21 @@ func deferClose(cl io.Closer) { } } +/* OpenSearch builds the search URL for query with c.ToSearchURL, opens it in browser b with default OpenPageOptions, and returns the opened document wrapped as a SearchPage. If b.Open fails, any document it still returned is closed and the error is returned wrapped. Close the returned page to close that document. */ func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) { + u := c.ToSearchURL(query) + + slog.Info("searching", "url", u, "query", query, "config", c, "browser", b) + doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{}) + if err != nil { + if doc != nil { + _ = doc.Close() /* best-effort cleanup: the Open error is the one reported */ + } + return nil, fmt.Errorf("failed to open url: %w", err) + } + + return searchPage{doc}, nil +} + func (c Config) Search(ctx context.Context, b extractor.Browser, query string) 
([]Result, error) {
 	u := c.ToSearchURL(query)
 
diff --git a/sites/duckduckgo/page.go b/sites/duckduckgo/page.go
new file mode 100644
index 0000000..8d85daf
--- /dev/null
+++ b/sites/duckduckgo/page.go
@@ -0,0 +1,68 @@
+package duckduckgo
+
+import (
+	"fmt"
+	"io"
+	"log/slog"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+)
+
+// SearchPage is an open DuckDuckGo results page; Close it when done.
+type SearchPage interface {
+	io.Closer
+	// GetResults returns the results currently on the page.
+	GetResults() []Result
+	// LoadMore asks the page to load further results.
+	LoadMore() error
+}
+
+// searchPage implements SearchPage on top of an extractor.Document.
+type searchPage struct {
+	doc extractor.Document
+}
+
+// GetResults scrapes each result article, skipping those without a link. It
+// cannot return an error, so a failed scrape is logged, not propagated.
+func (s searchPage) GetResults() []Result {
+	var res []Result
+	err := s.doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
+		links := n.Select(`a[href][target="_self"]`)
+		if len(links) == 0 {
+			return nil
+		}
+
+		href, err := links[0].Attr(`href`)
+		if err != nil {
+			return fmt.Errorf("failed to get link: %w", err)
+		}
+		r := Result{URL: href}
+
+		// Title and description are optional, so read errors are ignored.
+		if titles := n.Select("h2"); len(titles) != 0 {
+			r.Title, _ = titles[0].Text()
+		}
+		if descriptions := n.Select("span > span"); len(descriptions) != 0 {
+			r.Description, _ = descriptions[0].Text()
+		}
+		res = append(res, r)
+		return nil
+	})
+	if err != nil {
+		slog.Warn("failed to extract search results", "error", err)
+	}
+	return res
+}
+
+// LoadMore clicks each `button#more-results` element found on the page.
+func (s searchPage) LoadMore() error {
+	return s.doc.ForEach(`button#more-results`, func(n extractor.Node) error {
+		slog.Info("clicking load more", "node", n)
+		return n.Click()
+	})
+}
+
+// Close closes the underlying document.
+func (s searchPage) Close() error {
+	return s.doc.Close()
+}