Compare commits

...

4 Commits

Author SHA1 Message Date
39453288ce Add OpenSearch and SearchPage functionality for DuckDuckGo
Introduced the `OpenSearch` method and `SearchPage` interface to streamline search operations and allow for loading more results dynamically. Updated dependencies and modified the DuckDuckGo CLI to utilize these enhancements.
2025-03-18 02:42:50 -04:00
7c0e44a22f Add viewport dimensions and dark mode support
This commit introduces optional viewport dimensions and dark mode support to the PlayWrightBrowserOptions struct and its usage. It ensures more control over browser display settings and improves flexibility when configuring browser contexts. Additionally, visibility checking logic in SetHidden was refined to avoid redundant operations.
2025-03-15 00:46:02 -04:00
0f9f6c776d Rename SetVisible to SetHidden for clearer semantic meaning
The method and its implementation now align with setting an element's "hidden" property instead of "visible." This change improves code clarity and consistency with expected behavior.
2025-03-03 23:39:37 -05:00
62cb6958fa Add SetVisible and SetAttribute methods to Node interface
This commit introduces two new methods, SetVisible and SetAttribute, to the Node interface. These methods allow toggling element visibility and setting attributes dynamically. Additionally, a helper function, escapeJavaScript, was added to ensure proper escaping of JavaScript strings.
2025-03-03 23:31:51 -05:00
6 changed files with 175 additions and 14 deletions

14
go.mod
View File

@ -3,19 +3,19 @@ module gitea.stevedudenhoeffer.com/steve/go-extractor
go 1.23.2
require (
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
github.com/playwright-community/playwright-go v0.4802.0
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
github.com/playwright-community/playwright-go v0.5001.0
github.com/urfave/cli/v3 v3.0.0-beta1
golang.org/x/text v0.21.0
golang.org/x/text v0.23.0
)
require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
github.com/deckarep/golang-set/v2 v2.6.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/deckarep/golang-set/v2 v2.8.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.4 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
golang.org/x/net v0.32.0 // indirect
golang.org/x/net v0.37.0 // indirect
)

33
node.go
View File

@ -1,6 +1,9 @@
package extractor
import (
"fmt"
"strings"
"github.com/playwright-community/playwright-go"
)
@ -17,6 +20,9 @@ type Node interface {
SelectFirst(selector string) Node
ForEach(selector string, fn func(Node) error) error
SetHidden(val bool) error
SetAttribute(name, value string) error
}
type node struct {
@ -79,3 +85,30 @@ func (n node) ForEach(selector string, fn func(Node) error) error {
return nil
}
// SetHidden sets the element's `hidden` DOM property to val.
//
// The element's current visibility is queried first; when it already matches
// the requested state (visible for val == false, not visible for val == true)
// the element is left untouched and nil is returned.
func (n node) SetHidden(val bool) error {
	shown, err := n.locator.IsVisible()
	if err != nil {
		return fmt.Errorf("error checking visibility: %w", err)
	}

	// shown and val disagree exactly when the element is already in the
	// requested state, so there is nothing to change.
	if shown != val {
		return nil
	}

	script := fmt.Sprintf(`(element) => element.hidden = %t;`, val)
	if _, err = n.locator.Evaluate(script, nil); err != nil {
		return fmt.Errorf("error setting hidden property: %w", err)
	}

	return nil
}
// jsStringEscaper rewrites, in a single pass, every character that would
// terminate or corrupt a single-quoted JavaScript string literal.
var jsStringEscaper = strings.NewReplacer(
	`\`, `\\`,
	`'`, `\'`,
	"\n", `\n`,
	"\r", `\r`,
	"\u2028", `\u2028`,
	"\u2029", `\u2029`,
)

// escapeJavaScript returns s escaped so that it can be placed between single
// quotes in JavaScript source.
//
// Backslashes and single quotes are backslash-escaped. Line terminators
// (LF, CR, U+2028, U+2029) are replaced by their escape sequences, because a
// raw line break inside a quoted literal is a syntax error.
func escapeJavaScript(s string) string {
	return jsStringEscaper.Replace(s)
}
// SetAttribute sets the attribute called name to value on the element by
// calling the DOM setAttribute method in the page.
//
// It returns a wrapped error if the script evaluation fails.
func (n node) SetAttribute(name, value string) error {
	// Hand name and value to the page as a structured argument instead of
	// splicing them into the script text, so that quotes, backslashes or line
	// breaks in either string can never alter the JavaScript that is run.
	args := map[string]any{
		"name":  name,
		"value": value,
	}

	_, err := n.locator.Evaluate(`(element, args) => element.setAttribute(args.name, args.value)`, args)
	if err != nil {
		return fmt.Errorf("error setting attribute %q: %w", name, err)
	}

	return nil
}

View File

@ -36,6 +36,10 @@ const (
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
)
// Size is a width/height pair. PlayWrightBrowserOptions uses it to describe
// the requested browser viewport.
type Size struct {
	Width, Height int
}
type PlayWrightBrowserOptions struct {
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
@ -46,6 +50,9 @@ type PlayWrightBrowserOptions struct {
CookieJar
ShowBrowser bool // If false, browser will be headless
Dimensions Size
DarkMode bool
}
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
@ -76,6 +83,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
Browser: PlayWrightBrowserSelectionFirefox,
Timeout: &thirtySeconds,
DarkMode: false,
}
for _, o := range opts {
@ -91,6 +99,13 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
if o.CookieJar != nil {
opt.CookieJar = o.CookieJar
}
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
opt.Dimensions = o.Dimensions
}
if o.DarkMode {
opt.DarkMode = true
}
opt.ShowBrowser = o.ShowBrowser
}
@ -133,8 +148,26 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
return nil, err
}
var viewport *playwright.Size
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
viewport = &playwright.Size{
Width: opt.Dimensions.Width,
Height: opt.Dimensions.Height,
}
}
var scheme *playwright.ColorScheme
if opt.DarkMode {
scheme = playwright.ColorSchemeDark
} else {
scheme = playwright.ColorSchemeNoPreference
}
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
UserAgent: playwright.String(opt.UserAgent),
UserAgent: playwright.String(opt.UserAgent),
Viewport: viewport,
ColorScheme: scheme,
})
if err != nil {
return nil, err

View File

@ -3,11 +3,11 @@ package main
import (
"context"
"fmt"
"github.com/urfave/cli/v3"
"io"
"os"
"strings"
"github.com/urfave/cli/v3"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
@ -58,6 +58,7 @@ func deferClose(cl io.Closer) {
func main() {
var flags []cli.Flag
flags = append(flags, browser.Flags...)
flags = append(flags, Flags...)
cli := &cli.Command{
@ -81,13 +82,24 @@ func main() {
return fmt.Errorf("failed to create browser: %w", err)
}
res, err := c.Search(ctx, b, query)
search, err := c.OpenSearch(ctx, b, query)
if err != nil {
return fmt.Errorf("failed to search: %w", err)
return fmt.Errorf("failed to open search: %w", err)
}
fmt.Println(res)
defer deferClose(search)
res := search.GetResults()
fmt.Println("Results:", res)
err = search.LoadMore()
if err != nil {
return fmt.Errorf("failed to load more: %w", err)
}
time.Sleep(2 * time.Second)
res = search.GetResults()
fmt.Println("Results:", res)
return nil
},

View File

@ -77,6 +77,21 @@ func deferClose(cl io.Closer) {
}
}
// OpenSearch opens the search URL built from query in browser b and returns
// the loaded page wrapped as a SearchPage.
//
// If opening fails, any document that was returned alongside the error is
// closed before the wrapped error is returned.
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
	target := c.ToSearchURL(query)

	slog.Info("searching", "url", target, "query", query, "config", c, "browser", b)

	page, err := b.Open(ctx, target.String(), extractor.OpenPageOptions{})
	if err == nil {
		return searchPage{doc: page}, nil
	}

	// Best-effort cleanup; the open error is the one worth reporting.
	if page != nil {
		_ = page.Close()
	}

	return nil, fmt.Errorf("failed to open url: %w", err)
}
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
u := c.ToSearchURL(query)

68
sites/duckduckgo/page.go Normal file
View File

@ -0,0 +1,68 @@
package duckduckgo
import (
"fmt"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"io"
"log/slog"
)
// SearchPage is an open search results page that can be read repeatedly and
// asked to load further results. It must be closed when no longer needed.
type SearchPage interface {
	// GetResults returns the results found on the page.
	GetResults() []Result

	// LoadMore asks the page to load additional results.
	LoadMore() error

	// Close releases the page.
	io.Closer
}
// searchPage implements SearchPage on top of an opened document.
type searchPage struct {
	// doc is the document that results are read from and that Close closes.
	doc extractor.Document
}
// GetResults scrapes the search results currently present in the document.
//
// Every `article[id^="r1-"]` element is inspected: the first
// `a[href][target="_self"]` link supplies the URL, the first `h2` the title and
// the first `span > span` the description. Articles without such a link are
// skipped.
//
// The method has no error return, so a failure while walking the document is
// logged and whatever results were collected up to that point are returned.
func (s searchPage) GetResults() []Result {
	var res []Result

	err := s.doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
		links := n.Select(`a[href][target="_self"]`)
		if len(links) == 0 {
			return nil
		}

		// Use a callback-local error so the outer ForEach result is never
		// clobbered from inside the closure.
		href, err := links[0].Attr(`href`)
		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}

		var r Result
		r.URL = href

		// Title and description are optional: a result with a URL is still
		// useful, so text extraction errors are deliberately ignored.
		if titles := n.Select("h2"); len(titles) != 0 {
			r.Title, _ = titles[0].Text()
		}

		if descriptions := n.Select("span > span"); len(descriptions) != 0 {
			r.Description, _ = descriptions[0].Text()
		}

		res = append(res, r)

		return nil
	})
	if err != nil {
		// Surface the failure instead of dropping it silently.
		slog.Warn("failed to collect search results", "error", err)
	}

	return res
}
// LoadMore clicks each `button#more-results` element found in the document,
// logging before every click. It returns the error reported by the document's
// ForEach, if any.
func (s searchPage) LoadMore() error {
	clickButton := func(btn extractor.Node) error {
		slog.Info("clicking load more", "node", btn)
		return btn.Click()
	}

	return s.doc.ForEach(`button#more-results`, clickButton)
}
// Close closes the underlying document and returns its error, if any.
func (s searchPage) Close() error {
	err := s.doc.Close()
	return err
}