Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
39453288ce | |||
7c0e44a22f | |||
0f9f6c776d | |||
62cb6958fa |
14
go.mod
14
go.mod
@ -3,19 +3,19 @@ module gitea.stevedudenhoeffer.com/steve/go-extractor
|
|||||||
go 1.23.2
|
go 1.23.2
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
|
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
|
||||||
github.com/playwright-community/playwright-go v0.4802.0
|
github.com/playwright-community/playwright-go v0.5001.0
|
||||||
github.com/urfave/cli/v3 v3.0.0-beta1
|
github.com/urfave/cli/v3 v3.0.0-beta1
|
||||||
golang.org/x/text v0.21.0
|
golang.org/x/text v0.23.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/andybalholm/cascadia v1.3.2 // indirect
|
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
|
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
|
||||||
github.com/deckarep/golang-set/v2 v2.6.0 // indirect
|
github.com/deckarep/golang-set/v2 v2.8.0 // indirect
|
||||||
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
|
github.com/go-jose/go-jose/v3 v3.0.4 // indirect
|
||||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||||
github.com/go-stack/stack v1.8.1 // indirect
|
github.com/go-stack/stack v1.8.1 // indirect
|
||||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||||
golang.org/x/net v0.32.0 // indirect
|
golang.org/x/net v0.37.0 // indirect
|
||||||
)
|
)
|
||||||
|
33
node.go
33
node.go
@ -1,6 +1,9 @@
|
|||||||
package extractor
|
package extractor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -17,6 +20,9 @@ type Node interface {
|
|||||||
SelectFirst(selector string) Node
|
SelectFirst(selector string) Node
|
||||||
|
|
||||||
ForEach(selector string, fn func(Node) error) error
|
ForEach(selector string, fn func(Node) error) error
|
||||||
|
|
||||||
|
SetHidden(val bool) error
|
||||||
|
SetAttribute(name, value string) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type node struct {
|
type node struct {
|
||||||
@ -79,3 +85,30 @@ func (n node) ForEach(selector string, fn func(Node) error) error {
|
|||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (n node) SetHidden(val bool) error {
|
||||||
|
visible, err := n.locator.IsVisible()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error checking visibility: %w", err)
|
||||||
|
}
|
||||||
|
if visible == !val {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the hidden property
|
||||||
|
_, err = n.locator.Evaluate(fmt.Sprintf(`(element) => element.hidden = %t;`, val), nil)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error setting hidden property: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// jsStringEscaper rewrites, in a single pass, every character that would
// terminate or corrupt a single-quoted JavaScript string literal.
var jsStringEscaper = strings.NewReplacer(
	`\`, `\\`,
	`'`, `\'`,
	"\n", `\n`,
	"\r", `\r`,
	"\u2028", `\u2028`,
	"\u2029", `\u2029`,
)

// escapeJavaScript returns s escaped so that it can be embedded between
// single quotes in JavaScript source.
//
// Backslashes and single quotes are backslash-escaped. Line feed, carriage
// return, U+2028 and U+2029 are replaced by their escape sequences, because a
// raw line terminator inside a quoted literal is a syntax error in JavaScript.
// All other characters are passed through unchanged.
func escapeJavaScript(s string) string {
	return jsStringEscaper.Replace(s)
}
|
||||||
|
|
||||||
|
func (n node) SetAttribute(name, value string) error {
|
||||||
|
_, err := n.locator.Evaluate(fmt.Sprintf(`(element) => element.setAttribute('%s', '%s');`, escapeJavaScript(name), escapeJavaScript(value)), nil)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
@ -36,6 +36,10 @@ const (
|
|||||||
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Size is a width/height pair. NewPlayWrightBrowser uses it, via
// PlayWrightBrowserOptions.Dimensions, as the viewport of the browser
// context; it is only applied when both fields are greater than zero.
type Size struct {
	// Width is the horizontal extent (presumably CSS pixels — TODO confirm).
	Width int
	// Height is the vertical extent (presumably CSS pixels — TODO confirm).
	Height int
}
|
||||||
type PlayWrightBrowserOptions struct {
|
type PlayWrightBrowserOptions struct {
|
||||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
||||||
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
||||||
@ -46,6 +50,9 @@ type PlayWrightBrowserOptions struct {
|
|||||||
CookieJar
|
CookieJar
|
||||||
|
|
||||||
ShowBrowser bool // If false, browser will be headless
|
ShowBrowser bool // If false, browser will be headless
|
||||||
|
|
||||||
|
Dimensions Size
|
||||||
|
DarkMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
||||||
@ -76,6 +83,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
|||||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
||||||
Browser: PlayWrightBrowserSelectionFirefox,
|
Browser: PlayWrightBrowserSelectionFirefox,
|
||||||
Timeout: &thirtySeconds,
|
Timeout: &thirtySeconds,
|
||||||
|
DarkMode: false,
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, o := range opts {
|
for _, o := range opts {
|
||||||
@ -91,6 +99,13 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
|||||||
if o.CookieJar != nil {
|
if o.CookieJar != nil {
|
||||||
opt.CookieJar = o.CookieJar
|
opt.CookieJar = o.CookieJar
|
||||||
}
|
}
|
||||||
|
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||||
|
opt.Dimensions = o.Dimensions
|
||||||
|
}
|
||||||
|
if o.DarkMode {
|
||||||
|
opt.DarkMode = true
|
||||||
|
}
|
||||||
|
|
||||||
opt.ShowBrowser = o.ShowBrowser
|
opt.ShowBrowser = o.ShowBrowser
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -133,8 +148,26 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var viewport *playwright.Size
|
||||||
|
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
||||||
|
viewport = &playwright.Size{
|
||||||
|
Width: opt.Dimensions.Width,
|
||||||
|
Height: opt.Dimensions.Height,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var scheme *playwright.ColorScheme
|
||||||
|
|
||||||
|
if opt.DarkMode {
|
||||||
|
scheme = playwright.ColorSchemeDark
|
||||||
|
} else {
|
||||||
|
scheme = playwright.ColorSchemeNoPreference
|
||||||
|
}
|
||||||
|
|
||||||
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||||
UserAgent: playwright.String(opt.UserAgent),
|
UserAgent: playwright.String(opt.UserAgent),
|
||||||
|
Viewport: viewport,
|
||||||
|
ColorScheme: scheme,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -3,11 +3,11 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/urfave/cli/v3"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
"github.com/urfave/cli/v3"
|
|
||||||
|
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
||||||
@ -58,6 +58,7 @@ func deferClose(cl io.Closer) {
|
|||||||
func main() {
|
func main() {
|
||||||
var flags []cli.Flag
|
var flags []cli.Flag
|
||||||
|
|
||||||
|
flags = append(flags, browser.Flags...)
|
||||||
flags = append(flags, Flags...)
|
flags = append(flags, Flags...)
|
||||||
|
|
||||||
cli := &cli.Command{
|
cli := &cli.Command{
|
||||||
@ -81,13 +82,24 @@ func main() {
|
|||||||
return fmt.Errorf("failed to create browser: %w", err)
|
return fmt.Errorf("failed to create browser: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
res, err := c.Search(ctx, b, query)
|
search, err := c.OpenSearch(ctx, b, query)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to search: %w", err)
|
return fmt.Errorf("failed to open search: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println(res)
|
defer deferClose(search)
|
||||||
|
|
||||||
|
res := search.GetResults()
|
||||||
|
fmt.Println("Results:", res)
|
||||||
|
|
||||||
|
err = search.LoadMore()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to load more: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
|
res = search.GetResults()
|
||||||
|
fmt.Println("Results:", res)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
|
@ -77,6 +77,21 @@ func deferClose(cl io.Closer) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
|
||||||
|
u := c.ToSearchURL(query)
|
||||||
|
|
||||||
|
slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
|
||||||
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||||
|
if err != nil {
|
||||||
|
if doc != nil {
|
||||||
|
_ = doc.Close()
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return searchPage{doc}, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
||||||
u := c.ToSearchURL(query)
|
u := c.ToSearchURL(query)
|
||||||
|
|
||||||
|
68
sites/duckduckgo/page.go
Normal file
68
sites/duckduckgo/page.go
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
package duckduckgo
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
)
|
||||||
|
|
||||||
|
type SearchPage interface {
|
||||||
|
io.Closer
|
||||||
|
GetResults() []Result
|
||||||
|
LoadMore() error
|
||||||
|
}
|
||||||
|
|
||||||
|
type searchPage struct {
|
||||||
|
doc extractor.Document
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s searchPage) GetResults() []Result {
|
||||||
|
var res []Result
|
||||||
|
var err error
|
||||||
|
|
||||||
|
err = s.doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
|
||||||
|
var r Result
|
||||||
|
|
||||||
|
links := n.Select(`a[href][target="_self"]`)
|
||||||
|
|
||||||
|
if len(links) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
r.URL, err = links[0].Attr(`href`)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to get link: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
titles := n.Select("h2")
|
||||||
|
|
||||||
|
if len(titles) != 0 {
|
||||||
|
r.Title, _ = titles[0].Text()
|
||||||
|
}
|
||||||
|
|
||||||
|
descriptions := n.Select("span > span")
|
||||||
|
|
||||||
|
if len(descriptions) != 0 {
|
||||||
|
r.Description, _ = descriptions[0].Text()
|
||||||
|
}
|
||||||
|
|
||||||
|
res = append(res, r)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s searchPage) LoadMore() error {
|
||||||
|
return s.doc.ForEach(`button#more-results`, func(n extractor.Node) error {
|
||||||
|
slog.Info("clicking load more", "node", n)
|
||||||
|
return n.Click()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s searchPage) Close() error {
|
||||||
|
return s.doc.Close()
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user