Compare commits

..

No commits in common. "main" and "eggwatch" have entirely different histories.

10 changed files with 16 additions and 533 deletions

16
go.mod
View File

@ -3,19 +3,19 @@ module gitea.stevedudenhoeffer.com/steve/go-extractor
go 1.23.2 go 1.23.2
require ( require (
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
github.com/playwright-community/playwright-go v0.5001.0 github.com/playwright-community/playwright-go v0.4802.0
github.com/urfave/cli/v3 v3.0.0-beta1
golang.org/x/text v0.23.0
) )
require ( require (
github.com/andybalholm/cascadia v1.3.3 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
github.com/deckarep/golang-set/v2 v2.8.0 // indirect github.com/deckarep/golang-set/v2 v2.6.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.4 // indirect github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/go-stack/stack v1.8.1 // indirect github.com/go-stack/stack v1.8.1 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
golang.org/x/net v0.37.0 // indirect github.com/urfave/cli/v3 v3.0.0-beta1 // indirect
golang.org/x/net v0.32.0 // indirect
golang.org/x/text v0.21.0 // indirect
) )

33
node.go
View File

@ -1,9 +1,6 @@
package extractor package extractor
import ( import (
"fmt"
"strings"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
@ -20,9 +17,6 @@ type Node interface {
SelectFirst(selector string) Node SelectFirst(selector string) Node
ForEach(selector string, fn func(Node) error) error ForEach(selector string, fn func(Node) error) error
SetHidden(val bool) error
SetAttribute(name, value string) error
} }
type node struct { type node struct {
@ -85,30 +79,3 @@ func (n node) ForEach(selector string, fn func(Node) error) error {
return nil return nil
} }
// SetHidden sets the element's `hidden` property to val.
//
// The locator's current visibility is checked first: when the element is
// visible and val is false, or not visible and val is true, nothing is
// evaluated and nil is returned.
func (n node) SetHidden(val bool) error {
	shown, err := n.locator.IsVisible()
	if err != nil {
		return fmt.Errorf("error checking visibility: %w", err)
	}

	// Visibility already agrees with the requested state; nothing to do.
	if shown != val {
		return nil
	}

	script := fmt.Sprintf(`(element) => element.hidden = %t;`, val)
	if _, err := n.locator.Evaluate(script, nil); err != nil {
		return fmt.Errorf("error setting hidden property: %w", err)
	}

	return nil
}
// jsStringEscaper rewrites the characters that would terminate or corrupt a
// single-quoted JavaScript string literal.
var jsStringEscaper = strings.NewReplacer(
	`\`, `\\`,
	`'`, `\'`,
	"\n", `\n`,
	"\r", `\r`,
	"\u2028", `\u2028`,
	"\u2029", `\u2029`,
)

// escapeJavaScript returns s escaped for embedding between single quotes in a
// JavaScript string literal.
//
// Backslashes and single quotes are backslash-escaped. Line terminators are
// replaced by their escape sequences, because a raw line break inside a quoted
// literal is a syntax error. The replacement is done in a single pass, so
// backslashes introduced by one rule are never escaped again by another.
func escapeJavaScript(s string) string {
	return jsStringEscaper.Replace(s)
}
// SetAttribute sets the attribute called name to value on the element.
//
// name and value are handed to the page as an evaluation argument rather than
// being spliced into the script text, so no quoting or escaping is needed and
// arbitrary content (quotes, backslashes, line breaks) cannot alter the
// script. Any evaluation error is returned wrapped.
func (n node) SetAttribute(name, value string) error {
	args := map[string]any{"name": name, "value": value}

	_, err := n.locator.Evaluate(`(element, {name, value}) => element.setAttribute(name, value)`, args)
	if err != nil {
		return fmt.Errorf("error setting attribute %q: %w", name, err)
	}

	return nil
}

View File

@ -36,10 +36,6 @@ const (
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit" PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
) )
// Size is a width/height pair.
type Size struct {
	Width, Height int
}
type PlayWrightBrowserOptions struct { type PlayWrightBrowserOptions struct {
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0" UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
Browser PlayWrightBrowserSelection // If unset defaults to Firefox. Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
@ -50,9 +46,6 @@ type PlayWrightBrowserOptions struct {
CookieJar CookieJar
ShowBrowser bool // If false, browser will be headless ShowBrowser bool // If false, browser will be headless
Dimensions Size
DarkMode bool
} }
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie { func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
@ -83,7 +76,6 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
Browser: PlayWrightBrowserSelectionFirefox, Browser: PlayWrightBrowserSelectionFirefox,
Timeout: &thirtySeconds, Timeout: &thirtySeconds,
DarkMode: false,
} }
for _, o := range opts { for _, o := range opts {
@ -99,13 +91,6 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
if o.CookieJar != nil { if o.CookieJar != nil {
opt.CookieJar = o.CookieJar opt.CookieJar = o.CookieJar
} }
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
opt.Dimensions = o.Dimensions
}
if o.DarkMode {
opt.DarkMode = true
}
opt.ShowBrowser = o.ShowBrowser opt.ShowBrowser = o.ShowBrowser
} }
@ -148,26 +133,8 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
return nil, err return nil, err
} }
var viewport *playwright.Size
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
viewport = &playwright.Size{
Width: opt.Dimensions.Width,
Height: opt.Dimensions.Height,
}
}
var scheme *playwright.ColorScheme
if opt.DarkMode {
scheme = playwright.ColorSchemeDark
} else {
scheme = playwright.ColorSchemeNoPreference
}
c, err := browser.NewContext(playwright.BrowserNewContextOptions{ c, err := browser.NewContext(playwright.BrowserNewContextOptions{
UserAgent: playwright.String(opt.UserAgent), UserAgent: playwright.String(opt.UserAgent),
Viewport: viewport,
ColorScheme: scheme,
}) })
if err != nil { if err != nil {
return nil, err return nil, err

View File

@ -1,81 +0,0 @@
package aislegopher
import (
"context"
"errors"
"fmt"
"io"
"net/url"
"strconv"
"strings"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Config carries the settings for the aislegopher extractor. It currently has
// no fields.
type Config struct{}

// DefaultConfig is the configuration used by the package-level helpers.
var DefaultConfig = Config{}

// ErrInvalidURL is returned when a URL is not a recognised aislegopher.com
// product URL.
var ErrInvalidURL = errors.New("invalid url")

// Item is a product extracted from an aislegopher.com product page.
type Item struct {
	ID    int
	Name  string
	Price float64
}
// deferClose closes cl and discards the returned error. A nil cl is ignored,
// which makes the function safe to defer before the value has been checked.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// GetItemFromURL extracts the item at u using DefaultConfig. It is a
// convenience wrapper around Config.GetItemFromURL.
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
	cfg := DefaultConfig
	return cfg.GetItemFromURL(ctx, b, u)
}
// GetItemFromURL opens an aislegopher.com product page with b and extracts the
// item's ID, name and price.
//
// u must have the form [www.]aislegopher.com/p/<slug>/<id> with an integer
// <id>. A nil u, a different host, a different path shape or a non-numeric id
// yields ErrInvalidURL before any page is opened. A failure to open the page
// is returned wrapped. On any error the returned Item is the zero value.
//
// Name and Price are best-effort: when the expected element is missing or its
// text cannot be read or parsed, the field is left at its zero value.
func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
	if u == nil {
		return Item{}, ErrInvalidURL
	}

	if u.Host != "aislegopher.com" && u.Host != "www.aislegopher.com" {
		return Item{}, ErrInvalidURL
	}

	// The path is /p/<slug>/<id>; the leading slash produces an empty first
	// element, hence exactly four parts.
	parts := strings.Split(u.Path, "/")
	if len(parts) != 4 || parts[1] != "p" {
		return Item{}, ErrInvalidURL
	}

	id, err := strconv.Atoi(parts[3])
	if err != nil {
		return Item{}, ErrInvalidURL
	}

	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
	// Deferred before the error check so a document handed back together with
	// an error is still closed; deferClose ignores a nil document.
	defer deferClose(doc)
	if err != nil {
		return Item{}, fmt.Errorf("failed to open page: %w", err)
	}

	res := Item{ID: id}

	if names := doc.Select("h2.h4"); len(names) > 0 {
		// Best-effort: an unreadable name leaves Name empty.
		res.Name, _ = names[0].Text()
	}

	if prices := doc.Select("h4.h2"); len(prices) > 0 {
		// Best-effort: an unreadable or unparsable price leaves Price at 0.
		priceStr, _ := prices[0].Text()
		// Drop the currency symbol and thousands separators ("$1,299.00").
		priceStr = strings.NewReplacer("$", "", ",", "").Replace(priceStr)
		res.Price, _ = strconv.ParseFloat(strings.TrimSpace(priceStr), 64)
	}

	return res, nil
}

View File

@ -1,77 +0,0 @@
package main
import (
"context"
"fmt"
"io"
"net/url"
"os"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
"github.com/urfave/cli/v3"
)
// AisleGopherFlags is the list of command-line flags specific to the
// aislegopher command.
type AisleGopherFlags []cli.Flag

// Flags holds the aislegopher-specific flags; it is currently empty.
var Flags = AisleGopherFlags{}

// ToConfig builds the aislegopher configuration for a command invocation. The
// command is not consulted yet, so the result is always
// aislegopher.DefaultConfig.
func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
	return aislegopher.DefaultConfig
}
// deferClose closes cl, ignoring any error from Close. Passing nil is a no-op.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// main runs the aislegopher command: it combines the shared browser flags with
// the aislegopher flags, fetches the item at the URL given as the first
// argument and prints it.
//
// Errors are written to standard error and the process exits with status 1,
// so user mistakes such as a missing URL produce a message, not a stack trace.
func main() {
	var flags []cli.Flag

	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	// Named app rather than cli so the cli package is not shadowed.
	app := &cli.Command{
		Name:  "aislegopher",
		Usage: "AisleGopher is a tool for extracting data from aislegopher.com",
		Flags: flags,
		Action: func(ctx context.Context, c *cli.Command) error {
			cfg := Flags.ToConfig(c)

			b, err := browser.FromCommand(ctx, c)
			if err != nil {
				return fmt.Errorf("failed to create browser: %w", err)
			}
			defer deferClose(b)

			arg := c.Args().First()
			if arg == "" {
				return fmt.Errorf("url is required")
			}

			u, err := url.Parse(arg)
			if err != nil {
				return fmt.Errorf("failed to parse url: %w", err)
			}

			data, err := cfg.GetItemFromURL(ctx, b, u)
			if err != nil {
				return fmt.Errorf("failed to get item from url: %w", err)
			}

			fmt.Printf("Item: %+v\n", data)

			return nil
		},
	}

	// The action's deferred cleanup has already run by the time Run returns,
	// so exiting here does not skip closing the browser.
	if err := app.Run(context.Background(), os.Args); err != nil {
		fmt.Fprintln(os.Stderr, "aislegopher:", err)
		os.Exit(1)
	}
}

View File

@ -3,13 +3,14 @@ package main
import ( import (
"context" "context"
"fmt" "fmt"
"github.com/urfave/cli/v3"
"io" "io"
"os" "os"
"strings" "strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo" "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
) )
@ -58,7 +59,6 @@ func deferClose(cl io.Closer) {
func main() { func main() {
var flags []cli.Flag var flags []cli.Flag
flags = append(flags, browser.Flags...)
flags = append(flags, Flags...) flags = append(flags, Flags...)
cli := &cli.Command{ cli := &cli.Command{
@ -82,24 +82,13 @@ func main() {
return fmt.Errorf("failed to create browser: %w", err) return fmt.Errorf("failed to create browser: %w", err)
} }
search, err := c.OpenSearch(ctx, b, query) res, err := c.Search(ctx, b, query)
if err != nil { if err != nil {
return fmt.Errorf("failed to open search: %w", err) return fmt.Errorf("failed to search: %w", err)
} }
defer deferClose(search) fmt.Println(res)
res := search.GetResults()
fmt.Println("Results:", res)
err = search.LoadMore()
if err != nil {
return fmt.Errorf("failed to load more: %w", err)
}
time.Sleep(2 * time.Second)
res = search.GetResults()
fmt.Println("Results:", res)
return nil return nil
}, },

View File

@ -77,21 +77,6 @@ func deferClose(cl io.Closer) {
} }
} }
// OpenSearch opens the search page for query in b and returns it as a
// SearchPage that the caller is responsible for closing.
//
// If opening fails, any document that was returned alongside the error is
// closed and the error is returned wrapped.
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
	target := c.ToSearchURL(query)

	slog.Info("searching", "url", target, "query", query, "config", c, "browser", b)

	page, openErr := b.Open(ctx, target.String(), extractor.OpenPageOptions{})
	if openErr == nil {
		return searchPage{page}, nil
	}

	// Do not leak a document handed back together with the error.
	if page != nil {
		_ = page.Close()
	}

	return nil, fmt.Errorf("failed to open url: %w", openErr)
}
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) { func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
u := c.ToSearchURL(query) u := c.ToSearchURL(query)

View File

@ -1,68 +0,0 @@
package duckduckgo
import (
"fmt"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"io"
"log/slog"
)
type (
	// SearchPage is an open search page. It can report the results it
	// holds, be asked to load more, and must be closed when no longer needed.
	SearchPage interface {
		io.Closer
		GetResults() []Result
		LoadMore() error
	}

	// searchPage is the SearchPage implementation backed by an
	// extractor.Document.
	searchPage struct {
		doc extractor.Document
	}
)
// GetResults collects the results currently present in the document.
//
// Each `article[id^="r1-"]` element contributes one Result: the URL comes from
// its first `a[href][target="_self"]` link, the title from its first `h2` and
// the description from its first `span > span`. Articles without such a link
// are skipped; a missing or unreadable title or description is left empty.
//
// The signature has no error return, so a failure while walking the document
// is logged and the results gathered up to that point are returned.
func (s searchPage) GetResults() []Result {
	var res []Result

	walkErr := s.doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
		links := n.Select(`a[href][target="_self"]`)
		if len(links) == 0 {
			return nil
		}

		// err is local to the callback so it cannot clobber outer state.
		href, err := links[0].Attr(`href`)
		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}

		r := Result{URL: href}

		if titles := n.Select("h2"); len(titles) != 0 {
			// Best-effort: an unreadable title stays empty.
			r.Title, _ = titles[0].Text()
		}

		if descriptions := n.Select("span > span"); len(descriptions) != 0 {
			// Best-effort: an unreadable description stays empty.
			r.Description, _ = descriptions[0].Text()
		}

		res = append(res, r)

		return nil
	})
	if walkErr != nil {
		slog.Warn("failed to collect search results", "error", walkErr, "collected", len(res))
	}

	return res
}
// LoadMore clicks every `button#more-results` element in the document, logging
// each click, and returns the error reported by the document's ForEach.
func (s searchPage) LoadMore() error {
	clickMore := func(btn extractor.Node) error {
		slog.Info("clicking load more", "node", btn)
		return btn.Click()
	}

	return s.doc.ForEach(`button#more-results`, clickMore)
}
// Close closes the underlying document and returns its error.
func (s searchPage) Close() error {
	doc := s.doc
	return doc.Close()
}

View File

@ -1,81 +0,0 @@
package main
import (
"context"
"fmt"
"io"
"net/url"
"os"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/wegmans"
)
// deferClose closes cl and drops the error from Close. A nil cl is skipped, so
// the function may be deferred before cl has been validated.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// WegmansFlags is the list of command-line flags specific to the wegmans
// command.
type WegmansFlags []cli.Flag

// Flags holds the wegmans-specific flags; it is currently empty.
var Flags = WegmansFlags{}

// ToConfig builds the wegmans configuration for a command invocation. The
// command is not consulted yet, so the result is always wegmans.DefaultConfig.
func (f WegmansFlags) ToConfig(_ *cli.Command) wegmans.Config {
	return wegmans.DefaultConfig
}
// main runs the wegmans command: it combines the shared browser flags with the
// wegmans flags, looks up the item at the URL given as the first argument and
// prints it.
//
// Errors are written to standard error and the process exits with status 1,
// so user mistakes such as a missing URL produce a message, not a stack trace.
func main() {
	var flags []cli.Flag

	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	app := &cli.Command{
		Name:  "wegmans",
		Usage: "Search Wegmans",
		Flags: flags,
		Action: func(ctx context.Context, cmd *cli.Command) error {
			cfg := Flags.ToConfig(cmd)

			b, err := browser.FromCommand(ctx, cmd)
			// Deferred before the error check so a browser returned together
			// with an error is still closed; deferClose ignores nil.
			defer deferClose(b)
			if err != nil {
				return fmt.Errorf("error creating browser: %w", err)
			}

			arg := cmd.Args().First()
			if arg == "" {
				return fmt.Errorf("url is required")
			}

			u, err := url.Parse(arg)
			if err != nil {
				return fmt.Errorf("failed to parse url: %w", err)
			}

			item, err := cfg.GetItemPrice(ctx, b, u)
			if err != nil {
				return fmt.Errorf("failed to get item price: %w", err)
			}

			fmt.Println(item)

			return nil
		},
	}

	// The action's deferred cleanup has already run by the time Run returns,
	// so exiting here does not skip closing the browser.
	if err := app.Run(context.Background(), os.Args); err != nil {
		fmt.Fprintln(os.Stderr, "wegmans:", err)
		os.Exit(1)
	}
}

View File

@ -1,118 +0,0 @@
package wegmans
import (
"context"
"errors"
"io"
"net/url"
"strconv"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Config carries the settings for the wegmans extractor. It currently has no
// fields.
type Config struct{}

// DefaultConfig is the zero-value configuration.
var DefaultConfig = Config{}

// Sentinel errors returned when the arguments to GetItemPrice are unusable.
var (
	ErrNilBrowser = errors.New("browser is nil")
	ErrNilURL     = errors.New("url is nil")
	ErrInvalidURL = errors.New("invalid url")
)

// Item is a product extracted from a Wegmans product page.
type Item struct {
	ID        int
	Name      string
	Price     float64
	UnitPrice float64
	Unit      string
}
// deferClose closes c, discarding the error from Close. A nil c is ignored so
// the call can be deferred before c has been checked.
func deferClose(c io.Closer) {
	if c == nil {
		return
	}
	_ = c.Close()
}
// GetItemPrice opens a Wegmans product page with b and extracts the item's
// name, price and per-unit price.
//
// u must have a path of the form /product/<id>[/<slug>] with a non-zero
// integer <id>; otherwise ErrInvalidURL is returned. A nil b yields
// ErrNilBrowser and a nil u yields ErrNilURL. An error from opening the page
// is returned as is.
//
// Name, Price, UnitPrice and Unit are best-effort: when the expected element
// is missing or its text cannot be read or parsed, the field is left at its
// zero value.
func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
	if b == nil {
		return Item{}, ErrNilBrowser
	}

	if u == nil {
		return Item{}, ErrNilURL
	}

	// urls in the format of:
	// https://shop.wegmans.com/product/24921[/wegmans-frozen-thin-crust-uncured-pepperoni-pizza]
	// (the slug is optional)
	segments := strings.Split(u.Path, "/")
	if len(segments) < 3 || segments[1] != "product" {
		return Item{}, ErrInvalidURL
	}

	// A non-numeric id and an id of 0 are both rejected.
	id, err := strconv.Atoi(segments[2])
	if err != nil || id == 0 {
		return Item{}, ErrInvalidURL
	}

	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
	// Deferred before the error check so a document handed back together with
	// an error is still closed; deferClose ignores a nil document.
	defer deferClose(doc)
	if err != nil {
		return Item{}, err
	}

	// Best-effort wait: the page is scraped even if the wait reports an error.
	timeout := 15 * time.Second
	_ = doc.WaitForNetworkIdle(&timeout)

	res := Item{ID: id}

	if titles := doc.Select("h1[data-test]"); len(titles) != 0 {
		res.Name, _ = titles[0].Text()
	}

	if prices := doc.Select("span[data-test=\"amount\"] span:nth-child(1)"); len(prices) != 0 {
		priceStr, _ := prices[0].Text()
		priceStr = strings.NewReplacer("$", "", ",", "").Replace(priceStr)
		// Trim so surrounding whitespace in the element text does not make
		// the parse fail and silently yield 0.
		res.Price, _ = strconv.ParseFloat(strings.TrimSpace(priceStr), 64)
	}

	if unitPrices := doc.Select(`span[data-test="per-unit-price"]`); len(unitPrices) != 0 {
		// Text looks like "($1.23/oz)": strip the decoration, then split the
		// amount from the unit.
		unitPriceStr, _ := unitPrices[0].Text()
		unitPriceStr = strings.NewReplacer("(", "", ")", "", "$", "", ",", "").Replace(unitPriceStr)
		unitPriceStr = strings.TrimSpace(unitPriceStr)

		units := strings.Split(unitPriceStr, "/")
		if len(units) > 1 {
			res.Unit = strings.TrimSpace(units[1])
			// Trim the amount too, so "1.23 / oz" parses and does not become 0.
			res.UnitPrice, _ = strconv.ParseFloat(strings.TrimSpace(units[0]), 64)
		}
	}

	return res, nil
}