Compare commits: 691ae400d1...langchain
8 Commits

Author | SHA1 | Date
--- | --- | ---
 | 964a98a5a8 |
 | 81ea656332 |
 | 6de455b1bd |
 | f37e60dddc |
 | 654976de82 |
 | e8de488d2b |
 | 67a3552747 |
 | eec94ec708 |
4  go.mod

The two requirements added to the direct require block (github.com/urfave/cli/v3 and golang.org/x/text) were previously indirect dependencies; the second hunk removes them from the indirect block.

@@ -5,6 +5,8 @@ go 1.23.2
 require (
 	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
 	github.com/playwright-community/playwright-go v0.4802.0
+	github.com/urfave/cli/v3 v3.0.0-beta1
+	golang.org/x/text v0.21.0
 )

 require (
@@ -15,7 +17,5 @@ require (
 	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
 	github.com/go-stack/stack v1.8.1 // indirect
 	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
-	github.com/urfave/cli/v3 v3.0.0-beta1 // indirect
 	golang.org/x/net v0.32.0 // indirect
-	golang.org/x/text v0.21.0 // indirect
 )
81  sites/aislegopher/aislegopher.go  (new file)

@@ -0,0 +1,81 @@
package aislegopher

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/url"
	"strconv"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)

type Config struct {
}

var DefaultConfig = Config{}

var (
	ErrInvalidURL = errors.New("invalid url")
)

type Item struct {
	ID    int
	Name  string
	Price float64
}

func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
	return DefaultConfig.GetItemFromURL(ctx, b, u)
}

func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
	res := Item{}

	// the url will be in the format of aislegopher.com/p/slug/id
	// we need to parse the slug and id from the url
	a := strings.Split(u.Path, "/")
	if len(a) != 4 {
		return res, ErrInvalidURL
	}

	if a[1] != "p" {
		return res, ErrInvalidURL
	}

	if u.Host != "aislegopher.com" && u.Host != "www.aislegopher.com" {
		return res, ErrInvalidURL
	}

	res.ID, _ = strconv.Atoi(a[3])

	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
	defer deferClose(doc)
	if err != nil {
		return res, fmt.Errorf("failed to open page: %w", err)
	}

	names := doc.Select("h2.h4")

	if len(names) > 0 {
		res.Name, _ = names[0].Text()
	}

	prices := doc.Select("h4.h2")

	if len(prices) > 0 {
		priceStr, _ := prices[0].Text()
		priceStr = strings.ReplaceAll(priceStr, "$", "")
		priceStr = strings.TrimSpace(priceStr)
		res.Price, _ = strconv.ParseFloat(priceStr, 64)
	}

	return res, nil
}
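For reference, here is a minimal, stdlib-only sketch of the URL validation that GetItemFromURL performs above. The product URL and the parseProductID helper are hypothetical, for illustration only; they are not part of the package.

```go
package main

import (
	"errors"
	"fmt"
	"net/url"
	"strconv"
	"strings"
)

// parseProductID mirrors the path checks in GetItemFromURL:
// the path must look like /p/<slug>/<id> on aislegopher.com.
func parseProductID(raw string) (int, error) {
	u, err := url.Parse(raw)
	if err != nil {
		return 0, err
	}
	if u.Host != "aislegopher.com" && u.Host != "www.aislegopher.com" {
		return 0, errors.New("invalid url")
	}
	parts := strings.Split(u.Path, "/") // "", "p", slug, id
	if len(parts) != 4 || parts[1] != "p" {
		return 0, errors.New("invalid url")
	}
	return strconv.Atoi(parts[3])
}

func main() {
	// Hypothetical example URL, for illustration only.
	id, err := parseProductID("https://aislegopher.com/p/example-item/12345")
	fmt.Println(id, err) // 12345 <nil>
}
```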
77  sites/aislegopher/cmd/aislegopher/aislegopher.go  (new file)

@@ -0,0 +1,77 @@
package main

import (
	"context"
	"fmt"
	"io"
	"net/url"
	"os"

	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
	"github.com/urfave/cli/v3"
)

type AisleGopherFlags []cli.Flag

var Flags = AisleGopherFlags{}

func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
	res := aislegopher.DefaultConfig

	return res
}

func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}
func main() {
	var flags []cli.Flag
	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	cli := &cli.Command{
		Name:  "aislegopher",
		Usage: "AisleGopher is a tool for extracting data from aislegopher.com",
		Flags: flags,
		Action: func(ctx context.Context, c *cli.Command) error {
			cfg := Flags.ToConfig(c)

			b, err := browser.FromCommand(ctx, c)
			if err != nil {
				return fmt.Errorf("failed to create browser: %w", err)
			}

			defer deferClose(b)

			arg := c.Args().First()

			if arg == "" {
				return fmt.Errorf("url is required")
			}

			u, err := url.Parse(arg)

			if err != nil {
				return fmt.Errorf("failed to parse url: %w", err)
			}

			data, err := cfg.GetItemFromURL(ctx, b, u)

			if err != nil {
				return fmt.Errorf("failed to get item from url: %w", err)
			}

			fmt.Printf("Item: %+v\n", data)
			return nil
		},
	}

	err := cli.Run(context.Background(), os.Args)

	if err != nil {
		panic(err)
	}
}
101  sites/duckduckgo/cmd/duckduckgo/main.go  (new file)

@@ -0,0 +1,101 @@
package main

import (
	"context"
	"fmt"
	"io"
	"os"
	"strings"

	"github.com/urfave/cli/v3"

	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
)

type DuckDuckGoFlags []cli.Flag

var Flags = DuckDuckGoFlags{
	&cli.StringFlag{
		Name:    "region",
		Aliases: []string{"r"},
	},
	&cli.StringFlag{
		Name:    "safesearch",
		Aliases: []string{"s"},
	},
}

func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) duckduckgo.Config {
	var res = duckduckgo.DefaultConfig

	if r := cmd.String("region"); r != "" {
		res.Region = r
	}

	if s := cmd.String("safesearch"); s != "" {
		switch s {
		case "on":
			res.SafeSearch = duckduckgo.SafeSearchOn
		case "moderate":
			res.SafeSearch = duckduckgo.SafeSearchModerate
		case "off":
			res.SafeSearch = duckduckgo.SafeSearchOff
		default:
			panic("invalid safe search value")
		}
	}

	return res
}

func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}

func main() {
	var flags []cli.Flag

	flags = append(flags, Flags...)

	cli := &cli.Command{
		Name:  "duckduckgo",
		Usage: "Search DuckDuckGo",
		Flags: flags,
		Action: func(ctx context.Context, command *cli.Command) error {
			c := Flags.ToConfig(command)
			defer deferClose(nil)

			query := strings.TrimSpace(strings.Join(command.Args().Slice(), " "))

			if query == "" {
				return cli.Exit("usage: duckduckgo <query>", 1)
			}

			b, err := browser.FromCommand(ctx, command)
			defer deferClose(b)

			if err != nil {
				return fmt.Errorf("failed to create browser: %w", err)
			}

			res, err := c.Search(ctx, b, query)

			if err != nil {
				return fmt.Errorf("failed to search: %w", err)
			}

			fmt.Println(res)

			return nil
		},
	}

	err := cli.Run(context.Background(), os.Args)

	if err != nil {
		panic(err)
	}
}
126  sites/duckduckgo/duckduckgo.go  (new file)

@@ -0,0 +1,126 @@
package duckduckgo

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"net/url"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)

type SafeSearch int

const (
	SafeSearchOn       SafeSearch = 1
	SafeSearchModerate SafeSearch = -1
	SafeSearchOff      SafeSearch = -2
)

type Config struct {
	// SafeSearch is the safe-search level to use. If empty, SafeSearchOff will be used.
	SafeSearch SafeSearch

	// Region is the region to use for the search engine.
	// See: https://duckduckgo.com/duckduckgo-help-pages/settings/params/ for more values
	Region string
}

func (c Config) validate() Config {
	if c.SafeSearch == 0 {
		c.SafeSearch = SafeSearchOff
	}

	return c
}
func (c Config) ToSearchURL(query string) *url.URL {
	c = c.validate()

	res, _ := url.Parse("https://duckduckgo.com/")

	var vals = res.Query()

	switch c.SafeSearch {
	case SafeSearchOn:
		vals.Set("kp", "1")
	case SafeSearchModerate:
		vals.Set("kp", "-1")
	case SafeSearchOff:
		vals.Set("kp", "-2")
	}

	if c.Region != "" {
		vals.Set("kl", c.Region)
	}

	vals.Set("q", query)

	res.RawQuery = vals.Encode()

	return res
}

var DefaultConfig = Config{
	SafeSearch: SafeSearchOff,
}

type Result struct {
	URL         string
	Title       string
	Description string
}

func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}

func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
	u := c.ToSearchURL(query)

	slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
	defer deferClose(doc)

	if err != nil {
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	var res []Result

	err = doc.ForEach(`article[id^="r1-"]`, func(n extractor.Node) error {
		var r Result

		links := n.Select(`a[href][target="_self"]`)

		if len(links) == 0 {
			return nil
		}

		r.URL, err = links[0].Attr(`href`)

		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}

		titles := n.Select("h2")

		if len(titles) != 0 {
			r.Title, _ = titles[0].Text()
		}

		descriptions := n.Select("span > span")

		if len(descriptions) != 0 {
			r.Description, _ = descriptions[0].Text()
		}

		res = append(res, r)

		return nil
	})

	return res, nil
}
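As a rough usage sketch, the Config above can be exercised directly to see the query string it builds; this assumes the package is importable under the path used by the new cmd/duckduckgo binary, and the region value is just an illustrative example.

```go
package main

import (
	"fmt"

	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
)

func main() {
	cfg := duckduckgo.Config{
		SafeSearch: duckduckgo.SafeSearchModerate,
		Region:     "us-en", // illustrative region code, not required
	}

	// ToSearchURL encodes kp (safe search), kl (region) and q (query).
	u := cfg.ToSearchURL("go html extractor")
	fmt.Println(u.String())
	// Expected shape: https://duckduckgo.com/?kl=us-en&kp=-1&q=go+html+extractor
}
```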
The following hunk reorders imports in an existing file (the file path is not shown in this compare view):

@@ -7,10 +7,9 @@ import (
 	"os"
 	"strings"

-	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
-
 	"github.com/urfave/cli/v3"

+	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
 	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/google"
 )

81  sites/wegmans/cmd/wegmans/main.go  (new file)

@@ -0,0 +1,81 @@
package main

import (
	"context"
	"fmt"
	"io"
	"net/url"
	"os"

	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"

	"github.com/urfave/cli/v3"

	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/wegmans"
)

func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}

type WegmansFlags []cli.Flag

var Flags = WegmansFlags{}

func (f WegmansFlags) ToConfig(_ *cli.Command) wegmans.Config {
	var res = wegmans.DefaultConfig

	return res
}

func main() {
	var flags []cli.Flag

	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	app := &cli.Command{
		Name:  "wegmans",
		Usage: "Search Wegmans",
		Flags: flags,
		Action: func(ctx context.Context, cmd *cli.Command) error {
			cfg := Flags.ToConfig(cmd)

			b, err := browser.FromCommand(ctx, cmd)
			defer deferClose(b)

			if err != nil {
				return fmt.Errorf("error creating browser: %w", err)
			}
			arg := cmd.Args().First()

			if arg == "" {
				return fmt.Errorf("url is required")
			}

			u, err := url.Parse(arg)

			if err != nil {
				return fmt.Errorf("failed to parse url: %w", err)
			}

			item, err := cfg.GetItemPrice(ctx, b, u)

			if err != nil {
				return fmt.Errorf("failed to get item price: %w", err)
			}

			fmt.Println(item)

			return nil
		},
	}

	err := app.Run(context.Background(), os.Args)

	if err != nil {
		panic(err)
	}
}
118  sites/wegmans/wegmans.go  (new file)

@@ -0,0 +1,118 @@
package wegmans

import (
	"context"
	"errors"
	"io"
	"net/url"
	"strconv"
	"strings"
	"time"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)

type Config struct {
}

var DefaultConfig = Config{}

var ErrNilBrowser = errors.New("browser is nil")
var ErrNilURL = errors.New("url is nil")
var ErrInvalidURL = errors.New("invalid url")

type Item struct {
	ID        int
	Name      string
	Price     float64
	UnitPrice float64
	Unit      string
}

func deferClose(c io.Closer) {
	if c != nil {
		_ = c.Close()
	}
}

func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {

	if b == nil {
		return Item{}, ErrNilBrowser
	}

	if u == nil {
		return Item{}, ErrNilURL
	}

	// urls in the format of:
	// https://shop.wegmans.com/product/24921[/wegmans-frozen-thin-crust-uncured-pepperoni-pizza]
	// (the slug is optional)

	// get the product ID
	a := strings.Split(u.Path, "/")

	if len(a) < 3 {
		return Item{}, ErrInvalidURL
	}

	if a[1] != "product" {
		return Item{}, ErrInvalidURL
	}

	id, _ := strconv.Atoi(a[2])

	if id == 0 {
		return Item{}, ErrInvalidURL
	}

	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
	defer deferClose(doc)

	if err != nil {
		return Item{}, err
	}

	timeout := 15 * time.Second
	_ = doc.WaitForNetworkIdle(&timeout)

	res := Item{
		ID: id,
	}

	titles := doc.Select("h1[data-test]")

	if len(titles) != 0 {
		res.Name, _ = titles[0].Text()
	}

	prices := doc.Select("span[data-test=\"amount\"] span:nth-child(1)")

	if len(prices) != 0 {
		priceStr, _ := prices[0].Text()
		priceStr = strings.ReplaceAll(priceStr, "$", "")
		priceStr = strings.ReplaceAll(priceStr, ",", "")
		price, _ := strconv.ParseFloat(priceStr, 64)
		res.Price = price
	}

	unitPrices := doc.Select(`span[data-test="per-unit-price"]`)

	if len(unitPrices) != 0 {
		unitPriceStr, _ := unitPrices[0].Text()
		unitPriceStr = strings.TrimSpace(unitPriceStr)
		unitPriceStr = strings.ReplaceAll(unitPriceStr, "(", "")
		unitPriceStr = strings.ReplaceAll(unitPriceStr, ")", "")
		unitPriceStr = strings.ReplaceAll(unitPriceStr, "$", "")
		unitPriceStr = strings.ReplaceAll(unitPriceStr, ",", "")

		units := strings.Split(unitPriceStr, "/")

		if len(units) > 1 {
			res.Unit = strings.TrimSpace(units[1])
			res.UnitPrice, _ = strconv.ParseFloat(units[0], 64)
		}
	}

	return res, nil
}
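A small stdlib-only sketch of the per-unit price cleanup done at the end of GetItemPrice above; the input string is a made-up example of the text the per-unit-price selector is expected to return, and parseUnitPrice is a hypothetical helper used only for illustration.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseUnitPrice mirrors the cleanup in GetItemPrice: strip parentheses,
// "$" and commas, then split "price/unit" into its two halves.
func parseUnitPrice(s string) (price float64, unit string) {
	s = strings.TrimSpace(s)
	for _, ch := range []string{"(", ")", "$", ","} {
		s = strings.ReplaceAll(s, ch, "")
	}
	parts := strings.Split(s, "/")
	if len(parts) > 1 {
		unit = strings.TrimSpace(parts[1])
		price, _ = strconv.ParseFloat(strings.TrimSpace(parts[0]), 64)
	}
	return price, unit
}

func main() {
	// Hypothetical example of the text behind span[data-test="per-unit-price"].
	p, u := parseUnitPrice("($0.31/oz)")
	fmt.Println(p, u) // 0.31 oz
}
```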