From 654976de827f291087396f289888bb801e236055 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Mon, 20 Jan 2025 02:16:32 -0500 Subject: [PATCH] Add AisleGopher integration for data extraction Introduced a new package and command for extracting data from aislegopher.com, including URL parsing and item retrieval. Updated dependencies in go.mod to support the new functionality. Additionally, refined import structure in the DuckDuckGo integration. --- go.mod | 4 +- sites/aislegopher/aislegopher.go | 71 +++++++++++++++++ .../cmd/aislegopher/aislegopher.go | 77 +++++++++++++++++++ sites/duckduckgo/cmd/duckduckgo/main.go | 3 +- 4 files changed, 151 insertions(+), 4 deletions(-) create mode 100644 sites/aislegopher/aislegopher.go create mode 100644 sites/aislegopher/cmd/aislegopher/aislegopher.go diff --git a/go.mod b/go.mod index 94a389d..e2cd586 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,8 @@ go 1.23.2 require ( github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f github.com/playwright-community/playwright-go v0.4802.0 + github.com/urfave/cli/v3 v3.0.0-beta1 + golang.org/x/text v0.21.0 ) require ( @@ -15,7 +17,5 @@ require ( github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect - github.com/urfave/cli/v3 v3.0.0-beta1 // indirect golang.org/x/net v0.32.0 // indirect - golang.org/x/text v0.21.0 // indirect ) diff --git a/sites/aislegopher/aislegopher.go b/sites/aislegopher/aislegopher.go new file mode 100644 index 0000000..35d3e69 --- /dev/null +++ b/sites/aislegopher/aislegopher.go @@ -0,0 +1,71 @@ +package aislegopher + +import ( + "context" + "errors" + "fmt" + "io" + "net/url" + "strconv" + "strings" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" +) + +type Config struct { +} + +var DefaultConfig = Config{} + +var ( + ErrInvalidURL = errors.New("invalid url") +) + +type Item struct { + ID int + Name string +} + +func deferClose(cl io.Closer) { + if cl != nil { + _ = cl.Close() + } +} +func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) { + return DefaultConfig.GetItemFromURL(ctx, b, u) +} + +func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) { + res := Item{} + + // the url will be in the format of aislegopher.com/p/slug/id + // we need to parse the slug and id from the url + a := strings.Split(u.Path, "/") + if len(a) != 4 { + return res, ErrInvalidURL + } + + if a[1] != "p" { + return res, ErrInvalidURL + } + + if u.Host != "aislegopher.com" && u.Host != "www.aislegopher.com" { + return res, ErrInvalidURL + } + + res.ID, _ = strconv.Atoi(a[3]) + + doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{}) + defer deferClose(doc) + if err != nil { + return res, fmt.Errorf("failed to open page: %w", err) + } + + names := doc.Select("h2.h4") + + if len(names) > 0 { + res.Name, _ = names[0].Text() + } + + return res, nil +} diff --git a/sites/aislegopher/cmd/aislegopher/aislegopher.go b/sites/aislegopher/cmd/aislegopher/aislegopher.go new file mode 100644 index 0000000..5e599c3 --- /dev/null +++ b/sites/aislegopher/cmd/aislegopher/aislegopher.go @@ -0,0 +1,77 @@ +package main + +import ( + "context" + "fmt" + "io" + "net/url" + "os" + + "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" + "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher" + "github.com/urfave/cli/v3" +) + +type AisleGopherFlags []cli.Flag + +var Flags = AisleGopherFlags{} + +func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config { + res := aislegopher.DefaultConfig + + return res +} + +func deferClose(cl io.Closer) { + if cl != nil { + _ = cl.Close() + } +} +func main() { + var flags []cli.Flag + flags = append(flags, browser.Flags...) + flags = append(flags, Flags...) + + cli := &cli.Command{ + Name: "aislegopher", + Usage: "AisleGopher is a tool for extracting data from aislegopher.com", + Flags: flags, + Action: func(ctx context.Context, c *cli.Command) error { + cfg := Flags.ToConfig(c) + + b, err := browser.FromCommand(ctx, c) + if err != nil { + return fmt.Errorf("failed to create browser: %w", err) + } + + defer deferClose(b) + + arg := c.Args().First() + + if arg == "" { + return fmt.Errorf("url is required") + } + + u, err := url.Parse(arg) + + if err != nil { + return fmt.Errorf("failed to parse url: %w", err) + } + + data, err := cfg.GetItemFromURL(ctx, b, u) + + if err != nil { + return fmt.Errorf("failed to get item from url: %w", err) + } + + fmt.Printf("Item: %+v\n", data) + return nil + }, + } + + err := cli.Run(context.Background(), os.Args) + + if err != nil { + panic(err) + } +} diff --git a/sites/duckduckgo/cmd/duckduckgo/main.go b/sites/duckduckgo/cmd/duckduckgo/main.go index 5919a47..5fb5d2b 100644 --- a/sites/duckduckgo/cmd/duckduckgo/main.go +++ b/sites/duckduckgo/cmd/duckduckgo/main.go @@ -7,10 +7,9 @@ import ( "os" "strings" - "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" - "github.com/urfave/cli/v3" + "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo" )