diff --git a/go.mod b/go.mod
index 94a389d..e2cd586 100644
--- a/go.mod
+++ b/go.mod
@@ -5,6 +5,8 @@ go 1.23.2
 require (
 	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
 	github.com/playwright-community/playwright-go v0.4802.0
+	github.com/urfave/cli/v3 v3.0.0-beta1
+	golang.org/x/text v0.21.0
 )
 
 require (
@@ -15,7 +17,5 @@ require (
 	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
 	github.com/go-stack/stack v1.8.1 // indirect
 	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
-	github.com/urfave/cli/v3 v3.0.0-beta1 // indirect
 	golang.org/x/net v0.32.0 // indirect
-	golang.org/x/text v0.21.0 // indirect
 )
diff --git a/sites/aislegopher/aislegopher.go b/sites/aislegopher/aislegopher.go
new file mode 100644
index 0000000..35d3e69
--- /dev/null
+++ b/sites/aislegopher/aislegopher.go
@@ -0,0 +1,86 @@
+// Package aislegopher extracts item data from aislegopher.com product pages.
+package aislegopher
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/url"
+	"strconv"
+	"strings"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+)
+
+// Config holds the options for extracting items. It has no fields yet.
+type Config struct {
+}
+
+// DefaultConfig is the Config used by the package-level GetItemFromURL.
+var DefaultConfig = Config{}
+
+var (
+	// ErrInvalidURL is returned when a URL is not an aislegopher.com item URL.
+	ErrInvalidURL = errors.New("invalid url")
+)
+
+// Item is the data extracted from an aislegopher.com item page. ID is the
+// numeric id from the URL path; Name is the text of the first "h2.h4" element
+// on the page, if any.
+type Item struct {
+	ID   int
+	Name string
+}
+
+// deferClose closes cl and discards the error; a nil cl is ignored.
+func deferClose(cl io.Closer) {
+	if cl != nil {
+		_ = cl.Close()
+	}
+}
+
+// GetItemFromURL extracts the Item at u using DefaultConfig.
+func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
+	return DefaultConfig.GetItemFromURL(ctx, b, u)
+}
+
+// GetItemFromURL opens u with b and extracts the Item it describes.
+//
+// u must be an aislegopher.com (or www.aislegopher.com) URL whose path has the
+// form /p/<slug>/<id> with a numeric id; otherwise ErrInvalidURL is returned
+// and no page is opened. A page without a matching name element yields an
+// Item with an empty Name.
+func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
+	// Splitting "/p/<slug>/<id>" on "/" yields ["", "p", slug, id].
+	parts := strings.Split(u.Path, "/")
+	if len(parts) != 4 || parts[1] != "p" {
+		return Item{}, ErrInvalidURL
+	}
+
+	if u.Host != "aislegopher.com" && u.Host != "www.aislegopher.com" {
+		return Item{}, ErrInvalidURL
+	}
+
+	// Reject a non-numeric id instead of silently treating it as 0.
+	id, err := strconv.Atoi(parts[3])
+	if err != nil {
+		return Item{}, ErrInvalidURL
+	}
+
+	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
+	// deferClose skips a nil doc, so it is registered before the error check.
+	defer deferClose(doc)
+	if err != nil {
+		return Item{}, fmt.Errorf("failed to open page: %w", err)
+	}
+
+	res := Item{ID: id}
+
+	// The name is best-effort; the second result of Text is deliberately ignored.
+	if names := doc.Select("h2.h4"); len(names) > 0 {
+		res.Name, _ = names[0].Text()
+	}
+
+	return res, nil
+}
diff --git a/sites/aislegopher/cmd/aislegopher/aislegopher.go b/sites/aislegopher/cmd/aislegopher/aislegopher.go
new file mode 100644
index 0000000..5e599c3
--- /dev/null
+++ b/sites/aislegopher/cmd/aislegopher/aislegopher.go
@@ -0,0 +1,86 @@
+// Command aislegopher prints the item found at an aislegopher.com URL.
+package main
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/url"
+	"os"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
+	"github.com/urfave/cli/v3"
+)
+
+// AisleGopherFlags is the list of command-line flags specific to aislegopher.
+type AisleGopherFlags []cli.Flag
+
+// Flags holds the aislegopher-specific flags; there are none yet.
+var Flags = AisleGopherFlags{}
+
+// ToConfig builds the aislegopher.Config for a command invocation. It ignores
+// the command and returns aislegopher.DefaultConfig.
+func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
+	res := aislegopher.DefaultConfig
+
+	return res
+}
+
+// deferClose closes cl and discards the error; a nil cl is ignored.
+func deferClose(cl io.Closer) {
+	if cl != nil {
+		_ = cl.Close()
+	}
+}
+
+// run is the command action: it opens a browser, extracts the item at the URL
+// given as the first argument, and prints it to standard output.
+func run(ctx context.Context, c *cli.Command) error {
+	cfg := Flags.ToConfig(c)
+
+	b, err := browser.FromCommand(ctx, c)
+	if err != nil {
+		return fmt.Errorf("failed to create browser: %w", err)
+	}
+	defer deferClose(b)
+
+	arg := c.Args().First()
+	if arg == "" {
+		return fmt.Errorf("url is required")
+	}
+
+	u, err := url.Parse(arg)
+	if err != nil {
+		return fmt.Errorf("failed to parse url: %w", err)
+	}
+
+	data, err := cfg.GetItemFromURL(ctx, b, u)
+	if err != nil {
+		return fmt.Errorf("failed to get item from url: %w", err)
+	}
+
+	fmt.Printf("Item: %+v\n", data)
+	return nil
+}
+
+// main wires the browser and aislegopher flags into the command and runs it,
+// exiting with status 1 if the command returns an error.
+func main() {
+	var flags []cli.Flag
+	flags = append(flags, browser.Flags...)
+	flags = append(flags, Flags...)
+
+	// Named cmd, not cli, so the urfave/cli package is not shadowed.
+	cmd := &cli.Command{
+		Name:   "aislegopher",
+		Usage:  "AisleGopher is a tool for extracting data from aislegopher.com",
+		Flags:  flags,
+		Action: run,
+	}
+
+	if err := cmd.Run(context.Background(), os.Args); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
diff --git a/sites/duckduckgo/cmd/duckduckgo/main.go b/sites/duckduckgo/cmd/duckduckgo/main.go
index 5919a47..5fb5d2b 100644
--- a/sites/duckduckgo/cmd/duckduckgo/main.go
+++ b/sites/duckduckgo/cmd/duckduckgo/main.go
@@ -7,10 +7,9 @@ import (
 	"os"
 	"strings"
 
-	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
-
 	"github.com/urfave/cli/v3"
 
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
 	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
 )
 