Add AisleGopher integration for data extraction
Introduced a new package and command for extracting data from aislegopher.com, including URL parsing and item retrieval. Updated dependencies in go.mod to support the new functionality. Additionally, refined import structure in the DuckDuckGo integration.
This commit is contained in:
parent
e8de488d2b
commit
654976de82
4
go.mod
4
go.mod
@ -5,6 +5,8 @@ go 1.23.2
|
||||
require (
|
||||
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
|
||||
github.com/playwright-community/playwright-go v0.4802.0
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1
|
||||
golang.org/x/text v0.21.0
|
||||
)
|
||||
|
||||
require (
|
||||
@ -15,7 +17,5 @@ require (
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||
github.com/go-stack/stack v1.8.1 // indirect
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1 // indirect
|
||||
golang.org/x/net v0.32.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
)
|
||||
|
71
sites/aislegopher/aislegopher.go
Normal file
71
sites/aislegopher/aislegopher.go
Normal file
@ -0,0 +1,71 @@
|
||||
package aislegopher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
// Config is the configuration for the aislegopher extractor. It currently has
// no fields; the extraction entry point is its GetItemFromURL method.
type Config struct{}

// DefaultConfig is the Config used by the package-level GetItemFromURL.
var DefaultConfig Config
|
||||
|
||||
// ErrInvalidURL is returned by GetItemFromURL when the URL is not an
// aislegopher.com item URL of the form /p/<slug>/<id>.
var ErrInvalidURL = errors.New("invalid url")
|
||||
|
||||
// Item is the data GetItemFromURL extracts for one aislegopher.com item page.
type Item struct {
	// ID is the integer parsed from the last path segment of the item URL.
	ID int
	// Name is the text of the first "h2.h4" element on the page; it is
	// empty when the page has no such element.
	Name string
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error from Close.
// It exists so that a close can be written as a single `defer deferClose(x)`.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	_ = cl.Close()
}
|
||||
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||
return DefaultConfig.GetItemFromURL(ctx, b, u)
|
||||
}
|
||||
|
||||
func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||
res := Item{}
|
||||
|
||||
// the url will be in the format of aislegopher.com/p/slug/id
|
||||
// we need to parse the slug and id from the url
|
||||
a := strings.Split(u.Path, "/")
|
||||
if len(a) != 4 {
|
||||
return res, ErrInvalidURL
|
||||
}
|
||||
|
||||
if a[1] != "p" {
|
||||
return res, ErrInvalidURL
|
||||
}
|
||||
|
||||
if u.Host != "aislegopher.com" && u.Host != "www.aislegopher.com" {
|
||||
return res, ErrInvalidURL
|
||||
}
|
||||
|
||||
res.ID, _ = strconv.Atoi(a[3])
|
||||
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("failed to open page: %w", err)
|
||||
}
|
||||
|
||||
names := doc.Select("h2.h4")
|
||||
|
||||
if len(names) > 0 {
|
||||
res.Name, _ = names[0].Text()
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
77
sites/aislegopher/cmd/aislegopher/aislegopher.go
Normal file
77
sites/aislegopher/cmd/aislegopher/aislegopher.go
Normal file
@ -0,0 +1,77 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"os"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
|
||||
"github.com/urfave/cli/v3"
|
||||
)
|
||||
|
||||
type AisleGopherFlags []cli.Flag
|
||||
|
||||
var Flags = AisleGopherFlags{}
|
||||
|
||||
func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
|
||||
res := aislegopher.DefaultConfig
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
// deferClose is a defer-friendly helper: it calls Close on cl unless cl is
// nil, and ignores the error Close returns.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	_ = cl.Close()
}
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
flags = append(flags, browser.Flags...)
|
||||
flags = append(flags, Flags...)
|
||||
|
||||
cli := &cli.Command{
|
||||
Name: "aislegopher",
|
||||
Usage: "AisleGopher is a tool for extracting data from aislegopher.com",
|
||||
Flags: flags,
|
||||
Action: func(ctx context.Context, c *cli.Command) error {
|
||||
cfg := Flags.ToConfig(c)
|
||||
|
||||
b, err := browser.FromCommand(ctx, c)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create browser: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(b)
|
||||
|
||||
arg := c.Args().First()
|
||||
|
||||
if arg == "" {
|
||||
return fmt.Errorf("url is required")
|
||||
}
|
||||
|
||||
u, err := url.Parse(arg)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse url: %w", err)
|
||||
}
|
||||
|
||||
data, err := cfg.GetItemFromURL(ctx, b, u)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get item from url: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Item: %+v\n", data)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
@ -7,10 +7,9 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
||||
)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user