diff --git a/sites/archive/cmd/archive/main.go b/sites/archive/cmd/archive/main.go
index 8a44897..2467659 100644
--- a/sites/archive/cmd/archive/main.go
+++ b/sites/archive/cmd/archive/main.go
@@ -6,12 +6,11 @@ import (
 	"os"
 	"time"
 
-	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+	"github.com/urfave/cli/v3"
 
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
 	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
 	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
-
-	"github.com/urfave/cli/v3"
 )
 
 type ArchiveFlags []cli.Flag
diff --git a/sites/google/cmd/google/main.go b/sites/google/cmd/google/main.go
new file mode 100644
index 0000000..11ff196
--- /dev/null
+++ b/sites/google/cmd/google/main.go
@@ -0,0 +1,101 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+
+	"github.com/urfave/cli/v3"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/google"
+)
+
+// GoogleFlags is the set of command-line flags specific to the google command.
+type GoogleFlags []cli.Flag
+
+// Flags holds the google-specific flags; main appends them to browser.Flags.
+var Flags = GoogleFlags{
+	&cli.StringFlag{
+		Name:    "domain",
+		Aliases: []string{"d"},
+		Usage:   "The base domain to use",
+	},
+	&cli.StringFlag{
+		Name:    "language",
+		Aliases: []string{"l"},
+		Usage:   "The language to use",
+	},
+}
+
+// ToConfig builds a google.Config from the parsed command. It starts from
+// google.DefaultConfig and overrides only the fields whose flag is non-empty.
+func (f GoogleFlags) ToConfig(_ context.Context, cmd *cli.Command) google.Config {
+	c := google.DefaultConfig
+
+	if d := cmd.String("domain"); d != "" {
+		c.BaseURL = d
+	}
+
+	if l := cmd.String("language"); l != "" {
+		c.Language = l
+	}
+
+	return c
+}
+
+// deferClose closes cl if it is non-nil and discards the error, for use in
+// defer statements where a Close failure cannot be acted upon.
+func deferClose(cl io.Closer) {
+	if cl != nil {
+		_ = cl.Close()
+	}
+}
+
+func main() {
+	var flags []cli.Flag
+
+	flags = append(flags, browser.Flags...)
+	flags = append(flags, Flags...)
+
+	// Named app/cmd rather than cli so the urfave/cli package is not shadowed.
+	app := &cli.Command{
+		Name:  "google",
+		Usage: "Search Google",
+		Flags: flags,
+		Action: func(ctx context.Context, cmd *cli.Command) error {
+			query := strings.Join(cmd.Args().Slice(), " ")
+			if query == "" {
+				return fmt.Errorf("usage: google QUERY")
+			}
+
+			b, err := browser.FromCommand(ctx, cmd)
+			if err != nil {
+				return err
+			}
+
+			// Schedule the close only once the browser is known to be valid.
+			defer deferClose(b)
+
+			cfg := Flags.ToConfig(ctx, cmd)
+
+			res, err := cfg.Search(ctx, b, query)
+			if err != nil {
+				return err
+			}
+
+			fmt.Println(res)
+
+			return nil
+		},
+	}
+
+	if err := app.Run(context.Background(), os.Args); err != nil {
+		// Report the failure on stderr and exit non-zero instead of panicking:
+		// a failed search is not a programmer bug and needs no stack trace.
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
diff --git a/sites/google/google.go b/sites/google/google.go
new file mode 100644
index 0000000..dad0330
--- /dev/null
+++ b/sites/google/google.go
@@ -0,0 +1,164 @@
+// Package google runs Google searches through an extractor.Browser and
+// extracts the results from the rendered page.
+package google
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/url"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+)
+
+// Config selects the Google host and the language/country sent with a search.
+// Empty fields are replaced with defaults before a search runs.
+type Config struct {
+	// BaseURL is the base URL for the search engine, if empty "google.com" is used
+	BaseURL string
+
+	// Language is the language to use for the search engine, if empty "en" is used
+	Language string
+
+	// Country is the country to use for the search engine, if empty "us" is used
+	Country string
+}
+
+// DefaultConfig targets google.com with language "en" and country "us".
+var DefaultConfig = Config{
+	BaseURL:  "google.com",
+	Language: "en",
+	Country:  "us",
+}
+
+// validate returns a copy of c with every empty field set to its default.
+func (c Config) validate() Config {
+	if c.BaseURL == "" {
+		c.BaseURL = "google.com"
+	}
+
+	if c.Language == "" {
+		c.Language = "en"
+	}
+
+	if c.Country == "" {
+		c.Country = "us"
+	}
+
+	return c
+}
+
+// Result is a single search hit extracted from the results page.
+type Result struct {
+	URL         string
+	Title       string
+	Description string
+}
+
+// deferClose closes cl if it is non-nil and discards the error, for use in
+// defer statements where a Close failure cannot be acted upon.
+func deferClose(cl io.Closer) {
+	if cl != nil {
+		_ = cl.Close()
+	}
+}
+
+// countryRestrict maps a country code to the value sent in the "cr" query
+// parameter. Codes without a known mapping yield "".
+func countryRestrict(country string) string {
+	switch country {
+	case "us":
+		return "countryUS"
+	case "uk":
+		return "countryUK"
+	case "au":
+		return "countryAU"
+	case "ca":
+		return "countryCA"
+	default:
+		return ""
+	}
+}
+
+// Search opens the search page for query with b and returns one Result per
+// "div.g" element that contains a link.
+//
+// The query, language ("hl") and country restriction ("cr") are sent as
+// encoded URL parameters. An error is returned if the URL cannot be built,
+// the page cannot be opened, or a result link cannot be read.
+func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
+	c = c.validate()
+
+	u, err := url.Parse(fmt.Sprintf("https://%s/search", c.BaseURL))
+	if err != nil {
+		return nil, fmt.Errorf("invalid url: %w", err)
+	}
+
+	// url.URL.Query returns a freshly parsed copy, so values set on it never
+	// reach the URL. Build the parameters separately and assign them back;
+	// Encode also escapes the query text.
+	params := url.Values{}
+	params.Set("q", query)
+	params.Set("hl", c.Language) // non-empty after validate
+
+	// Countries without a known mapping are searched without a restriction.
+	if cr := countryRestrict(c.Country); cr != "" {
+		params.Set("cr", cr)
+	}
+
+	u.RawQuery = params.Encode()
+
+	doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to open url: %w", err)
+	}
+
+	defer deferClose(doc)
+
+	var res []Result
+
+	err = doc.ForEach("div.g", func(s extractor.Node) error {
+		// get the first link in the div
+		links := s.Select("a")
+		if len(links) == 0 {
+			return nil
+		}
+
+		href, err := links[0].Attr("href")
+		if err != nil {
+			return fmt.Errorf("failed to get link: %w", err)
+		}
+
+		// Title and description are best-effort: when they cannot be read the
+		// field is left empty rather than dropping the result.
+		var title, desc string
+
+		titles := s.Select("div > div > div a > h3")
+		if len(titles) != 0 {
+			title, _ = titles[0].Text()
+		}
+
+		descs := s.Select("div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:not([class])")
+		if len(descs) != 0 {
+			desc, _ = descs[0].Text()
+		}
+
+		res = append(res, Result{
+			URL:         href,
+			Title:       title,
+			Description: desc,
+		})
+
+		return nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to extract results: %w", err)
+	}
+
+	return res, nil
+}
+
+// Search runs query with DefaultConfig.
+func Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
+	return DefaultConfig.Search(ctx, b, query)
+}