Files
go-extractor/sites/google/google.go
Steve Dudenhoeffer a9711ce904
All checks were successful
CI / vet (pull_request) Successful in 1m10s
CI / build (pull_request) Successful in 1m21s
CI / test (pull_request) Successful in 1m28s
fix: surface parsing errors instead of silently discarding them
Return errors for required fields (ID, price) and log warnings for
optional fields (title, description, unit price) across all site
extractors instead of silently discarding them with _ =.

Closes #24
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 16:31:56 +00:00

150 lines
2.6 KiB
Go

package google
import (
"context"
"fmt"
"log/slog"
"net/url"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Config holds the settings used to build a Google search request.
type Config struct {
	// BaseURL is the base URL for the search engine, if empty "google.com" is used
	BaseURL string

	// Language is the language to use for the search engine, if empty "en" is used
	Language string

	// Country is the country to use for the search engine, if empty "us" is used
	Country string
}

// DefaultConfig is the configuration used by the package-level Search helper.
var DefaultConfig = Config{
	BaseURL:  "google.com",
	Language: "en",
	Country:  "us",
}

// validate returns a copy of c with every unset field replaced by its
// default value ("google.com", "en", "us" respectively).
func (c Config) validate() Config {
	// orDefault substitutes def when v is the empty string.
	orDefault := func(v, def string) string {
		if v == "" {
			return def
		}
		return v
	}

	c.BaseURL = orDefault(c.BaseURL, "google.com")
	c.Language = orDefault(c.Language, "en")
	c.Country = orDefault(c.Country, "us")

	return c
}
// Result is a single search result scraped from the Google results page.
type Result struct {
	// URL is the destination link taken from the first anchor in the result block.
	URL string
	// Title is the heading text; empty when the title node is missing or unreadable.
	Title string
	// Description is the snippet text; empty when not found or unreadable.
	Description string
}
// searchURL assembles the full Google search URL for query, applying the
// configured language ("hl") and, for the supported countries, the
// country-restrict ("cr") parameter.
func (c Config) searchURL(query string) (string, error) {
	u, err := url.Parse(fmt.Sprintf("https://%s/search", c.BaseURL))
	if err != nil {
		return "", fmt.Errorf("invalid url: %w", err)
	}

	vals := u.Query()
	vals.Set("q", query)

	if c.Language != "" {
		vals.Set("hl", c.Language)
	}

	if c.Country != "" {
		// Google's "cr" parameter takes countryXX codes; countries outside
		// this list are silently omitted rather than rejected.
		var country string
		switch c.Country {
		case "us":
			country = "countryUS"
		case "uk":
			country = "countryUK"
		case "au":
			country = "countryAU"
		case "ca":
			country = "countryCA"
		}
		if country != "" {
			vals.Set("cr", country)
		}
	}

	u.RawQuery = vals.Encode()
	return u.String(), nil
}

// Search opens a Google results page for query in the given browser and
// scrapes the organic results ("div.g" blocks). The result link is required;
// title and description are best-effort and a scrape failure there is logged
// as a warning instead of aborting the page.
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
	c = c.validate()

	target, err := c.searchURL(query)
	if err != nil {
		return nil, err
	}

	doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to open url: %w", err)
	}
	defer extractor.DeferClose(doc)

	var res []Result
	err = doc.ForEach("div.g", func(s extractor.Node) error {
		// A result block must contain at least one anchor; blocks without
		// one are skipped silently.
		links := s.Select("a")
		if len(links) == 0 {
			return nil
		}

		href, err := links[0].Attr("href")
		if err != nil {
			return fmt.Errorf("failed to get link: %w", err)
		}

		// Title is optional: a read failure degrades to an empty field.
		var title string
		if titles := s.Select("div > div > div a > h3"); len(titles) != 0 {
			title, err = titles[0].Text()
			if err != nil {
				slog.Warn("failed to get result title", "err", err)
			}
		}

		// Description is likewise optional.
		var desc string
		if descs := s.Select("div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:not([class])"); len(descs) != 0 {
			desc, err = descs[0].Text()
			if err != nil {
				slog.Warn("failed to get result description", "err", err)
			}
		}

		res = append(res, Result{
			URL:         href,
			Title:       title,
			Description: desc,
		})
		return nil
	})

	return res, err
}
// Search runs a Google search for query in the given browser using
// DefaultConfig ("google.com", language "en", country "us").
func Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
	return DefaultConfig.Search(ctx, b, query)
}