google-search/search.go

171 lines
4.0 KiB
Go
Raw Normal View History

2020-03-11 15:22:35 +11:00
package googlesearch
import (
"context"
"fmt"
"strings"
"github.com/gocolly/colly/v2"
)
2020-03-11 15:50:24 +11:00
// Result represents a single result from Google Search.
type Result struct {
2020-03-11 15:22:35 +11:00
// Rank is the order number of the search result.
2020-03-11 15:45:20 +11:00
Rank int `json:"rank"`
2020-03-11 15:22:35 +11:00
// URL of result.
2020-03-11 15:45:20 +11:00
URL string `json:"url"`
2020-03-11 15:22:35 +11:00
// Title of result.
2020-03-11 15:45:20 +11:00
Title string `json:"title"`
2020-03-11 15:22:35 +11:00
// Description of the result.
2020-03-11 15:45:20 +11:00
Description string `json:"description"`
2020-03-11 15:22:35 +11:00
}
// GoogleDomains represents localized Google homepages. The 2 letter country code is based on ISO 3166-1 alpha-2.
//
// PR's are welcome.
//
// See: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
var GoogleDomains = map[string]string{
"us": "https://www.google.com/search?q=",
"gb": "https://www.google.co.uk/search?q=",
"ru": "https://www.google.ru/search?q=",
"fr": "https://www.google.fr/search?q=",
"au": "https://www.google.com.au/search?q=",
"nz": "https://www.google.co.nz/search?q=",
}
2020-03-11 15:33:36 +11:00
// SearchOptions modifies how the Search function behaves.
2020-03-11 15:22:35 +11:00
type SearchOptions struct {
// CountryCode sets the ISO 3166-1 alpha-2 code of the localized Google Search homepage to use.
// The default is "us", which will return results from https://www.google.com.
CountryCode string
// LanguageCode sets the language code.
// Default: en
LanguageCode string
// Limit sets how many results to fetch (at maximum).
Limit int
// Start sets from what rank the new result set should return.
Start int
2020-03-11 15:34:47 +11:00
// UserAgent sets the UserAgent of the http request.
2020-03-11 15:22:35 +11:00
// Default: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
UserAgent string
}
// Search returns a list of search results from Google.
2020-03-11 15:50:24 +11:00
func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, error) {
2020-03-11 15:22:35 +11:00
c := colly.NewCollector(colly.MaxDepth(0))
if len(opts) == 0 {
opts = append(opts, SearchOptions{})
}
if opts[0].UserAgent == "" {
c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
} else {
c.UserAgent = opts[0].UserAgent
}
var lc string
if opts[0].LanguageCode == "" {
lc = "en"
} else {
lc = opts[0].LanguageCode
}
2020-03-11 15:50:24 +11:00
results := []Result{}
2020-03-11 15:22:35 +11:00
var rErr error
rank := 1
c.OnRequest(func(r *colly.Request) {
if err := ctx.Err(); err != nil {
r.Abort()
rErr = err
return
}
})
c.OnError(func(r *colly.Response, err error) {
rErr = err
})
c.OnHTML("div.g", func(e *colly.HTMLElement) {
sel := e.DOM
for i := range sel.Nodes {
2020-03-11 16:52:07 +11:00
if err := ctx.Err(); err != nil {
rErr = err
return
}
2020-03-11 15:22:35 +11:00
item := sel.Eq(i)
rDiv := item.Find("div.r")
linkHref, _ := rDiv.Find("a").Attr("href")
linkText := strings.TrimSpace(linkHref)
titleText := strings.TrimSpace(rDiv.Find("h3").Text())
sDiv := item.Find("div.s")
descText := strings.TrimSpace(sDiv.Find("span.st").Text())
if linkText != "" && linkText != "#" {
2020-03-11 15:50:24 +11:00
result := Result{
2020-03-11 15:22:35 +11:00
Rank: rank,
URL: linkText,
Title: titleText,
Description: descText,
}
results = append(results, result)
rank += 1
}
}
})
url := url(searchTerm, opts[0].CountryCode, lc, opts[0].Limit, opts[0].Start)
c.Visit(url)
if rErr != nil {
return nil, rErr
}
return results, nil
}
func url(searchTerm string, countryCode string, languageCode string, limit int, start int) string {
searchTerm = strings.Trim(searchTerm, " ")
searchTerm = strings.Replace(searchTerm, " ", "+", -1)
countryCode = strings.ToLower(countryCode)
var url string
if googleBase, found := GoogleDomains[countryCode]; found {
if start == 0 {
url = fmt.Sprintf("%s%s&hl=%s", googleBase, searchTerm, languageCode)
} else {
url = fmt.Sprintf("%s%s&hl=%s&start=%d", googleBase, searchTerm, languageCode, start)
}
} else {
if start == 0 {
url = fmt.Sprintf("%s%s&hl=%s", GoogleDomains["us"], searchTerm, languageCode)
} else {
url = fmt.Sprintf("%s%s&hl=%s&start=%d", GoogleDomains["us"], searchTerm, languageCode, start)
}
}
if limit != 0 {
url = fmt.Sprintf("%s&num=%d", url, limit)
}
return url
}