// Patch summary (commentary placed ahead of the first "diff --git" header; the
// patch text below is unchanged).
//
// Target: search.go, package googlesearch. The change adds optional pagination
// to Search by driving the collector through a colly request queue.
//
// Hunk by hunk:
//   * imports: adds net/url, strconv and github.com/gocolly/colly/v2/queue.
//   * SearchOptions: adds the FollowNextPage bool field.
//   * Search, setup: creates a queue with queue.New(1, InMemoryQueueStorage
//     {MaxSize: 10000}); moves the limit / OverLimit computation ahead of the
//     callbacks, where the new "div.g" handler references it; introduces
//     nextPageLink and filteredRank.
//   * Search, OnRequest: when opts[0].FollowNextPage is set and nextPageLink is
//     non-empty, builds a GET request with r.New and adds it to the queue.
//   * Search, result extraction: Result.Rank now comes from filteredRank, which
//     advances only for entries that pass the link/title filter; rank advances
//     for every matched element. The href found under "a #pnnext" is trimmed
//     and stored in nextPageLink.
//   * Search, new OnHTML("div.g") handler: if the matched element has an href
//     attribute, its "start" value is extracted with getStart, a URL is rebuilt
//     with buildUrl, stored in nextPageLink and queued; otherwise nextPageLink
//     is cleared.
//   * Search, dispatch: c.Visit(url) is replaced by q.AddURL(url) + q.Run(c).
//   * getStart(uri) int (new): parses uri and returns the integer value of its
//     "start" query parameter; the Atoi error is dropped, so a missing or
//     non-numeric value yields 0.
//   * url(...) is renamed to buildUrl(...), presumably so it no longer clashes
//     with the newly imported net/url package name.
//
// NOTE(review): getStart only prints the url.Parse error and carries on; u is
//   nil on that path, so u.Query() would dereference a nil *url.URL. Confirm
//   and return early instead.
// NOTE(review): the new handler is registered for "div.g" but reads href from
//   the matched element itself; the accompanying comment talks about the
//   next-button id, so the intended selector looks like the pnnext anchor.
//   TODO confirm.
// NOTE(review): "a #pnnext" (with a space) is a descendant selector, i.e. an
//   element with id pnnext inside an anchor, not an anchor carrying that id.
//   Verify against the real markup.
// NOTE(review): FollowNextPage gates only the OnRequest path; the "div.g"
//   handler calls q.AddURL without checking the option, and both paths can
//   enqueue the same link.
// NOTE(review): the error from queue.New is discarded and the return values of
//   q.AddRequest, q.AddURL and q.Run are not checked.
// NOTE(review): rank is still incremented but is not read in any visible hunk.
// NOTE(review): in this copy the hunk lines are run together on three physical
//   lines; line breaks and indentation would have to be restored before the
//   patch can be applied.
diff --git a/search.go b/search.go index f0ffcbb..ff3b263 100644 --- a/search.go +++ b/search.go @@ -5,10 +5,13 @@ package googlesearch import ( "context" "fmt" + "net/url" + "strconv" "strings" "github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2/proxy" + "github.com/gocolly/colly/v2/queue" ) // Result represents a single result from Google Search. @@ -260,6 +263,9 @@ type SearchOptions struct { // ProxyAddr sets a proxy address to avoid IP blocking. ProxyAddr string + + // FollowNextPage, when set, scrapes subsequent result pages. + FollowNextPage bool } // Search returns a list of search results from Google. @@ -290,8 +296,17 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re lc = opts[0].LanguageCode } + q, _ := queue.New(1, &queue.InMemoryQueueStorage{MaxSize: 10000}) + + limit := opts[0].Limit + if opts[0].OverLimit { + limit = int(float64(opts[0].Limit) * 1.5) + } + results := []Result{} + nextPageLink := "" var rErr error + filteredRank := 1 rank := 1 c.OnRequest(func(r *colly.Request) { @@ -300,6 +315,12 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re rErr = err return } + if opts[0].FollowNextPage && nextPageLink != "" { + req, err := r.New("GET", nextPageLink, nil) + if err == nil { + q.AddRequest(req) + } + } }) c.OnError(func(r *colly.Response, err error) { @@ -316,24 +337,41 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re titleText := strings.TrimSpace(sel.Find("div > div > div > a > h3").Text()) descText := strings.TrimSpace(sel.Find("div > div > div > div:first-child > span:first-child").Text()) + rank += 1 if linkText != "" && linkText != "#" && titleText != "" { result := Result{ - Rank: rank, + Rank: filteredRank, URL: linkText, Title: titleText, Description: descText, } results = append(results, result) - rank += 1 + filteredRank += 1 + } + + // check if there is a next button at the end. 
+ // Added this selector as the Id is the same for every language checked on google.com .pt and .es the text changes but the id remains the same + nextPageHref, _ := sel.Find("a #pnnext").Attr("href") + nextPageLink = strings.TrimSpace(nextPageHref) + + }) + + c.OnHTML("div.g", func(e *colly.HTMLElement) { + + sel := e.DOM + + // check if there is a next button at the end. + // Added this selector as the Id is the same for every language checked on google.com .pt and .es the text changes but the id remains the same + if nextPageHref, exists := sel.Attr("href"); exists { + start := getStart(strings.TrimSpace(nextPageHref)) + nextPageLink = buildUrl(searchTerm, opts[0].CountryCode, lc, limit, start) + q.AddURL(nextPageLink) + } else { + nextPageLink = "" } }) - limit := opts[0].Limit - if opts[0].OverLimit { - limit = int(float64(opts[0].Limit) * 1.5) - } - - url := url(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start) + url := buildUrl(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start) if opts[0].ProxyAddr != "" { rp, err := proxy.RoundRobinProxySwitcher(opts[0].ProxyAddr) @@ -343,7 +381,8 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re c.SetProxyFunc(rp) } - c.Visit(url) + q.AddURL(url) + q.Run(c) if rErr != nil { if strings.Contains(rErr.Error(), "Too Many Requests") { @@ -360,6 +399,18 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re return results, nil } +func getStart(uri string) int { + u, err := url.Parse(uri) + if err != nil { + fmt.Println(err) + } + q := u.Query() + ss := q.Get("start") + si, _ := strconv.Atoi(ss) + return si + +} + func base(url string) string { if strings.HasPrefix(url, "http") { return url @@ -368,7 +419,7 @@ func base(url string) string { } } -func url(searchTerm string, countryCode string, languageCode string, limit int, start int) string { +func buildUrl(searchTerm string, countryCode string, languageCode string, limit int, start int) string { 
searchTerm = strings.Trim(searchTerm, " ") searchTerm = strings.Replace(searchTerm, " ", "+", -1) countryCode = strings.ToLower(countryCode)