From 76da88ef38c85282452ea9e8fb7b72fa73fc3072 Mon Sep 17 00:00:00 2001 From: Miguel Pinto Date: Sat, 5 Nov 2022 22:05:34 +0000 Subject: [PATCH 1/2] Added support to follow to next page --- search.go | 70 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 8 deletions(-) diff --git a/search.go b/search.go index f0ffcbb..34e6d2b 100644 --- a/search.go +++ b/search.go @@ -6,9 +6,13 @@ import ( "context" "fmt" "strings" + "strconv" "github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2/proxy" + "github.com/gocolly/colly/v2/queue" + + "net/url" ) // Result represents a single result from Google Search. @@ -260,6 +264,9 @@ type SearchOptions struct { // ProxyAddr sets a proxy address to avoid IP blocking. ProxyAddr string + + // follow links + FollowLinks bool } // Search returns a list of search results from Google. @@ -290,7 +297,18 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re lc = opts[0].LanguageCode } + q, _ := queue.New( + 2, + &queue.InMemoryQueueStorage{MaxSize: 10000}, + ) + + limit := opts[0].Limit + if opts[0].OverLimit { + limit = int(float64(opts[0].Limit) * 1.5) + } + results := []Result{} + nextPageLink := "" var rErr error rank := 1 @@ -300,6 +318,12 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re rErr = err return } + if opts[0].FollowLinks == true && nextPageLink != "" { + req, err := r.New("GET", nextPageLink, nil) + if err == nil { + q.AddRequest(req) + } + } }) c.OnError(func(r *colly.Response, err error) { @@ -326,14 +350,31 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re results = append(results, result) rank += 1 } + + // check if there is a next button at the end. + // Added this selector as the Id is the same for every language checked on google.com .pt and .es the text changes but the id remains the same + nextPageHref, _ := sel.Find("a #pnnext").Attr("href") + nextPageLink = strings.TrimSpace(nextPageHref) + }) - limit := opts[0].Limit - if opts[0].OverLimit { - limit = int(float64(opts[0].Limit) * 1.5) - } + c.OnHTML("div.g", func(e *colly.HTMLElement) { - url := url(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start) + sel := e.DOM + + // check if there is a next button at the end. + // Added this selector as the Id is the same for every language checked on google.com .pt and .es the text changes but the id remains the same + nextPageHref, exists := sel.Attr("href") + if exists == true { + start := getStart(strings.TrimSpace(nextPageHref)) + nextPageLink = buildUrl(searchTerm, opts[0].CountryCode, lc, limit, start) + q.AddURL(nextPageLink) + } else { + nextPageLink = "" + } + }) + + url := buildUrl(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start) if opts[0].ProxyAddr != "" { rp, err := proxy.RoundRobinProxySwitcher(opts[0].ProxyAddr) @@ -343,7 +384,8 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re c.SetProxyFunc(rp) } - c.Visit(url) + q.AddURL(url) + q.Run(c) if rErr != nil { if strings.Contains(rErr.Error(), "Too Many Requests") { @@ -356,10 +398,22 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re if opts[0].Limit != 0 && len(results) > opts[0].Limit { return results[:opts[0].Limit], nil } - + return results, nil } +func getStart(uri string) int { + u, err := url.Parse(uri) + if err != nil { + fmt.Println(err) + } + q := u.Query() + ss := q.Get("start") + si, _ := strconv.Atoi(ss) + return si + +} + func base(url string) string { if strings.HasPrefix(url, "http") { return url @@ -368,7 +422,7 @@ func base(url string) string { } } -func url(searchTerm string, countryCode string, languageCode string, limit int, start int) string { +func buildUrl(searchTerm string, countryCode string, languageCode string, limit int, start int) string { searchTerm = strings.Trim(searchTerm, " ") searchTerm = strings.Replace(searchTerm, " ", "+", -1) countryCode = strings.ToLower(countryCode) From 4b3dd550eedcb3a381a48e99d7ea3218f75a7d79 Mon Sep 17 00:00:00 2001 From: rocketlaunchr-cto Date: Mon, 21 Nov 2022 12:32:55 +1100 Subject: [PATCH 2/2] - a few cosmetic changes --- search.go | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/search.go b/search.go index 34e6d2b..ff3b263 100644 --- a/search.go +++ b/search.go @@ -5,14 +5,13 @@ package googlesearch import ( "context" "fmt" - "strings" + "net/url" "strconv" + "strings" "github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2/proxy" "github.com/gocolly/colly/v2/queue" - - "net/url" ) // Result represents a single result from Google Search. @@ -265,8 +264,8 @@ type SearchOptions struct { // ProxyAddr sets a proxy address to avoid IP blocking. ProxyAddr string - // follow links - FollowLinks bool + // FollowNextPage, when set, scrapes subsequent result pages. + FollowNextPage bool } // Search returns a list of search results from Google. @@ -297,10 +296,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re lc = opts[0].LanguageCode } - q, _ := queue.New( - 2, - &queue.InMemoryQueueStorage{MaxSize: 10000}, - ) + q, _ := queue.New(1, &queue.InMemoryQueueStorage{MaxSize: 10000}) limit := opts[0].Limit if opts[0].OverLimit { @@ -310,6 +306,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re results := []Result{} nextPageLink := "" var rErr error + filteredRank := 1 rank := 1 c.OnRequest(func(r *colly.Request) { @@ -318,7 +315,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re rErr = err return } - if opts[0].FollowLinks == true && nextPageLink != "" { + if opts[0].FollowNextPage && nextPageLink != "" { req, err := r.New("GET", nextPageLink, nil) if err == nil { q.AddRequest(req) @@ -340,15 +337,16 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re titleText := strings.TrimSpace(sel.Find("div > div > div > a > h3").Text()) descText := strings.TrimSpace(sel.Find("div > div > div > div:first-child > span:first-child").Text()) + rank += 1 if linkText != "" && linkText != "#" && titleText != "" { result := Result{ - Rank: rank, + Rank: filteredRank, URL: linkText, Title: titleText, Description: descText, } results = append(results, result) - rank += 1 + filteredRank += 1 } // check if there is a next button at the end. @@ -364,8 +362,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re // check if there is a next button at the end. // Added this selector as the Id is the same for every language checked on google.com .pt and .es the text changes but the id remains the same - nextPageHref, exists := sel.Attr("href") - if exists == true { + if nextPageHref, exists := sel.Attr("href"); exists { start := getStart(strings.TrimSpace(nextPageHref)) nextPageLink = buildUrl(searchTerm, opts[0].CountryCode, lc, limit, start) q.AddURL(nextPageLink) @@ -373,7 +370,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re nextPageLink = "" } }) - + url := buildUrl(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start) if opts[0].ProxyAddr != "" { @@ -398,7 +395,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re if opts[0].Limit != 0 && len(results) > opts[0].Limit { return results[:opts[0].Limit], nil } - + return results, nil }