- a few cosmetic changes

This commit is contained in:
rocketlaunchr-cto 2022-11-21 12:32:55 +11:00
parent 871a28763e
commit 4b3dd550ee

View File

@ -5,14 +5,13 @@ package googlesearch
import ( import (
"context" "context"
"fmt" "fmt"
"strings" "net/url"
"strconv" "strconv"
"strings"
"github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/proxy" "github.com/gocolly/colly/v2/proxy"
"github.com/gocolly/colly/v2/queue" "github.com/gocolly/colly/v2/queue"
"net/url"
) )
// Result represents a single result from Google Search. // Result represents a single result from Google Search.
@ -265,8 +264,8 @@ type SearchOptions struct {
// ProxyAddr sets a proxy address to avoid IP blocking. // ProxyAddr sets a proxy address to avoid IP blocking.
ProxyAddr string ProxyAddr string
// follow links // FollowNextPage, when set, scrapes subsequent result pages.
FollowLinks bool FollowNextPage bool
} }
// Search returns a list of search results from Google. // Search returns a list of search results from Google.
@ -297,10 +296,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re
lc = opts[0].LanguageCode lc = opts[0].LanguageCode
} }
q, _ := queue.New( q, _ := queue.New(1, &queue.InMemoryQueueStorage{MaxSize: 10000})
2,
&queue.InMemoryQueueStorage{MaxSize: 10000},
)
limit := opts[0].Limit limit := opts[0].Limit
if opts[0].OverLimit { if opts[0].OverLimit {
@ -310,6 +306,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re
results := []Result{} results := []Result{}
nextPageLink := "" nextPageLink := ""
var rErr error var rErr error
filteredRank := 1
rank := 1 rank := 1
c.OnRequest(func(r *colly.Request) { c.OnRequest(func(r *colly.Request) {
@ -318,7 +315,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re
rErr = err rErr = err
return return
} }
if opts[0].FollowLinks == true && nextPageLink != "" { if opts[0].FollowNextPage && nextPageLink != "" {
req, err := r.New("GET", nextPageLink, nil) req, err := r.New("GET", nextPageLink, nil)
if err == nil { if err == nil {
q.AddRequest(req) q.AddRequest(req)
@ -340,15 +337,16 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re
titleText := strings.TrimSpace(sel.Find("div > div > div > a > h3").Text()) titleText := strings.TrimSpace(sel.Find("div > div > div > a > h3").Text())
descText := strings.TrimSpace(sel.Find("div > div > div > div:first-child > span:first-child").Text()) descText := strings.TrimSpace(sel.Find("div > div > div > div:first-child > span:first-child").Text())
rank += 1
if linkText != "" && linkText != "#" && titleText != "" { if linkText != "" && linkText != "#" && titleText != "" {
result := Result{ result := Result{
Rank: rank, Rank: filteredRank,
URL: linkText, URL: linkText,
Title: titleText, Title: titleText,
Description: descText, Description: descText,
} }
results = append(results, result) results = append(results, result)
rank += 1 filteredRank += 1
} }
// check if there is a next button at the end. // check if there is a next button at the end.
@ -364,8 +362,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re
// check if there is a next button at the end. // check if there is a next button at the end.
// Added this selector because the id is the same for every language checked (google.com, .pt and .es); the text changes but the id remains the same. // Added this selector because the id is the same for every language checked (google.com, .pt and .es); the text changes but the id remains the same.
nextPageHref, exists := sel.Attr("href") if nextPageHref, exists := sel.Attr("href"); exists {
if exists == true {
start := getStart(strings.TrimSpace(nextPageHref)) start := getStart(strings.TrimSpace(nextPageHref))
nextPageLink = buildUrl(searchTerm, opts[0].CountryCode, lc, limit, start) nextPageLink = buildUrl(searchTerm, opts[0].CountryCode, lc, limit, start)
q.AddURL(nextPageLink) q.AddURL(nextPageLink)