added archive, megamillions, and powerball site logic
This commit is contained in:
parent
5e924eb3f9
commit
567a9f9212
@ -5,8 +5,12 @@ import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// OpenPageOptions carries the optional, per-navigation settings accepted by
// Browser.Open. The zero value asks for no special behavior.
type OpenPageOptions struct {
	// Referer is the Referer to send with the navigation. Empty means none.
	Referer string
}
|
||||
|
||||
type Browser interface {
|
||||
io.Closer
|
||||
|
||||
Open(ctx context.Context, url string) (Document, error)
|
||||
Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error)
|
||||
}
|
||||
|
79
cmd/browser/main.go
Normal file
79
cmd/browser/main.go
Normal file
@ -0,0 +1,79 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
)
|
||||
|
||||
// deferClose closes cl and deliberately discards the error. It is meant for
// defer statements where nothing useful can be done about a failed close.
// A nil cl is ignored so the helper is safe to defer unconditionally.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
func main() {
|
||||
cmd := &cli.Command{
|
||||
Name: "browser",
|
||||
Flags: browser.Flags,
|
||||
Usage: "<url>",
|
||||
Action: func(ctx context.Context, cli *cli.Command) error {
|
||||
target := cli.Args().First()
|
||||
if target == "" {
|
||||
return fmt.Errorf("no url specified")
|
||||
}
|
||||
|
||||
b, err := browser.FromCommand(ctx, cli)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer deferClose(b)
|
||||
|
||||
// now open the user specified url
|
||||
doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
|
||||
article, err := extractor.Readability(ctx, doc)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
content := ""
|
||||
|
||||
if article.Content != "" {
|
||||
|
||||
if len(article.Content) > 32 {
|
||||
content = article.Content[:32] + "..."
|
||||
} else {
|
||||
content = article.Content
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Title:", article.Title)
|
||||
fmt.Println("Byline:", article.Byline)
|
||||
fmt.Println("Site:", article.SiteName)
|
||||
fmt.Println("Published:", article.PublishedTime)
|
||||
fmt.Println("Excerpt:", article.Excerpt)
|
||||
fmt.Println("Length:", article.Length)
|
||||
fmt.Println("Lang:", article.Lang)
|
||||
fmt.Println("Content:", content)
|
||||
fmt.Println("TextContent:", article.TextContent)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
err := cmd.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
76
cmd/browser/pkg/browser/flags.go
Normal file
76
cmd/browser/pkg/browser/flags.go
Normal file
@ -0,0 +1,76 @@
|
||||
package browser
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
type BrowserFlags []cli.Flag
|
||||
|
||||
var Flags = BrowserFlags{
|
||||
&cli.StringFlag{
|
||||
Name: "user-agent",
|
||||
Aliases: []string{"ua"},
|
||||
Usage: "User-Agent to use for requests",
|
||||
DefaultText: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "timeout",
|
||||
Aliases: []string{"t"},
|
||||
Usage: "Timeout for requests",
|
||||
DefaultText: "30s",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "browser",
|
||||
Aliases: []string{"b"},
|
||||
Usage: "Browser to use, one of: chromium, firefox, webkit",
|
||||
DefaultText: "firefox",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "cookies-file",
|
||||
Aliases: []string{"c"},
|
||||
Usage: "cookies.txt file to load cookies from",
|
||||
DefaultText: "",
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "visible",
|
||||
Usage: "If set, the browser will be visible, if not set, the browser will be headless",
|
||||
DefaultText: "false",
|
||||
},
|
||||
}
|
||||
|
||||
func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error) {
|
||||
var opts extractor.PlayWrightBrowserOptions
|
||||
|
||||
if ua := cmd.String("user-agent"); ua != "" {
|
||||
opts.UserAgent = ua
|
||||
}
|
||||
|
||||
if to := cmd.String("timeout"); to != "" {
|
||||
d, err := time.ParseDuration(to)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
opts.Timeout = &d
|
||||
}
|
||||
|
||||
if b := cmd.String("browser"); b != "" {
|
||||
opts.Browser = extractor.PlayWrightBrowserSelection(b)
|
||||
}
|
||||
|
||||
if cf := cmd.String("cookies-file"); cf != "" {
|
||||
cookies, err := extractor.LoadCookiesFile(cf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
opts.CookieJar = cookies
|
||||
}
|
||||
|
||||
opts.ShowBrowser = cmd.Bool("visible")
|
||||
|
||||
return extractor.NewPlayWrightBrowser(opts)
|
||||
}
|
48
cookiejar.go
48
cookiejar.go
@ -1,20 +1,58 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Cookie is a single cookie as stored in a CookieJar.
type Cookie struct {
	// Host is the host the cookie belongs to. A leading "." means the cookie
	// applies to that domain and to any of its subdomains.
	Host string
	// Path restricts the cookie to URLs under this path. Empty and "/" both
	// mean "every path".
	Path     string
	Expires  time.Time
	Secure   bool
	HttpOnly bool
	Name     string
	Value    string
}

// IsTargetMatch reports whether the cookie applies to the URL target, judged
// by host and path only. It returns an error when target is not a parseable
// URL.
func (c Cookie) IsTargetMatch(target string) (bool, error) {
	u, err := url.Parse(target)
	if err != nil {
		return false, err
	}

	if !cookieHostMatches(c.Host, u) {
		return false, nil
	}

	return cookiePathMatches(c.Path, u.Path), nil
}

// cookieHostMatches reports whether cookieHost covers the host of u. The port
// of u is ignored. A cookieHost starting with "." matches the bare domain and
// every subdomain of it.
func cookieHostMatches(cookieHost string, u *url.URL) bool {
	// Exact comparison against the raw host (port included) is kept so that
	// anything that matched before still matches.
	if cookieHost == u.Host {
		return true
	}

	host := u.Hostname()
	if cookieHost == host {
		return true
	}

	if !strings.HasPrefix(cookieHost, ".") {
		return false
	}

	// ".example.com" covers "example.com" itself as well as "x.example.com".
	// The suffix check keeps the leading dot, so "badexample.com" is rejected.
	bare := cookieHost[1:]
	return (bare != "" && host == bare) || strings.HasSuffix(host, cookieHost)
}

// cookiePathMatches reports whether a cookie scoped to cookiePath applies to
// requestPath. The cookie path must be a prefix of the request path that ends
// on a path-segment boundary, so "/foo" covers "/foo" and "/foo/bar" but not
// "/foosball".
func cookiePathMatches(cookiePath, requestPath string) bool {
	if cookiePath == "" || cookiePath == "/" {
		return true
	}

	if requestPath == "" {
		requestPath = "/"
	}

	if !strings.HasPrefix(requestPath, cookiePath) {
		return false
	}

	if len(requestPath) == len(cookiePath) {
		return true
	}

	// The prefix ends on a boundary either because the cookie path itself
	// ends with "/" or because the next request character is "/".
	return strings.HasSuffix(cookiePath, "/") || requestPath[len(cookiePath)] == '/'
}
|
||||
|
||||
type CookieJar interface {
|
||||
GetAll() ([]Cookie, error)
|
||||
Get(url string) ([]Cookie, error)
|
||||
Set(cookie Cookie) error
|
||||
Delete(cookie Cookie) error
|
||||
}
|
||||
@ -29,6 +67,10 @@ func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) {
|
||||
return r.Jar.GetAll()
|
||||
}
|
||||
|
||||
func (r ReadOnlyCookieJar) Get(url string) ([]Cookie, error) {
|
||||
return r.Jar.Get(url)
|
||||
}
|
||||
|
||||
func (r ReadOnlyCookieJar) Set(_ Cookie) error {
|
||||
return nil
|
||||
}
|
||||
|
106
cookies_txt.go
Normal file
106
cookies_txt.go
Normal file
@ -0,0 +1,106 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type staticCookieJar []Cookie
|
||||
|
||||
// GetAll will return all cookies in the jar.
|
||||
func (s *staticCookieJar) GetAll() ([]Cookie, error) {
|
||||
return *s, nil
|
||||
}
|
||||
|
||||
// Get will, given a URL, return all cookies that are valid for that URL.
|
||||
func (s *staticCookieJar) Get(target string) ([]Cookie, error) {
|
||||
var validCookies []Cookie
|
||||
|
||||
for _, cookie := range *s {
|
||||
if match, err := cookie.IsTargetMatch(target); err != nil {
|
||||
return nil, err
|
||||
} else if match {
|
||||
validCookies = append(validCookies, cookie)
|
||||
}
|
||||
}
|
||||
|
||||
return validCookies, nil
|
||||
}
|
||||
|
||||
func (s *staticCookieJar) Set(cookie Cookie) error {
|
||||
// see if the cookie already exists
|
||||
for i, c := range *s {
|
||||
if c.Name == cookie.Name && c.Host == cookie.Host && c.Path == cookie.Path {
|
||||
(*s)[i] = cookie
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
*s = append(*s, cookie)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staticCookieJar) Delete(cookie Cookie) error {
|
||||
for i, c := range *s {
|
||||
if c.Name == cookie.Name && c.Host == cookie.Host && c.Path == cookie.Path {
|
||||
*s = append((*s)[:i], (*s)[i+1:]...)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadCookiesFile loads cookies from a file, in the format of cookies.txt.
|
||||
func LoadCookiesFile(path string) (CookieJar, error) {
|
||||
fp, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func(cl io.Closer) {
|
||||
_ = cl.Close()
|
||||
}(fp)
|
||||
|
||||
var cookies staticCookieJar
|
||||
|
||||
scanner := bufio.NewScanner(fp)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if line[0] == '#' {
|
||||
continue
|
||||
}
|
||||
|
||||
parts := strings.Split(line, "\t")
|
||||
|
||||
if len(parts) < 7 {
|
||||
continue
|
||||
}
|
||||
|
||||
expiry, err := strconv.ParseInt(parts[4], 10, 64)
|
||||
if err != nil {
|
||||
expiry = time.Now().Add(180 * 24 * time.Hour).Unix() // Default expiry
|
||||
}
|
||||
|
||||
cookies = append(cookies, Cookie{
|
||||
Host: parts[0],
|
||||
HttpOnly: strings.ToLower(parts[1]) == "true",
|
||||
Path: parts[2],
|
||||
Secure: strings.ToLower(parts[3]) == "true",
|
||||
Name: parts[5],
|
||||
Expires: time.Unix(expiry, 0),
|
||||
Value: parts[6],
|
||||
})
|
||||
}
|
||||
|
||||
return &cookies, nil
|
||||
}
|
98
document.go
98
document.go
@ -1,25 +1,27 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
type Document interface {
|
||||
io.Closer
|
||||
Node
|
||||
|
||||
URL() string
|
||||
Refresh() error
|
||||
Content() (string, error)
|
||||
Text() (string, error)
|
||||
Screenshot() ([]byte, error)
|
||||
|
||||
Select(selector string) Documents
|
||||
SelectFirst(selector string) Document
|
||||
|
||||
ForEach(selector string, fn func(Document) error) error
|
||||
WaitForNetworkIdle(timeout *time.Duration) error
|
||||
}
|
||||
|
||||
type document struct {
|
||||
node
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
page playwright.Page
|
||||
@ -35,64 +37,62 @@ func newDocument(pw *playwright.Playwright, browser playwright.Browser, page pla
|
||||
}
|
||||
|
||||
root2 := page.Locator("html")
|
||||
return document{
|
||||
|
||||
res := &document{
|
||||
node: node{
|
||||
locator: root2,
|
||||
},
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
page: page,
|
||||
locator: root2,
|
||||
root: root,
|
||||
}, nil
|
||||
}
|
||||
|
||||
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
|
||||
|
||||
return res, nil
|
||||
}
|
||||
func (p document) Close() error {
|
||||
return p.page.Close()
|
||||
func (d *document) Close() error {
|
||||
return d.page.Close()
|
||||
}
|
||||
|
||||
func (p document) Content() (string, error) {
|
||||
return p.locator.TextContent()
|
||||
func (d *document) URL() string {
|
||||
return d.page.URL()
|
||||
}
|
||||
|
||||
func (p document) Text() (string, error) {
|
||||
return p.locator.InnerText()
|
||||
func (d *document) Content() (string, error) {
|
||||
return d.page.Content()
|
||||
}
|
||||
|
||||
func (p document) Screenshot() ([]byte, error) {
|
||||
return p.locator.Screenshot()
|
||||
}
|
||||
|
||||
func (d document) Select(selector string) Documents {
|
||||
|
||||
elements, err := d.locator.Locator(selector).All()
|
||||
func (d *document) Refresh() error {
|
||||
resp, err := d.page.Reload()
|
||||
if err != nil {
|
||||
return nil
|
||||
return fmt.Errorf("failed to reload page: %w", err)
|
||||
}
|
||||
|
||||
res := make(Documents, len(elements))
|
||||
for i, el := range elements {
|
||||
res[i] = document{
|
||||
pw: d.pw,
|
||||
browser: d.browser,
|
||||
page: d.page,
|
||||
locator: el,
|
||||
}
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (d document) SelectFirst(selector string) Document {
|
||||
return d.Select(selector)[0]
|
||||
}
|
||||
|
||||
func (d document) ForEach(selector string, fn func(Document) error) error {
|
||||
|
||||
e := d.Select(selector)
|
||||
|
||||
for _, el := range e {
|
||||
err := fn(el)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if resp.Status() != 200 {
|
||||
return fmt.Errorf("invalid status code: %d", resp.Status())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
||||
|
||||
var f *float64 = nil
|
||||
if timeout == nil {
|
||||
t := 30 * time.Second
|
||||
timeout = &t
|
||||
}
|
||||
|
||||
if timeout != nil {
|
||||
ms := float64(timeout.Milliseconds())
|
||||
f = &ms
|
||||
}
|
||||
|
||||
err := d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||
State: playwright.LoadStateNetworkidle,
|
||||
Timeout: f,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
32
documents.go
32
documents.go
@ -1,32 +0,0 @@
|
||||
package extractor
|
||||
|
||||
type Documents []Document
|
||||
|
||||
func (d Documents) Select(selector string) Documents {
|
||||
var res Documents
|
||||
|
||||
for _, doc := range d {
|
||||
res = append(res, doc.Select(selector)...)
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (d Documents) First() Document {
|
||||
return d[0]
|
||||
}
|
||||
|
||||
func (d Documents) ExtractText() ([]string, error) {
|
||||
var res []string
|
||||
|
||||
for _, doc := range d {
|
||||
text, err := doc.Text()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res = append(res, text)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
@ -1,7 +0,0 @@
|
||||
package extractor
|
||||
|
||||
import "context"
|
||||
|
||||
type Extractor interface {
|
||||
Extract(ctx context.Context, src Source) (Article, error)
|
||||
}
|
1
go.mod
1
go.mod
@ -15,6 +15,7 @@ require (
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||
github.com/go-stack/stack v1.8.1 // indirect
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1 // indirect
|
||||
golang.org/x/net v0.32.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
)
|
||||
|
81
node.go
Normal file
81
node.go
Normal file
@ -0,0 +1,81 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
type Node interface {
|
||||
Content() (string, error)
|
||||
Text() (string, error)
|
||||
Attr(name string) (string, error)
|
||||
Screenshot() ([]byte, error)
|
||||
|
||||
Type(input string) error
|
||||
Click() error
|
||||
|
||||
Select(selector string) Nodes
|
||||
SelectFirst(selector string) Node
|
||||
|
||||
ForEach(selector string, fn func(Node) error) error
|
||||
}
|
||||
|
||||
type node struct {
|
||||
locator playwright.Locator
|
||||
}
|
||||
|
||||
func (n node) Type(input string) error {
|
||||
return n.locator.Type(input)
|
||||
}
|
||||
|
||||
func (n node) Click() error {
|
||||
return n.locator.Click()
|
||||
}
|
||||
|
||||
func (n node) Content() (string, error) {
|
||||
return n.locator.TextContent()
|
||||
}
|
||||
|
||||
func (n node) Text() (string, error) {
|
||||
return n.locator.InnerText()
|
||||
}
|
||||
|
||||
func (n node) Attr(name string) (string, error) {
|
||||
return n.locator.GetAttribute(name)
|
||||
}
|
||||
|
||||
func (n node) Screenshot() ([]byte, error) {
|
||||
return n.locator.Screenshot()
|
||||
}
|
||||
|
||||
func (n node) Select(selector string) Nodes {
|
||||
elements, err := n.locator.Locator(selector).All()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var nodes Nodes
|
||||
for _, element := range elements {
|
||||
nodes = append(nodes, node{locator: element})
|
||||
}
|
||||
|
||||
return nodes
|
||||
}
|
||||
|
||||
func (n node) SelectFirst(selector string) Node {
|
||||
return n.Select(selector).First()
|
||||
}
|
||||
|
||||
func (n node) ForEach(selector string, fn func(Node) error) error {
|
||||
elements, err := n.locator.Locator(selector).All()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, element := range elements {
|
||||
if err := fn(node{locator: element}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
32
nodes.go
Normal file
32
nodes.go
Normal file
@ -0,0 +1,32 @@
|
||||
package extractor
|
||||
|
||||
type Nodes []Node
|
||||
|
||||
func (n Nodes) Select(selector string) Nodes {
|
||||
var res Nodes
|
||||
|
||||
for _, node := range n {
|
||||
res = append(res, node.Select(selector)...)
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (d Nodes) First() Node {
|
||||
return d[0]
|
||||
}
|
||||
|
||||
func (d Nodes) ExtractText() ([]string, error) {
|
||||
var res []string
|
||||
|
||||
for _, doc := range d {
|
||||
text, err := doc.Text()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res = append(res, text)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
@ -25,6 +25,7 @@ type PlayWrightBrowserSelection string
|
||||
|
||||
// Sentinel errors of the Playwright browser; test for them with errors.Is.
var (
	// ErrInvalidBrowserSelection reports an unusable browser selection.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")

	// ErrPageNotFound reports that a page answered with status 404.
	ErrPageNotFound = errors.New("page not found")

	// ErrInvalidStatusCode reports a response status that was not accepted.
	ErrInvalidStatusCode = errors.New("invalid status code")
)
|
||||
|
||||
@ -35,20 +36,22 @@ const (
|
||||
)
|
||||
|
||||
type PlayWrightBrowserOptions struct {
|
||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
|
||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
||||
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
||||
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
||||
|
||||
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
||||
// browser into the cookie jar for each request.
|
||||
CookieJar
|
||||
|
||||
ShowBrowser bool // If false, browser will be headless
|
||||
}
|
||||
|
||||
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
||||
return playwright.OptionalCookie{
|
||||
Name: cookie.Name,
|
||||
Value: cookie.Value,
|
||||
Domain: playwright.String(cookie.Domain),
|
||||
Domain: playwright.String(cookie.Host),
|
||||
Path: playwright.String(cookie.Path),
|
||||
Expires: playwright.Float(float64(cookie.Expires.Unix())),
|
||||
HttpOnly: playwright.Bool(cookie.HttpOnly),
|
||||
@ -59,7 +62,7 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
||||
return Cookie{
|
||||
Name: cookie.Name,
|
||||
Value: cookie.Value,
|
||||
Domain: cookie.Domain,
|
||||
Host: cookie.Domain,
|
||||
Path: cookie.Path,
|
||||
Expires: time.Unix(int64(cookie.Expires), 0),
|
||||
HttpOnly: cookie.HttpOnly,
|
||||
@ -69,7 +72,7 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
||||
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
var thirtySeconds = 30 * time.Second
|
||||
opt := PlayWrightBrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3",
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
||||
Browser: PlayWrightBrowserSelectionFirefox,
|
||||
Timeout: &thirtySeconds,
|
||||
}
|
||||
@ -87,16 +90,23 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
if o.CookieJar != nil {
|
||||
opt.CookieJar = o.CookieJar
|
||||
}
|
||||
}
|
||||
|
||||
err := playwright.Install()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
opt.ShowBrowser = o.ShowBrowser
|
||||
}
|
||||
|
||||
pw, err := playwright.Run()
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
err = playwright.Install()
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pw, err = playwright.Run()
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
@ -116,7 +126,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
}
|
||||
|
||||
browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(true),
|
||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -175,21 +185,26 @@ func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) openPage(_ context.Context, target string) (playwright.Page, error) {
|
||||
func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
|
||||
page, err := b.ctx.NewPage()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts := playwright.PageGotoOptions{
|
||||
pwOpts := playwright.PageGotoOptions{
|
||||
WaitUntil: playwright.WaitUntilStateLoad,
|
||||
}
|
||||
|
||||
if b.timeout > 0 {
|
||||
var ms = float64(b.timeout.Milliseconds())
|
||||
opts.Timeout = &ms
|
||||
pwOpts.Timeout = &ms
|
||||
}
|
||||
resp, err := page.Goto(target, opts)
|
||||
|
||||
if opts.Referer != "" {
|
||||
pwOpts.Referer = playwright.String(opts.Referer)
|
||||
}
|
||||
|
||||
resp, err := page.Goto(target, pwOpts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -197,6 +212,14 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
|
||||
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||
|
||||
if resp.Status() != 200 {
|
||||
time.Sleep(999 * time.Hour * 24)
|
||||
time.Sleep(25 * time.Second)
|
||||
|
||||
_ = page.Close()
|
||||
|
||||
if resp.Status() == 404 {
|
||||
return nil, ErrPageNotFound
|
||||
}
|
||||
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
||||
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
|
||||
}
|
||||
@ -204,13 +227,12 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
|
||||
return page, nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) {
|
||||
func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
|
||||
|
||||
page, err := b.openPage(ctx, url)
|
||||
page, err := b.openPage(ctx, url, opts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer page.Close()
|
||||
|
||||
err = b.updateCookies(ctx, page)
|
||||
if err != nil {
|
||||
|
@ -1,25 +1,26 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"net/url"
|
||||
|
||||
"github.com/go-shiori/go-readability"
|
||||
)
|
||||
|
||||
type Readability struct {
|
||||
Extractor
|
||||
}
|
||||
func Readability(_ context.Context, doc Document) (Article, error) {
|
||||
data, err := doc.Content()
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
var _ Extractor = Readability{}
|
||||
|
||||
func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
|
||||
u, err := url.Parse(src.URL())
|
||||
u, err := url.Parse(doc.URL())
|
||||
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
a, err := readability.FromReader(src.Reader(), u)
|
||||
|
||||
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
||||
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
|
172
sites/archive/archive.go
Normal file
172
sites/archive/archive.go
Normal file
@ -0,0 +1,172 @@
|
||||
package archive
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
// Config holds the settings for talking to an archive service.
type Config struct {
	// Endpoint is the base URL of the archive service. When empty,
	// https://archive.ph is used.
	Endpoint string

	// Timeout bounds a whole Archive call. When nil, one hour is used.
	Timeout *time.Duration
}

// validate returns a copy of c in which every unset field has been replaced
// by its default. The receiver is not modified.
func (c Config) validate() Config {
	if c.Endpoint == "" {
		c.Endpoint = "https://archive.ph"
	}

	if c.Timeout == nil {
		fallback := time.Hour
		c.Timeout = &fallback
	}

	return c
}

// DefaultConfig is the configuration used by the package-level helpers; all
// of its fields are unset, so the defaults apply.
var DefaultConfig Config
|
||||
|
||||
// deferClose closes cl, ignoring any error, and does nothing when cl is nil.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
|
||||
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
c = c.validate()
|
||||
u, err := url.Parse(target)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid url: %w", err)
|
||||
}
|
||||
|
||||
endpoint, err := url.Parse(c.Endpoint)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid endpoint: %w", err)
|
||||
}
|
||||
|
||||
uri := endpoint.JoinPath("/newest")
|
||||
|
||||
uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
|
||||
|
||||
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
|
||||
|
||||
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
|
||||
|
||||
if err != nil {
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
if errors.Is(err, extractor.ErrPageNotFound) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
return DefaultConfig.IsArchived(ctx, b, target)
|
||||
}
|
||||
|
||||
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
c = c.validate()
|
||||
var cancel context.CancelFunc
|
||||
|
||||
if c.Timeout != nil {
|
||||
ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
|
||||
slog.Info("setting timeout", "timeout", *c.Timeout)
|
||||
defer cancel()
|
||||
}
|
||||
u, err := url.Parse(target)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid url: %w", err)
|
||||
}
|
||||
|
||||
endpoint, err := url.Parse(c.Endpoint)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid endpoint: %w", err)
|
||||
}
|
||||
|
||||
doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
|
||||
|
||||
if err != nil {
|
||||
if doc != nil {
|
||||
_ = doc.Close()
|
||||
}
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
|
||||
err = doc.SelectFirst("input[name='url']").Type(u.String())
|
||||
|
||||
if err != nil {
|
||||
_ = doc.Close()
|
||||
return nil, fmt.Errorf("failed to type url: %w", err)
|
||||
}
|
||||
|
||||
err = doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click()
|
||||
|
||||
if err != nil {
|
||||
_ = doc.Close()
|
||||
return nil, fmt.Errorf("failed to click submit: %w", err)
|
||||
}
|
||||
|
||||
// wait for the page to load
|
||||
time.Sleep(5 * time.Second)
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
fmt.Println("context already done before entering the loop:", ctx.Err())
|
||||
return nil, ctx.Err()
|
||||
default:
|
||||
fmt.Println("context not done yet")
|
||||
// Proceed with the loop
|
||||
}
|
||||
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
|
||||
// the way we can tell this is happening is by checking the url of the page periodically
|
||||
// if the page path starts with /wip/ then we are still waiting
|
||||
// also periodically refresh the page just in case
|
||||
|
||||
keepGoing := true
|
||||
for keepGoing {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Info("context done")
|
||||
keepGoing = false
|
||||
|
||||
case <-time.NewTicker(5 * time.Second).C:
|
||||
archivedUrl, err := url.Parse(doc.URL())
|
||||
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
fmt.Println("checking url:", archivedUrl.String())
|
||||
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
|
||||
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
|
||||
keepGoing = false
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return doc, doc.WaitForNetworkIdle(nil)
|
||||
}
|
||||
|
||||
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
return DefaultConfig.Archive(ctx, b, target)
|
||||
}
|
129
sites/archive/cmd/archive/main.go
Normal file
129
sites/archive/cmd/archive/main.go
Normal file
@ -0,0 +1,129 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
)
|
||||
|
||||
type ArchiveFlags []cli.Flag
|
||||
|
||||
var Flags = ArchiveFlags{
|
||||
&cli.StringFlag{
|
||||
Name: "endpoint",
|
||||
Usage: "Archive endpoint to use",
|
||||
DefaultText: "https://archive.ph",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "timeout",
|
||||
Usage: "Timeout for requests",
|
||||
DefaultText: "10s",
|
||||
},
|
||||
}
|
||||
|
||||
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
|
||||
c := archive.DefaultConfig
|
||||
|
||||
if e := cmd.String("endpoint"); e != "" {
|
||||
c.Endpoint = e
|
||||
}
|
||||
|
||||
if t := cmd.String("timeout"); t != "" {
|
||||
d, err := time.ParseDuration(t)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
c.Timeout = &d
|
||||
}
|
||||
|
||||
return c
|
||||
}
|
||||
|
||||
func main() {
|
||||
|
||||
var flags []cli.Flag
|
||||
|
||||
flags = append(flags, browser.Flags...)
|
||||
flags = append(flags, Flags...)
|
||||
|
||||
cli := &cli.Command{
|
||||
Name: "archive",
|
||||
Usage: "Archive a website",
|
||||
Flags: Flags,
|
||||
Action: func(ctx context.Context, cli *cli.Command) error {
|
||||
|
||||
target := cli.Args().First()
|
||||
|
||||
if target == "" {
|
||||
return fmt.Errorf("usage: archive <url>")
|
||||
}
|
||||
|
||||
b, err := browser.FromCommand(ctx, cli)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
doc, err := archive.IsArchived(ctx, b, target)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if doc == nil {
|
||||
fmt.Println("Not archived")
|
||||
|
||||
doc, err = archive.Archive(ctx, b, target)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if doc == nil {
|
||||
return fmt.Errorf("failed to archive")
|
||||
}
|
||||
}
|
||||
|
||||
defer func(doc extractor.Document) {
|
||||
fmt.Println("Closing document", doc.URL())
|
||||
err := doc.Close()
|
||||
if err != nil {
|
||||
fmt.Println("failed to close document", err)
|
||||
}
|
||||
}(doc)
|
||||
|
||||
fmt.Println("Archived at ", doc.URL())
|
||||
|
||||
article, err := extractor.Readability(ctx, doc)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Println("Title:", article.Title)
|
||||
fmt.Println("Byline:", article.Byline)
|
||||
fmt.Println("Site:", article.SiteName)
|
||||
fmt.Println("Published:", article.PublishedTime)
|
||||
fmt.Println("Excerpt:", article.Excerpt)
|
||||
fmt.Println("Length:", article.Length)
|
||||
fmt.Println("Lang:", article.Lang)
|
||||
fmt.Println("Content:", article.Content[:32]+"...")
|
||||
fmt.Println("TextContent:", article.TextContent)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
}
|
60
sites/megamillions/cmd/megamillions.go
Normal file
60
sites/megamillions/cmd/megamillions.go
Normal file
@ -0,0 +1,60 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/megamillions"
|
||||
)
|
||||
|
||||
// MegaMillionsFlags is the list of CLI flags that belong to the megamillions
// command itself (as opposed to the shared browser flags).
type MegaMillionsFlags []cli.Flag
|
||||
|
||||
// Flags is the megamillions-specific flag set; it is currently empty.
var Flags = MegaMillionsFlags{}
|
||||
|
||||
func (f MegaMillionsFlags) ToConfig(_ *cli.Command) megamillions.Config {
|
||||
c := megamillions.DefaultConfig
|
||||
return c
|
||||
}
|
||||
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
|
||||
flags = append(flags, browser.Flags...)
|
||||
flags = append(flags, Flags...)
|
||||
|
||||
cli := &cli.Command{
|
||||
Name: "megamillions",
|
||||
Usage: "Get MegaMillions information",
|
||||
Flags: flags,
|
||||
|
||||
Action: func(ctx context.Context, cli *cli.Command) error {
|
||||
b, err := browser.FromCommand(ctx, cli)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
draw, next, err := Flags.ToConfig(cli).GetCurrent(ctx, b)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Drawing: %+v\n", draw)
|
||||
fmt.Printf("Next Drawing: %+v\n", next)
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
}
|
252
sites/megamillions/megamillions.go
Normal file
252
sites/megamillions/megamillions.go
Normal file
@ -0,0 +1,252 @@
|
||||
package megamillions
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
|
||||
"golang.org/x/text/currency"
|
||||
)
|
||||
|
||||
// Config carries the options for scraping Mega Millions results. It has no
// fields at present.
type Config struct{}
|
||||
|
||||
// DefaultConfig is the Config used by the package-level GetCurrent.
var DefaultConfig = Config{}
|
||||
|
||||
func (c Config) validate() Config {
|
||||
return c
|
||||
}
|
||||
|
||||
// Drawing is the most recent Mega Millions draw as scraped from the site.
type Drawing struct {
	// Date is the draw date, decoded from the page's .NET ticks attribute.
	Date time.Time
	// Numbers holds the five main balls; index 0 is winNum1 through index 4 for winNum5.
	Numbers [5]int
	// MegaBall is the value of the ball carrying the winNumMB class.
	MegaBall int
	// Megaplier is the multiplier with its "X" suffix removed (2 for "2X").
	Megaplier int
}
|
||||
|
||||
type NextDrawing struct {
|
||||
Date string
|
||||
Jackpot currency.Amount
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the resulting error.
// It exists so callers can write `defer deferClose(x)` for best-effort cleanup.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	// Cleanup is best-effort; there is no caller to report a close error to.
	_ = cl.Close()
}
|
||||
|
||||
// netTicksToTime converts a .NET DateTime tick count into a time.Time.
//
// A .NET tick is 100 nanoseconds and ticks are counted from
// 0001-01-01T00:00:00, so the value is first rebased onto the Unix epoch (in
// ticks) and only then converted. Scaling the raw tick count to nanoseconds
// directly would overflow int64 for any present-day date.
func netTicksToTime(t int64) time.Time {
	const (
		// unixEpochTicks is the .NET tick count at 1970-01-01T00:00:00.
		unixEpochTicks = 621355968000000000
		// ticksPerSecond is the number of 100ns ticks in one second.
		ticksPerSecond = 10000000
		// nanosPerTick is the length of one tick in nanoseconds.
		nanosPerTick = 100
	)

	sinceEpoch := t - unixEpochTicks

	// Split into whole seconds and a sub-second remainder so that neither
	// argument can overflow; time.Unix accepts a negative nanosecond part.
	return time.Unix(sinceEpoch/ticksPerSecond, (sinceEpoch%ticksPerSecond)*nanosPerTick)
}
|
||||
|
||||
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
||||
var drawing Drawing
|
||||
|
||||
// the drawdate is stored as a .net ticks value in the data-playdateticks attribute of a
|
||||
// span with the id of "lastestDate"
|
||||
|
||||
date := doc.Select("span#lastestDate")
|
||||
if len(date) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 date, got %d", len(date))
|
||||
}
|
||||
|
||||
txt, err := date[0].Attr("data-playdateticks")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get date: %w", err)
|
||||
}
|
||||
|
||||
ticks, err := strconv.ParseInt(txt, 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse date: %w", err)
|
||||
}
|
||||
|
||||
fmt.Println("ticks", ticks)
|
||||
drawing.Date = netTicksToTime(ticks)
|
||||
|
||||
err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error {
|
||||
classes, err := n.Attr("class")
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
txt, err := n.Text()
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
val, err := strconv.Atoi(txt)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if strings.Contains(classes, "winNum1") {
|
||||
drawing.Numbers[0] = val
|
||||
return nil
|
||||
}
|
||||
|
||||
if strings.Contains(classes, "winNum2") {
|
||||
drawing.Numbers[1] = val
|
||||
return nil
|
||||
}
|
||||
|
||||
if strings.Contains(classes, "winNum3") {
|
||||
drawing.Numbers[2] = val
|
||||
return nil
|
||||
}
|
||||
|
||||
if strings.Contains(classes, "winNum4") {
|
||||
drawing.Numbers[3] = val
|
||||
return nil
|
||||
}
|
||||
|
||||
if strings.Contains(classes, "winNum5") {
|
||||
drawing.Numbers[4] = val
|
||||
return nil
|
||||
}
|
||||
|
||||
if strings.Contains(classes, "winNumMB") {
|
||||
drawing.MegaBall = val
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("unknown li.ball class: %s", classes)
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get numbers: %w", err)
|
||||
}
|
||||
|
||||
megaplier := doc.Select("span.megaplier span.winNumMP")
|
||||
|
||||
if len(megaplier) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 megaplier, got %d", len(megaplier))
|
||||
}
|
||||
|
||||
// megaplier is in the format of "2X" or "3X" etc.
|
||||
|
||||
txt, err = megaplier[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get megaplier: %w", err)
|
||||
}
|
||||
|
||||
val, err := strconv.Atoi(strings.ReplaceAll(strings.ReplaceAll(txt, "X", ""), "x", ""))
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to convert megaplier to int: %w", err)
|
||||
}
|
||||
drawing.Megaplier = val
|
||||
|
||||
return &drawing, nil
|
||||
}
|
||||
|
||||
func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) {
|
||||
var nextDrawing NextDrawing
|
||||
|
||||
date := doc.Select("div.nextEstGroup span.nextDrawDate")
|
||||
if len(date) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 date, got %d", len(date))
|
||||
}
|
||||
|
||||
var err error
|
||||
nextDrawing.Date, err = date[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get date: %w", err)
|
||||
}
|
||||
|
||||
jackpot := doc.Select("div.nextEstGroup span.nextEstVal")
|
||||
|
||||
if len(jackpot) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot))
|
||||
}
|
||||
|
||||
txt, err := jackpot[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get jackpot: %w", err)
|
||||
}
|
||||
|
||||
// jackpot is in the format of "$1.5 billion", "$100 million", or "$200,000" etc
|
||||
|
||||
// make one filter to only get the numeric part of the jackpot
|
||||
|
||||
numericOnly := func(in string) float64 {
|
||||
var out string
|
||||
for _, r := range in {
|
||||
if r >= '0' && r <= '9' {
|
||||
out += string(r)
|
||||
}
|
||||
|
||||
if r == '.' {
|
||||
out += string(r)
|
||||
}
|
||||
}
|
||||
|
||||
val, err := strconv.ParseFloat(out, 64)
|
||||
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return val
|
||||
}
|
||||
|
||||
numeric := numericOnly(txt)
|
||||
|
||||
set := false
|
||||
if strings.Contains(txt, "Billion") {
|
||||
amt := currency.USD.Amount(numeric * 1000000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
} else if strings.Contains(txt, "Million") {
|
||||
amt := currency.USD.Amount(numeric * 1000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
} else {
|
||||
amt := currency.USD.Amount(numeric)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
}
|
||||
|
||||
if !set {
|
||||
return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
|
||||
}
|
||||
|
||||
return &nextDrawing, nil
|
||||
}
|
||||
|
||||
func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
|
||||
c = c.validate()
|
||||
|
||||
doc, err := b.Open(ctx, "https://www.megamillions.com/", extractor.OpenPageOptions{})
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
|
||||
d, err := getDrawing(ctx, doc)
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
nd, err := getNextDrawing(ctx, doc)
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return d, nd, nil
|
||||
}
|
||||
|
||||
func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
|
||||
return DefaultConfig.GetCurrent(ctx, b)
|
||||
}
|
60
sites/powerball/cmd/powerball.go
Normal file
60
sites/powerball/cmd/powerball.go
Normal file
@ -0,0 +1,60 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/powerball"
|
||||
)
|
||||
|
||||
// PowerballFlags is the list of CLI flags that belong to the powerball command
// itself (as opposed to the shared browser flags).
type PowerballFlags []cli.Flag
|
||||
|
||||
// Flags is the powerball-specific flag set; it is currently empty.
var Flags = PowerballFlags{}
|
||||
|
||||
func (f PowerballFlags) ToConfig(_ *cli.Command) powerball.Config {
|
||||
c := powerball.DefaultConfig
|
||||
return c
|
||||
}
|
||||
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
|
||||
flags = append(flags, browser.Flags...)
|
||||
flags = append(flags, Flags...)
|
||||
|
||||
cli := &cli.Command{
|
||||
Name: "powerball",
|
||||
Usage: "Get Powerball information",
|
||||
Flags: flags,
|
||||
|
||||
Action: func(ctx context.Context, cli *cli.Command) error {
|
||||
b, err := browser.FromCommand(ctx, cli)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
draw, next, err := Flags.ToConfig(cli).GetCurrent(ctx, b)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Drawing: %+v\n", draw)
|
||||
fmt.Printf("Next Drawing: %+v\n", next)
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
}
|
216
sites/powerball/powerball.go
Normal file
216
sites/powerball/powerball.go
Normal file
@ -0,0 +1,216 @@
|
||||
package powerball
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
|
||||
"golang.org/x/text/currency"
|
||||
)
|
||||
|
||||
// Config carries the options for scraping Powerball results. It has no fields
// at present.
type Config struct{}
|
||||
|
||||
// DefaultConfig is the Config used by the package-level GetCurrent.
var DefaultConfig = Config{}
|
||||
|
||||
func (c Config) validate() Config {
|
||||
return c
|
||||
}
|
||||
|
||||
// Drawing is the most recent Powerball draw as scraped from the site.
type Drawing struct {
	// Date is the draw date.
	// NOTE(review): getDrawing in this file never assigns it, so it stays at the zero value.
	Date time.Time
	// Numbers holds the five white balls in the order they appear on the page.
	Numbers [5]int
	// PowerBall is the value of the powerball element.
	PowerBall int
	// PowerPlay is the multiplier with its "X" suffix removed (2 for "2X").
	PowerPlay int
}
|
||||
|
||||
type NextDrawing struct {
|
||||
Date string
|
||||
Jackpot currency.Amount
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the resulting error.
// It exists so callers can write `defer deferClose(x)` for best-effort cleanup.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	// Cleanup is best-effort; there is no caller to report a close error to.
	_ = cl.Close()
}
|
||||
|
||||
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
||||
var drawing Drawing
|
||||
|
||||
nums := doc.Select("div.game-ball-group div.white-balls")
|
||||
|
||||
if len(nums) != 5 {
|
||||
return nil, fmt.Errorf("expected 5 white balls, got %d", len(nums))
|
||||
}
|
||||
|
||||
for i, num := range nums {
|
||||
txt, err := num.Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get white ball %d: %w", i, err)
|
||||
}
|
||||
|
||||
val, err := strconv.Atoi(txt)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to convert white ball %d to int: %w", i, err)
|
||||
}
|
||||
drawing.Numbers[i] = val
|
||||
}
|
||||
|
||||
powerball := doc.Select("div.game-ball-group div.powerball")
|
||||
|
||||
if len(powerball) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 powerball, got %d", len(powerball))
|
||||
}
|
||||
|
||||
txt, err := powerball[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get powerball: %w", err)
|
||||
}
|
||||
|
||||
val, err := strconv.Atoi(txt)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to convert powerball to int: %w", err)
|
||||
}
|
||||
|
||||
drawing.PowerBall = val
|
||||
|
||||
powerplay := doc.Select("span.power-play span.multiplier")
|
||||
|
||||
if len(powerplay) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 powerplay, got %d", len(powerplay))
|
||||
}
|
||||
|
||||
// powerplay is in the format of "2X" or "3X" etc.
|
||||
|
||||
txt, err = powerplay[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get powerplay: %w", err)
|
||||
}
|
||||
|
||||
val, err = strconv.Atoi(strings.ReplaceAll(strings.ReplaceAll(txt, "X", ""), "x", ""))
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to convert powerplay to int: %w", err)
|
||||
}
|
||||
drawing.PowerPlay = val
|
||||
|
||||
return &drawing, nil
|
||||
}
|
||||
|
||||
func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) {
|
||||
var nextDrawing NextDrawing
|
||||
|
||||
date := doc.Select("div.next-powerball h5.title-date")
|
||||
|
||||
if len(date) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 date, got %d", len(date))
|
||||
}
|
||||
|
||||
var err error
|
||||
nextDrawing.Date, err = date[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get date: %w", err)
|
||||
}
|
||||
|
||||
jackpot := doc.Select("div.next-powerball div.game-detail-group span.game-jackpot-number")
|
||||
|
||||
if len(jackpot) != 1 {
|
||||
return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot))
|
||||
}
|
||||
|
||||
txt, err := jackpot[0].Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get jackpot: %w", err)
|
||||
}
|
||||
|
||||
// jackpot is in the format of "$1.5 billion", "$100 million", or "$200,000" etc
|
||||
|
||||
// make one filter to only get the numeric part of the jackpot
|
||||
|
||||
numericOnly := func(in string) float64 {
|
||||
var out string
|
||||
for _, r := range in {
|
||||
if r >= '0' && r <= '9' {
|
||||
out += string(r)
|
||||
}
|
||||
|
||||
if r == '.' {
|
||||
out += string(r)
|
||||
}
|
||||
}
|
||||
|
||||
val, err := strconv.ParseFloat(out, 64)
|
||||
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return val
|
||||
}
|
||||
|
||||
numeric := numericOnly(txt)
|
||||
|
||||
set := false
|
||||
if strings.Contains(txt, "Billion") {
|
||||
amt := currency.USD.Amount(numeric * 1000000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
} else if strings.Contains(txt, "Million") {
|
||||
amt := currency.USD.Amount(numeric * 1000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
} else {
|
||||
amt := currency.USD.Amount(numeric)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
}
|
||||
|
||||
if !set {
|
||||
return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
|
||||
}
|
||||
|
||||
return &nextDrawing, nil
|
||||
}
|
||||
|
||||
func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
|
||||
c = c.validate()
|
||||
|
||||
doc, err := b.Open(ctx, "https://www.powerball.com/", extractor.OpenPageOptions{})
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
|
||||
d, err := getDrawing(ctx, doc)
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
nd, err := getNextDrawing(ctx, doc)
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return d, nd, nil
|
||||
}
|
||||
|
||||
func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
|
||||
return DefaultConfig.GetCurrent(ctx, b)
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user