added archive, megamillions, and powerball site logic

This commit is contained in:
Steve Dudenhoeffer 2024-12-23 03:18:50 -05:00
parent 5e924eb3f9
commit 567a9f9212
19 changed files with 1412 additions and 118 deletions

View File

@ -5,8 +5,12 @@ import (
"io" "io"
) )
// OpenPageOptions carries optional per-call settings for Browser.Open.
type OpenPageOptions struct {
	// Referer, when non-empty, is passed along as the referer of the navigation.
	Referer string
}
type Browser interface { type Browser interface {
io.Closer io.Closer
Open(ctx context.Context, url string) (Document, error) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error)
} }

79
cmd/browser/main.go Normal file
View File

@ -0,0 +1,79 @@
package main
import (
"context"
"fmt"
"io"
"os"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
)
// deferClose closes cl and deliberately discards the returned error, so that
// best-effort cleanup can be written as a one-line defer.
func deferClose(cl io.Closer) {
	_ = cl.Close()
}
// main runs the "browser" command: it opens the URL given as the first
// positional argument, runs the readability extraction on the page and prints
// the resulting article fields. Any error from the command aborts with a panic.
func main() {
	run := func(ctx context.Context, c *cli.Command) error {
		target := c.Args().First()
		if target == "" {
			return fmt.Errorf("no url specified")
		}

		b, err := browser.FromCommand(ctx, c)
		if err != nil {
			return err
		}
		defer deferClose(b)

		// now open the user specified url
		doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
		if err != nil {
			return err
		}
		defer deferClose(doc)

		article, err := extractor.Readability(ctx, doc)
		if err != nil {
			return err
		}

		// Only a short preview of the content is printed.
		preview := article.Content
		if len(preview) > 32 {
			preview = preview[:32] + "..."
		}

		fmt.Println("Title:", article.Title)
		fmt.Println("Byline:", article.Byline)
		fmt.Println("Site:", article.SiteName)
		fmt.Println("Published:", article.PublishedTime)
		fmt.Println("Excerpt:", article.Excerpt)
		fmt.Println("Length:", article.Length)
		fmt.Println("Lang:", article.Lang)
		fmt.Println("Content:", preview)
		fmt.Println("TextContent:", article.TextContent)
		return nil
	}

	app := &cli.Command{
		Name:   "browser",
		Flags:  browser.Flags,
		Usage:  "<url>",
		Action: run,
	}

	if err := app.Run(context.Background(), os.Args); err != nil {
		panic(err)
	}
}

View File

@ -0,0 +1,76 @@
package browser
import (
"context"
"time"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// BrowserFlags is the list of command-line flags that configure the browser.
type BrowserFlags []cli.Flag

// Flags holds the browser flags that FromCommand later reads back by name.
//
// NOTE(review): every entry sets DefaultText but no Value — presumably
// DefaultText only affects help output; confirm against the urfave/cli docs.
// FromCommand ignores empty string values, so an unset flag leaves the
// corresponding option at its zero value.
var Flags = BrowserFlags{
	&cli.StringFlag{
		Name:        "user-agent",
		Aliases:     []string{"ua"},
		Usage:       "User-Agent to use for requests",
		DefaultText: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
	},
	&cli.StringFlag{
		Name:        "timeout",
		Aliases:     []string{"t"},
		Usage:       "Timeout for requests",
		DefaultText: "30s",
	},
	&cli.StringFlag{
		Name:        "browser",
		Aliases:     []string{"b"},
		Usage:       "Browser to use, one of: chromium, firefox, webkit",
		DefaultText: "firefox",
	},
	&cli.StringFlag{
		Name:        "cookies-file",
		Aliases:     []string{"c"},
		Usage:       "cookies.txt file to load cookies from",
		DefaultText: "",
	},
	&cli.BoolFlag{
		Name:        "visible",
		Usage:       "If set, the browser will be visible, if not set, the browser will be headless",
		DefaultText: "false",
	},
}
// FromCommand builds a browser from the flag values stored on cmd.
//
// String flags that are empty leave the matching option at its zero value.
// An unparsable "timeout" or an unreadable "cookies-file" is returned as an
// error before any browser is created.
func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error) {
	opts := extractor.PlayWrightBrowserOptions{
		UserAgent:   cmd.String("user-agent"),
		Browser:     extractor.PlayWrightBrowserSelection(cmd.String("browser")),
		ShowBrowser: cmd.Bool("visible"),
	}

	if raw := cmd.String("timeout"); raw != "" {
		timeout, err := time.ParseDuration(raw)
		if err != nil {
			return nil, err
		}
		opts.Timeout = &timeout
	}

	if file := cmd.String("cookies-file"); file != "" {
		jar, err := extractor.LoadCookiesFile(file)
		if err != nil {
			return nil, err
		}
		opts.CookieJar = jar
	}

	return extractor.NewPlayWrightBrowser(opts)
}

View File

@ -1,20 +1,58 @@
package extractor package extractor
import ( import (
"net/url"
"strings"
"time" "time"
) )
type Cookie struct { type Cookie struct {
Name string Host string
Value string
Domain string
Path string Path string
Expires time.Time Expires time.Time
Secure bool Secure bool
HttpOnly bool HttpOnly bool
Name string
Value string
} }
// IsTargetMatch reports whether the cookie should be sent to the target URL.
//
// The host matches when the cookie host equals the target host (ignoring case
// and any port). A cookie host with a leading dot also matches the bare
// domain and every subdomain of it.
//
// The path matches when the cookie path is empty or "/", equals the target
// path, or is a prefix of the target path that ends on a "/" boundary, so
// "/foo" matches "/foo/bar" but not "/foosball".
//
// An error is returned only when target cannot be parsed as a URL.
func (c Cookie) IsTargetMatch(target string) (bool, error) {
	u, err := url.Parse(target)
	if err != nil {
		return false, err
	}

	if !cookieHostMatches(c.Host, u) {
		return false, nil
	}

	return cookiePathMatches(c.Path, u.Path), nil
}

// cookieHostMatches applies the host rule of IsTargetMatch to a parsed URL.
func cookieHostMatches(cookieHost string, u *url.URL) bool {
	cookieHost = strings.ToLower(cookieHost)
	host := strings.ToLower(u.Hostname())

	// u.Host still carries the port; keep accepting an exact match on it so
	// entries written as "host:port" continue to work.
	if cookieHost == host || cookieHost == strings.ToLower(u.Host) {
		return true
	}

	if strings.HasPrefix(cookieHost, ".") {
		// ".example.com" covers "example.com" itself and "*.example.com".
		return host == cookieHost[1:] || strings.HasSuffix(host, cookieHost)
	}

	return false
}

// cookiePathMatches applies the path rule of IsTargetMatch.
func cookiePathMatches(cookiePath, targetPath string) bool {
	if cookiePath == "" || cookiePath == "/" {
		return true
	}

	if targetPath == "" {
		// A URL without a path is a request for the root.
		targetPath = "/"
	}

	if !strings.HasPrefix(targetPath, cookiePath) {
		return false
	}

	if len(targetPath) == len(cookiePath) || strings.HasSuffix(cookiePath, "/") {
		return true
	}

	// The prefix must end on a segment boundary: "/foo" must not match "/foosball".
	return targetPath[len(cookiePath)] == '/'
}
type CookieJar interface { type CookieJar interface {
GetAll() ([]Cookie, error) GetAll() ([]Cookie, error)
Get(url string) ([]Cookie, error)
Set(cookie Cookie) error Set(cookie Cookie) error
Delete(cookie Cookie) error Delete(cookie Cookie) error
} }
@ -29,6 +67,10 @@ func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) {
return r.Jar.GetAll() return r.Jar.GetAll()
} }
// Get returns whatever the wrapped jar returns for the given URL; the call is
// passed straight through to r.Jar.
func (r ReadOnlyCookieJar) Get(url string) ([]Cookie, error) {
	return r.Jar.Get(url)
}
func (r ReadOnlyCookieJar) Set(_ Cookie) error { func (r ReadOnlyCookieJar) Set(_ Cookie) error {
return nil return nil
} }

106
cookies_txt.go Normal file
View File

@ -0,0 +1,106 @@
package extractor
import (
"bufio"
"io"
"os"
"strconv"
"strings"
"time"
)
// staticCookieJar is a cookie jar backed by a plain slice of cookies.
type staticCookieJar []Cookie

// GetAll will return all cookies in the jar.
// The returned slice is the jar's own slice, not a copy.
func (s *staticCookieJar) GetAll() ([]Cookie, error) {
	return *s, nil
}
// Get narrows the jar down to the cookies that Cookie.IsTargetMatch accepts
// for target, keeping their order. The first error reported by a match check
// stops the scan and is returned together with a nil slice.
func (s *staticCookieJar) Get(target string) ([]Cookie, error) {
	var matched []Cookie

	for _, candidate := range *s {
		ok, err := candidate.IsTargetMatch(target)
		if err != nil {
			return nil, err
		}
		if ok {
			matched = append(matched, candidate)
		}
	}

	return matched, nil
}
// Set stores cookie in the jar. A cookie with the same name, host and path
// that is already present is overwritten in place; otherwise the cookie is
// appended. It always returns nil.
func (s *staticCookieJar) Set(cookie Cookie) error {
	jar := *s

	for idx := range jar {
		existing := jar[idx]
		if existing.Name != cookie.Name || existing.Host != cookie.Host || existing.Path != cookie.Path {
			continue
		}
		jar[idx] = cookie
		return nil
	}

	*s = append(jar, cookie)
	return nil
}
// Delete removes the first cookie whose name, host and path equal those of
// cookie. Deleting a cookie that is not in the jar is a no-op. It always
// returns nil.
func (s *staticCookieJar) Delete(cookie Cookie) error {
	jar := *s

	for idx := range jar {
		candidate := jar[idx]
		if candidate.Name != cookie.Name || candidate.Host != cookie.Host || candidate.Path != cookie.Path {
			continue
		}
		*s = append(jar[:idx], jar[idx+1:]...)
		return nil
	}

	return nil
}
// LoadCookiesFile loads cookies from a file, in the format of cookies.txt.
//
// Each cookie line holds at least seven tab-separated fields, read here as
// host, flag, path, secure, expiry (unix seconds), name and value. Blank
// lines, comment lines starting with '#', and lines with fewer than seven
// fields are skipped. A line prefixed with "#HttpOnly_" is not a comment: the
// prefix is stripped and the cookie is marked HttpOnly. An expiry that does
// not parse as an integer falls back to 180 days from now.
//
// It returns an error when the file cannot be opened or read.
func LoadCookiesFile(path string) (CookieJar, error) {
	fp, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer func(cl io.Closer) {
		_ = cl.Close()
	}(fp)

	const httpOnlyPrefix = "#HttpOnly_"

	var cookies staticCookieJar

	scanner := bufio.NewScanner(fp)
	for scanner.Scan() {
		line := scanner.Text()

		// Must be checked before the comment test below, otherwise these
		// cookies would be thrown away as comments.
		httpOnly := strings.HasPrefix(line, httpOnlyPrefix)
		if httpOnly {
			line = strings.TrimPrefix(line, httpOnlyPrefix)
		}

		if line == "" || line[0] == '#' {
			continue
		}

		parts := strings.Split(line, "\t")
		if len(parts) < 7 {
			continue
		}

		expiry, err := strconv.ParseInt(parts[4], 10, 64)
		if err != nil {
			expiry = time.Now().Add(180 * 24 * time.Hour).Unix() // Default expiry
		}

		cookies = append(cookies, Cookie{
			Host: parts[0],
			// NOTE(review): the second column is kept as an HttpOnly source for
			// backward compatibility; in the Netscape format it is presumably the
			// include-subdomains flag — confirm before relying on it.
			HttpOnly: httpOnly || strings.EqualFold(parts[1], "true"),
			Path:     parts[2],
			Secure:   strings.EqualFold(parts[3], "true"),
			Name:     parts[5],
			Expires:  time.Unix(expiry, 0),
			Value:    parts[6],
		})
	}

	// Scan returns false on read errors as well as on EOF; surface the former
	// instead of returning a silently truncated jar.
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	return &cookies, nil
}

View File

@ -1,25 +1,27 @@
package extractor package extractor
import ( import (
"fmt"
"io" "io"
"log/slog"
"time"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type Document interface { type Document interface {
io.Closer io.Closer
Node
URL() string
Refresh() error
Content() (string, error) Content() (string, error)
Text() (string, error)
Screenshot() ([]byte, error)
Select(selector string) Documents WaitForNetworkIdle(timeout *time.Duration) error
SelectFirst(selector string) Document
ForEach(selector string, fn func(Document) error) error
} }
type document struct { type document struct {
node
pw *playwright.Playwright pw *playwright.Playwright
browser playwright.Browser browser playwright.Browser
page playwright.Page page playwright.Page
@ -35,64 +37,62 @@ func newDocument(pw *playwright.Playwright, browser playwright.Browser, page pla
} }
root2 := page.Locator("html") root2 := page.Locator("html")
return document{
res := &document{
node: node{
locator: root2,
},
pw: pw, pw: pw,
browser: browser, browser: browser,
page: page, page: page,
locator: root2,
root: root, root: root,
}, nil }
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
return res, nil
} }
func (p document) Close() error { func (d *document) Close() error {
return p.page.Close() return d.page.Close()
} }
func (p document) Content() (string, error) { func (d *document) URL() string {
return p.locator.TextContent() return d.page.URL()
} }
func (p document) Text() (string, error) { func (d *document) Content() (string, error) {
return p.locator.InnerText() return d.page.Content()
} }
func (p document) Screenshot() ([]byte, error) { func (d *document) Refresh() error {
return p.locator.Screenshot() resp, err := d.page.Reload()
}
func (d document) Select(selector string) Documents {
elements, err := d.locator.Locator(selector).All()
if err != nil { if err != nil {
return nil return fmt.Errorf("failed to reload page: %w", err)
} }
res := make(Documents, len(elements)) if resp.Status() != 200 {
for i, el := range elements { return fmt.Errorf("invalid status code: %d", resp.Status())
res[i] = document{
pw: d.pw,
browser: d.browser,
page: d.page,
locator: el,
}
}
return res
}
func (d document) SelectFirst(selector string) Document {
return d.Select(selector)[0]
}
func (d document) ForEach(selector string, fn func(Document) error) error {
e := d.Select(selector)
for _, el := range e {
err := fn(el)
if err != nil {
return err
}
} }
return nil return nil
} }
// WaitForNetworkIdle blocks until the page reaches the "networkidle" load
// state. A nil timeout means 30 seconds; otherwise the given duration is used.
// The duration is handed to playwright in milliseconds.
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
	wait := 30 * time.Second
	if timeout != nil {
		wait = *timeout
	}

	millis := float64(wait.Milliseconds())

	return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
		State:   playwright.LoadStateNetworkidle,
		Timeout: &millis,
	})
}

View File

@ -1,32 +0,0 @@
package extractor
// Documents is a list of Document values that can be queried as a group.
type Documents []Document

// Select runs selector against every document in the list and returns all
// matches concatenated in list order.
func (d Documents) Select(selector string) Documents {
	var res Documents
	for _, doc := range d {
		res = append(res, doc.Select(selector)...)
	}
	return res
}

// First returns the first document. It panics when the list is empty.
func (d Documents) First() Document {
	return d[0]
}

// ExtractText collects the Text of every document in order. The first error
// stops the walk and is returned with a nil slice.
func (d Documents) ExtractText() ([]string, error) {
	var res []string
	for _, doc := range d {
		text, err := doc.Text()
		if err != nil {
			return nil, err
		}
		res = append(res, text)
	}
	return res, nil
}

View File

@ -1,7 +0,0 @@
package extractor
import "context"
// Extractor turns a Source into an Article.
type Extractor interface {
	// Extract produces the Article for src, or an error when that is not possible.
	Extract(ctx context.Context, src Source) (Article, error)
}

1
go.mod
View File

@ -15,6 +15,7 @@ require (
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/go-stack/stack v1.8.1 // indirect github.com/go-stack/stack v1.8.1 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/urfave/cli/v3 v3.0.0-beta1 // indirect
golang.org/x/net v0.32.0 // indirect golang.org/x/net v0.32.0 // indirect
golang.org/x/text v0.21.0 // indirect golang.org/x/text v0.21.0 // indirect
) )

81
node.go Normal file
View File

@ -0,0 +1,81 @@
package extractor
import (
"github.com/playwright-community/playwright-go"
)
// Node is a handle on a part of a page that can be read, interacted with and
// searched for nested nodes.
type Node interface {
	// Content returns the node's text content.
	Content() (string, error)
	// Text returns the node's inner text.
	Text() (string, error)
	// Attr returns the value of the attribute called name.
	Attr(name string) (string, error)
	// Screenshot returns an image of the node as raw bytes.
	Screenshot() ([]byte, error)

	// Type types input into the node.
	Type(input string) error
	// Click clicks the node.
	Click() error

	// Select returns every nested node matching selector.
	Select(selector string) Nodes
	// SelectFirst returns the first nested node matching selector.
	SelectFirst(selector string) Node

	// ForEach calls fn for every nested node matching selector and stops at
	// the first error.
	ForEach(selector string, fn func(Node) error) error
}
// node implements Node on top of a playwright locator; every method below
// forwards to the locator method of the corresponding purpose.
type node struct {
	locator playwright.Locator
}

// Type forwards input to the locator's Type.
func (n node) Type(input string) error {
	return n.locator.Type(input)
}

// Click forwards to the locator's Click.
func (n node) Click() error {
	return n.locator.Click()
}

// Content returns the locator's TextContent.
func (n node) Content() (string, error) {
	return n.locator.TextContent()
}

// Text returns the locator's InnerText.
func (n node) Text() (string, error) {
	return n.locator.InnerText()
}

// Attr returns the locator's attribute called name.
func (n node) Attr(name string) (string, error) {
	return n.locator.GetAttribute(name)
}

// Screenshot returns the locator's Screenshot bytes.
func (n node) Screenshot() ([]byte, error) {
	return n.locator.Screenshot()
}
// Select resolves selector relative to this node and wraps every match in a
// node. It returns nil when the lookup fails or nothing matches; lookup errors
// are not reported to the caller.
func (n node) Select(selector string) Nodes {
	matches, err := n.locator.Locator(selector).All()
	if err != nil {
		return nil
	}

	var found Nodes
	for idx := range matches {
		found = append(found, node{locator: matches[idx]})
	}

	return found
}
// SelectFirst returns the first node matching selector, i.e. the result of
// calling First on n.Select(selector).
func (n node) SelectFirst(selector string) Node {
	return n.Select(selector).First()
}
// ForEach resolves selector relative to this node and invokes fn once per
// match, in order. Unlike Select, a failed lookup is returned as an error.
// Iteration ends at the first error returned by fn, which is passed through.
func (n node) ForEach(selector string, fn func(Node) error) error {
	matches, lookupErr := n.locator.Locator(selector).All()
	if lookupErr != nil {
		return lookupErr
	}

	for idx := range matches {
		child := node{locator: matches[idx]}
		if err := fn(child); err != nil {
			return err
		}
	}

	return nil
}

32
nodes.go Normal file
View File

@ -0,0 +1,32 @@
package extractor
// Nodes is an ordered list of Node values that can be queried as a group.
type Nodes []Node

// Select applies selector to each node in turn and returns the combined
// matches, preserving list order. An empty list yields a nil result.
func (n Nodes) Select(selector string) Nodes {
	var matched Nodes
	for idx := range n {
		matched = append(matched, n[idx].Select(selector)...)
	}
	return matched
}
// First returns the first node of the list, or nil when the list is empty.
//
// Returning nil instead of indexing blindly lets callers detect "no match"
// with a nil check rather than crashing with an index-out-of-range panic.
func (n Nodes) First() Node {
	if len(n) == 0 {
		return nil
	}
	return n[0]
}
// ExtractText gathers the Text of every node, in list order. If any node
// fails, the walk stops and that error is returned with a nil slice. An empty
// list yields a nil slice and no error.
func (d Nodes) ExtractText() ([]string, error) {
	var texts []string

	for idx := range d {
		text, err := d[idx].Text()
		if err != nil {
			return nil, err
		}
		texts = append(texts, text)
	}

	return texts, nil
}

View File

@ -25,6 +25,7 @@ type PlayWrightBrowserSelection string
var ( var (
ErrInvalidBrowserSelection = errors.New("invalid browser selection") ErrInvalidBrowserSelection = errors.New("invalid browser selection")
ErrPageNotFound = errors.New("page not found")
ErrInvalidStatusCode = errors.New("invalid status code") ErrInvalidStatusCode = errors.New("invalid status code")
) )
@ -35,20 +36,22 @@ const (
) )
type PlayWrightBrowserOptions struct { type PlayWrightBrowserOptions struct {
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3" UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
Browser PlayWrightBrowserSelection // If unset defaults to Firefox. Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the // CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
// browser into the cookie jar for each request. // browser into the cookie jar for each request.
CookieJar CookieJar
ShowBrowser bool // If false, browser will be headless
} }
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie { func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
return playwright.OptionalCookie{ return playwright.OptionalCookie{
Name: cookie.Name, Name: cookie.Name,
Value: cookie.Value, Value: cookie.Value,
Domain: playwright.String(cookie.Domain), Domain: playwright.String(cookie.Host),
Path: playwright.String(cookie.Path), Path: playwright.String(cookie.Path),
Expires: playwright.Float(float64(cookie.Expires.Unix())), Expires: playwright.Float(float64(cookie.Expires.Unix())),
HttpOnly: playwright.Bool(cookie.HttpOnly), HttpOnly: playwright.Bool(cookie.HttpOnly),
@ -59,7 +62,7 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
return Cookie{ return Cookie{
Name: cookie.Name, Name: cookie.Name,
Value: cookie.Value, Value: cookie.Value,
Domain: cookie.Domain, Host: cookie.Domain,
Path: cookie.Path, Path: cookie.Path,
Expires: time.Unix(int64(cookie.Expires), 0), Expires: time.Unix(int64(cookie.Expires), 0),
HttpOnly: cookie.HttpOnly, HttpOnly: cookie.HttpOnly,
@ -69,7 +72,7 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) { func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
var thirtySeconds = 30 * time.Second var thirtySeconds = 30 * time.Second
opt := PlayWrightBrowserOptions{ opt := PlayWrightBrowserOptions{
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3", UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
Browser: PlayWrightBrowserSelectionFirefox, Browser: PlayWrightBrowserSelectionFirefox,
Timeout: &thirtySeconds, Timeout: &thirtySeconds,
} }
@ -87,16 +90,23 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
if o.CookieJar != nil { if o.CookieJar != nil {
opt.CookieJar = o.CookieJar opt.CookieJar = o.CookieJar
} }
} opt.ShowBrowser = o.ShowBrowser
err := playwright.Install()
if err != nil {
return nil, err
} }
pw, err := playwright.Run() pw, err := playwright.Run()
if err != nil { if err != nil {
return nil, err err = playwright.Install()
if err != nil {
return nil, err
}
pw, err = playwright.Run()
if err != nil {
return nil, err
}
} }
var bt playwright.BrowserType var bt playwright.BrowserType
@ -116,7 +126,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
} }
browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{ browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
Headless: playwright.Bool(true), Headless: playwright.Bool(!opt.ShowBrowser),
}) })
if err != nil { if err != nil {
return nil, err return nil, err
@ -175,21 +185,26 @@ func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page
return nil return nil
} }
func (b playWrightBrowser) openPage(_ context.Context, target string) (playwright.Page, error) { func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) {
page, err := b.ctx.NewPage() page, err := b.ctx.NewPage()
if err != nil { if err != nil {
return nil, err return nil, err
} }
opts := playwright.PageGotoOptions{ pwOpts := playwright.PageGotoOptions{
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
} }
if b.timeout > 0 { if b.timeout > 0 {
var ms = float64(b.timeout.Milliseconds()) var ms = float64(b.timeout.Milliseconds())
opts.Timeout = &ms pwOpts.Timeout = &ms
} }
resp, err := page.Goto(target, opts)
if opts.Referer != "" {
pwOpts.Referer = playwright.String(opts.Referer)
}
resp, err := page.Goto(target, pwOpts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -197,6 +212,14 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request()) slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
if resp.Status() != 200 { if resp.Status() != 200 {
time.Sleep(999 * time.Hour * 24)
time.Sleep(25 * time.Second)
_ = page.Close()
if resp.Status() == 404 {
return nil, ErrPageNotFound
}
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request()) slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status()) return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status())
} }
@ -204,13 +227,12 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
return page, nil return page, nil
} }
func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) { func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) {
page, err := b.openPage(ctx, url) page, err := b.openPage(ctx, url, opts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer page.Close()
err = b.updateCookies(ctx, page) err = b.updateCookies(ctx, page)
if err != nil { if err != nil {

View File

@ -1,25 +1,26 @@
package extractor package extractor
import ( import (
"bytes"
"context" "context"
"net/url" "net/url"
"github.com/go-shiori/go-readability" "github.com/go-shiori/go-readability"
) )
type Readability struct { func Readability(_ context.Context, doc Document) (Article, error) {
Extractor data, err := doc.Content()
} if err != nil {
return Article{}, err
}
var _ Extractor = Readability{} u, err := url.Parse(doc.URL())
func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
u, err := url.Parse(src.URL())
if err != nil { if err != nil {
return Article{}, err return Article{}, err
} }
a, err := readability.FromReader(src.Reader(), u)
a, err := readability.FromReader(bytes.NewBufferString(data), u)
if err != nil { if err != nil {
return Article{}, err return Article{}, err

172
sites/archive/archive.go Normal file
View File

@ -0,0 +1,172 @@
package archive
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/url"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Config holds the settings used when talking to an archive service. The zero
// value is usable: validate fills in defaults for unset fields.
type Config struct {
	// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
	Endpoint string
	// Timeout will, if set, cancel any Archive call after this duration.
	// If nil, the default timeout of 1 hour will be used.
	Timeout *time.Duration // Timeout for the request, defaults to 1 hour
}
// validate returns a copy of the config in which unset fields carry their
// defaults: a nil Timeout becomes one hour and an empty Endpoint becomes
// "https://archive.ph". The receiver itself is not modified.
func (c Config) validate() Config {
	out := c

	if out.Timeout == nil {
		fallback := time.Hour
		out.Timeout = &fallback
	}

	if out.Endpoint == "" {
		out.Endpoint = "https://archive.ph"
	}

	return out
}
// DefaultConfig is the zero Config used by the package-level IsArchived and
// Archive helpers.
var DefaultConfig = Config{}

// deferClose closes cl when it is non-nil and discards the returned error.
func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}
// IsArchived looks up the newest archived copy of target.
//
// It opens "<endpoint>/newest/<target>" in b and returns the resulting
// document, which the caller must close. When the browser reports
// extractor.ErrPageNotFound the target is considered not archived and
// (nil, nil) is returned. Any other failure is returned as an error.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	parsedTarget, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}

	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	lookup := endpoint.JoinPath("/newest")
	lookup.Path = strings.TrimSuffix(lookup.Path, "/") + "/" + parsedTarget.String()
	lookupURL := lookup.String()

	slog.Info("checking if url is archived", "url", lookupURL, "config", c, "endpoint", endpoint)

	doc, err := b.Open(ctx, lookupURL, extractor.OpenPageOptions{})
	if err == nil {
		return doc, nil
	}

	// Open failed: release whatever it may have handed back before reporting.
	if doc != nil {
		_ = doc.Close()
	}

	if errors.Is(err, extractor.ErrPageNotFound) {
		return nil, nil
	}

	return nil, fmt.Errorf("failed to open url: %w", err)
}
// IsArchived calls Config.IsArchived on DefaultConfig.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to the archive endpoint and waits for the archived
// page to appear.
//
// It opens the endpoint in b, types target into the "url" input, clicks the
// submit button and then polls the page URL every five seconds. Polling ends
// once the page has left the endpoint host, or its path no longer starts with
// "/wip/" or "/submit". Finally it waits for the network to go idle.
//
// The whole call is bounded by c.Timeout (one hour by default). On success the
// returned document is owned by the caller, who must close it. On any failure,
// including cancellation or timeout of ctx, the document is closed and
// (nil, err) is returned.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	if c.Timeout != nil {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
		slog.Info("setting timeout", "timeout", *c.Timeout)
		defer cancel()
	}

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}

	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
	if err != nil {
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	// From here on the document is ours: close it on every path that does not
	// hand it to the caller.
	handedOver := false
	defer func() {
		if !handedOver {
			_ = doc.Close()
		}
	}()

	// Look the elements up with Select so a missing element becomes an error
	// instead of a panic.
	input := doc.Select("input[name='url']")
	if len(input) == 0 {
		return nil, fmt.Errorf("failed to type url: url input not found")
	}
	if err := input[0].Type(u.String()); err != nil {
		return nil, fmt.Errorf("failed to type url: %w", err)
	}

	submit := doc.Select("form#submiturl input[type=\"submit\"]")
	if len(submit) == 0 {
		return nil, fmt.Errorf("failed to click submit: submit button not found")
	}
	if err := submit[0].Click(); err != nil {
		return nil, fmt.Errorf("failed to click submit: %w", err)
	}

	// Give the page a moment to load, without ignoring cancellation.
	select {
	case <-ctx.Done():
		slog.Info("context done before polling started", "err", ctx.Err())
		return nil, ctx.Err()
	case <-time.After(5 * time.Second):
	}

	// Now we are waiting for the archive to finish and redirect us to the
	// archived page. We detect that by checking the page URL periodically:
	// while it is on the endpoint host under /wip/ or /submit we keep waiting.
	// A single ticker is reused and stopped on return.
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for pending := true; pending; {
		select {
		case <-ctx.Done():
			slog.Info("context done")
			return nil, ctx.Err()
		case <-ticker.C:
		}

		archivedURL, err := url.Parse(doc.URL())
		if err != nil {
			continue
		}

		slog.Info("checking url", "url", archivedURL.String())

		onEndpoint := archivedURL.Hostname() == endpoint.Hostname()
		inProgress := strings.HasPrefix(archivedURL.Path, "/wip/") || strings.HasPrefix(archivedURL.Path, "/submit")
		pending = onEndpoint && inProgress
	}

	if err := doc.WaitForNetworkIdle(nil); err != nil {
		return nil, err
	}

	handedOver = true
	return doc, nil
}
// Archive calls Config.Archive on DefaultConfig.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}

View File

@ -0,0 +1,129 @@
package main
import (
"context"
"fmt"
"os"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
"github.com/urfave/cli/v3"
)
// ArchiveFlags is the list of command-line flags specific to the archive command.
type ArchiveFlags []cli.Flag

// Flags holds the archive flags that ToConfig reads back by name.
//
// NOTE(review): the "timeout" DefaultText says "10s", but ToConfig leaves
// Timeout unset when the flag is empty and archive.Config then falls back to
// one hour — confirm which default is intended. browser.Flags also defines a
// flag named "timeout".
var Flags = ArchiveFlags{
	&cli.StringFlag{
		Name:        "endpoint",
		Usage:       "Archive endpoint to use",
		DefaultText: "https://archive.ph",
	},
	&cli.StringFlag{
		Name:        "timeout",
		Usage:       "Timeout for requests",
		DefaultText: "10s",
	},
}
// ToConfig derives an archive.Config from the flag values on cmd, starting
// from archive.DefaultConfig. Empty "endpoint" and "timeout" values leave the
// corresponding field untouched. It panics when "timeout" is set but is not a
// valid duration.
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
	cfg := archive.DefaultConfig

	if endpoint := cmd.String("endpoint"); endpoint != "" {
		cfg.Endpoint = endpoint
	}

	raw := cmd.String("timeout")
	if raw == "" {
		return cfg
	}

	timeout, err := time.ParseDuration(raw)
	if err != nil {
		panic(err)
	}
	cfg.Timeout = &timeout

	return cfg
}
// main runs the "archive" command: it checks whether the URL given as the
// first positional argument is already archived, archives it if not, and then
// prints the readability extraction of the archived page. Any error from the
// command aborts with a panic.
func main() {
	var flags []cli.Flag

	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	// NOTE(review): only Flags is registered below, so the combined list above
	// is unused and the browser flags are unavailable to this command. Both
	// lists define a flag named "timeout"; confirm how urfave/cli treats the
	// duplicate before switching to the combined list.
	cmd := &cli.Command{
		Name:  "archive",
		Usage: "Archive a website",
		Flags: Flags,
		Action: func(ctx context.Context, c *cli.Command) error {
			target := c.Args().First()
			if target == "" {
				return fmt.Errorf("usage: archive <url>")
			}

			b, err := browser.FromCommand(ctx, c)
			if err != nil {
				return err
			}
			// Release the browser on every exit path; this runs after the
			// document defer below.
			defer func() {
				_ = b.Close()
			}()

			// Honour --endpoint and --timeout instead of always using the
			// package defaults.
			cfg := Flags.ToConfig(ctx, c)

			doc, err := cfg.IsArchived(ctx, b, target)
			if err != nil {
				return err
			}

			if doc == nil {
				fmt.Println("Not archived")
				doc, err = cfg.Archive(ctx, b, target)
				if err != nil {
					return err
				}
				if doc == nil {
					return fmt.Errorf("failed to archive")
				}
			}

			defer func(doc extractor.Document) {
				fmt.Println("Closing document", doc.URL())
				err := doc.Close()
				if err != nil {
					fmt.Println("failed to close document", err)
				}
			}(doc)

			fmt.Println("Archived at ", doc.URL())

			article, err := extractor.Readability(ctx, doc)
			if err != nil {
				return err
			}

			// Guard the preview: slicing [:32] unconditionally panics on
			// content shorter than 32 bytes.
			preview := article.Content
			if len(preview) > 32 {
				preview = preview[:32]
			}

			fmt.Println("Title:", article.Title)
			fmt.Println("Byline:", article.Byline)
			fmt.Println("Site:", article.SiteName)
			fmt.Println("Published:", article.PublishedTime)
			fmt.Println("Excerpt:", article.Excerpt)
			fmt.Println("Length:", article.Length)
			fmt.Println("Lang:", article.Lang)
			fmt.Println("Content:", preview+"...")
			fmt.Println("TextContent:", article.TextContent)
			return nil
		},
	}

	err := cmd.Run(context.Background(), os.Args)
	if err != nil {
		panic(err)
	}
}

View File

@ -0,0 +1,60 @@
package main
import (
"context"
"fmt"
"os"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/megamillions"
)
// MegaMillionsFlags is the list of command-line flags specific to the
// megamillions command.
type MegaMillionsFlags []cli.Flag

// Flags is currently empty: the command defines no flags of its own.
var Flags = MegaMillionsFlags{}

// ToConfig returns megamillions.DefaultConfig; the command is not consulted.
func (f MegaMillionsFlags) ToConfig(_ *cli.Command) megamillions.Config {
	c := megamillions.DefaultConfig
	return c
}
// main runs the "megamillions" command: it creates a browser from the
// browser flags, fetches the latest and next drawing and prints both. Any
// error from the command aborts with a panic.
func main() {
	var flags []cli.Flag

	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	cmd := &cli.Command{
		Name:  "megamillions",
		Usage: "Get MegaMillions information",
		Flags: flags,
		Action: func(ctx context.Context, c *cli.Command) error {
			b, err := browser.FromCommand(ctx, c)
			if err != nil {
				return err
			}
			// Release the browser on every exit path.
			defer func() {
				_ = b.Close()
			}()

			draw, next, err := Flags.ToConfig(c).GetCurrent(ctx, b)
			if err != nil {
				return err
			}

			fmt.Printf("Drawing: %+v\n", draw)
			fmt.Printf("Next Drawing: %+v\n", next)

			return nil
		},
	}

	err := cmd.Run(context.Background(), os.Args)
	if err != nil {
		panic(err)
	}
}

View File

@ -0,0 +1,252 @@
package megamillions
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"golang.org/x/text/currency"
)
// Config holds the settings for the Mega Millions scraper; it currently has
// no fields.
type Config struct{}

// DefaultConfig is the Config used by the package-level GetCurrent helper.
var DefaultConfig = Config{}

// validate returns the config unchanged; there is nothing to default yet.
func (c Config) validate() Config {
	return c
}
// Drawing is the result of a single drawing as scraped from the site.
type Drawing struct {
	// Date is derived from the tick count in the page's data-playdateticks attribute.
	Date time.Time
	// Numbers holds the balls marked winNum1 through winNum5, in that order.
	Numbers [5]int
	// MegaBall is the ball marked winNumMB.
	MegaBall int
	// Megaplier is the multiplier shown on the page, e.g. 2 for "2X".
	Megaplier int
}

// NextDrawing describes the upcoming drawing as scraped from the site.
type NextDrawing struct {
	// Date is the draw date text exactly as shown on the page.
	Date string
	// Jackpot is the estimated jackpot as a USD amount.
	Jackpot currency.Amount
}
// deferClose closes cl when it is non-nil and discards the returned error.
func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}
// netTicksToTime converts a .NET tick count to a time.Time.
//
// A tick is 100 nanoseconds and ticks are counted from 0001-01-01 00:00:00,
// so the Unix epoch sits at tick 621355968000000000. The offset is subtracted
// in ticks and the result is split into seconds and nanoseconds, which avoids
// overflowing int64 (multiplying a present-day tick count by 100 does not fit
// in 64 bits).
//
// NOTE(review): the result is expressed in UTC so that the calendar fields
// encoded in the ticks are preserved — confirm this is the intended zone.
func netTicksToTime(t int64) time.Time {
	const (
		unixEpochTicks = 621355968000000000
		ticksPerSecond = 10000000
		nanosPerTick   = 100
	)

	sinceEpoch := t - unixEpochTicks

	return time.Unix(sinceEpoch/ticksPerSecond, (sinceEpoch%ticksPerSecond)*nanosPerTick).UTC()
}
// getDrawing scrapes the most recent drawing from doc.
//
// The draw date is read from the data-playdateticks attribute of
// span#lastestDate, the balls from "ul.numbers li.ball" (identified by their
// winNum1..winNum5 / winNumMB class), and the megaplier from
// "span.megaplier span.winNumMP". Any missing element or unparsable value is
// returned as an error.
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
	var drawing Drawing

	// the drawdate is stored as a .net ticks value in the data-playdateticks attribute of a
	// span with the id of "lastestDate"
	date := doc.Select("span#lastestDate")
	if len(date) != 1 {
		return nil, fmt.Errorf("expected 1 date, got %d", len(date))
	}

	txt, err := date[0].Attr("data-playdateticks")
	if err != nil {
		return nil, fmt.Errorf("failed to get date: %w", err)
	}

	// Scraped values may carry surrounding whitespace, which the strconv
	// parsers reject, so trim before parsing.
	ticks, err := strconv.ParseInt(strings.TrimSpace(txt), 10, 64)
	if err != nil {
		return nil, fmt.Errorf("failed to parse date: %w", err)
	}

	drawing.Date = netTicksToTime(ticks)

	err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error {
		classes, err := n.Attr("class")
		if err != nil {
			return err
		}

		txt, err := n.Text()
		if err != nil {
			return err
		}

		val, err := strconv.Atoi(strings.TrimSpace(txt))
		if err != nil {
			return err
		}

		// The class tells us which slot the ball belongs to.
		switch {
		case strings.Contains(classes, "winNum1"):
			drawing.Numbers[0] = val
		case strings.Contains(classes, "winNum2"):
			drawing.Numbers[1] = val
		case strings.Contains(classes, "winNum3"):
			drawing.Numbers[2] = val
		case strings.Contains(classes, "winNum4"):
			drawing.Numbers[3] = val
		case strings.Contains(classes, "winNum5"):
			drawing.Numbers[4] = val
		case strings.Contains(classes, "winNumMB"):
			drawing.MegaBall = val
		default:
			return fmt.Errorf("unknown li.ball class: %s", classes)
		}

		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to get numbers: %w", err)
	}

	megaplier := doc.Select("span.megaplier span.winNumMP")
	if len(megaplier) != 1 {
		return nil, fmt.Errorf("expected 1 megaplier, got %d", len(megaplier))
	}

	// megaplier is in the format of "2X" or "3X" etc.
	txt, err = megaplier[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get megaplier: %w", err)
	}

	val, err := strconv.Atoi(strings.NewReplacer("X", "", "x", "").Replace(strings.TrimSpace(txt)))
	if err != nil {
		return nil, fmt.Errorf("failed to convert megaplier to int: %w", err)
	}

	drawing.Megaplier = val

	return &drawing, nil
}
// getNextDrawing scrapes the upcoming drawing from doc.
//
// The date is the text of "div.nextEstGroup span.nextDrawDate" and the
// jackpot is parsed from "div.nextEstGroup span.nextEstVal". A missing
// element or a jackpot without a numeric part is returned as an error.
func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) {
	var nextDrawing NextDrawing

	date := doc.Select("div.nextEstGroup span.nextDrawDate")
	if len(date) != 1 {
		return nil, fmt.Errorf("expected 1 date, got %d", len(date))
	}

	var err error
	nextDrawing.Date, err = date[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get date: %w", err)
	}

	jackpot := doc.Select("div.nextEstGroup span.nextEstVal")
	if len(jackpot) != 1 {
		return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot))
	}

	txt, err := jackpot[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get jackpot: %w", err)
	}

	dollars, err := parseJackpotDollars(txt)
	if err != nil {
		return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
	}

	nextDrawing.Jackpot = currency.USD.Amount(dollars)

	return &nextDrawing, nil
}

// parseJackpotDollars converts jackpot text such as "$1.5 billion",
// "$100 million" or "$200,000" into a dollar value. Only digits and '.' are
// kept for the number; the words "billion" and "million" are recognised
// regardless of case. It fails when no number can be parsed from the text.
func parseJackpotDollars(txt string) (float64, error) {
	var digits strings.Builder
	for _, r := range txt {
		if (r >= '0' && r <= '9') || r == '.' {
			digits.WriteRune(r)
		}
	}

	val, err := strconv.ParseFloat(digits.String(), 64)
	if err != nil {
		return 0, fmt.Errorf("no numeric value in %q: %w", txt, err)
	}

	lower := strings.ToLower(txt)
	switch {
	case strings.Contains(lower, "billion"):
		val *= 1000000000
	case strings.Contains(lower, "million"):
		val *= 1000000
	}

	return val, nil
}
// GetCurrent opens https://www.megamillions.com/ with b and scrapes both the
// most recent drawing and the next drawing from the loaded page. The page is
// closed before returning. The first error encountered is returned as-is,
// with both results nil.
func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
	c = c.validate()

	page, openErr := b.Open(ctx, "https://www.megamillions.com/", extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, nil, openErr
	}
	defer deferClose(page)

	latest, drawErr := getDrawing(ctx, page)
	if drawErr != nil {
		return nil, nil, drawErr
	}

	upcoming, nextErr := getNextDrawing(ctx, page)
	if nextErr != nil {
		return nil, nil, nextErr
	}

	return latest, upcoming, nil
}
// GetCurrent fetches the latest and next drawings using DefaultConfig.
func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
	cfg := DefaultConfig
	return cfg.GetCurrent(ctx, b)
}

View File

@ -0,0 +1,60 @@
package main
import (
"context"
"fmt"
"os"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/powerball"
)
// PowerballFlags is the set of CLI flags specific to the powerball command.
type PowerballFlags []cli.Flag
// Flags holds the powerball-specific flags registered on the command; it is
// currently empty.
var Flags = PowerballFlags{}
// ToConfig produces the powerball.Config for a parsed command. No
// powerball-specific flags exist yet, so the command is ignored and the
// result is always a copy of powerball.DefaultConfig.
func (f PowerballFlags) ToConfig(_ *cli.Command) powerball.Config {
	return powerball.DefaultConfig
}
// main builds a single CLI command from the shared browser flags plus the
// powerball-specific flags, fetches the current Powerball drawing and the
// next drawing's details, and prints both to standard output. Any error
// returned by the command causes a panic.
func main() {
	var flags []cli.Flag
	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	// Named cmd (not cli) so the variable does not shadow the cli package.
	cmd := &cli.Command{
		Name:  "powerball",
		Usage: "Get Powerball information",
		Flags: flags,
		Action: func(ctx context.Context, cmd *cli.Command) error {
			b, err := browser.FromCommand(ctx, cmd)
			if err != nil {
				return err
			}
			// Release the browser once the action finishes. The close error
			// is ignored because nothing useful can be done with it here.
			defer func() { _ = b.Close() }()

			draw, next, err := Flags.ToConfig(cmd).GetCurrent(ctx, b)
			if err != nil {
				return err
			}

			fmt.Printf("Drawing: %+v\n", draw)
			fmt.Printf("Next Drawing: %+v\n", next)
			return nil
		},
	}

	err := cmd.Run(context.Background(), os.Args)
	if err != nil {
		panic(err)
	}
}

View File

@ -0,0 +1,216 @@
package powerball
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"golang.org/x/text/currency"
)
// Config holds options for fetching Powerball data. It currently has no
// fields.
type Config struct {
}
// DefaultConfig is the configuration used by the package-level GetCurrent.
var DefaultConfig = Config{}
// validate returns the configuration to use for a request. Config has no
// fields to check yet, so c is returned unchanged.
func (c Config) validate() Config {
return c
}
// Drawing holds the results scraped for a single Powerball drawing.
type Drawing struct {
// Date is the drawing date.
// NOTE(review): getDrawing in this file never assigns Date, so it is left
// as the zero time.Time — confirm whether it should be scraped.
Date time.Time
// Numbers holds the five white balls in the order the page lists them.
Numbers [5]int
// PowerBall is the Powerball number.
PowerBall int
// PowerPlay is the Power Play multiplier with the "X" removed (2 for "2X").
PowerPlay int
}
// NextDrawing describes the upcoming drawing as scraped from the page.
type NextDrawing struct {
// Date is the raw text of the page's next-drawing date element; it is not
// parsed into a time value.
Date string
// Jackpot is the estimated jackpot expressed as a USD amount.
Jackpot currency.Amount
}
// deferClose closes cl and discards the resulting error, making it
// convenient to use in a defer statement. A nil cl is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// getDrawing scrapes the most recent drawing's numbers from doc.
//
// It expects exactly five white-ball elements, one Powerball element and one
// Power Play multiplier element (rendered like "2X"). Each element's text is
// trimmed of surrounding whitespace before being converted to an integer.
// Drawing.Date is not populated here and is left at its zero value.
//
// An error is returned when an element count does not match, when an
// element's text cannot be read, or when the text is not an integer.
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
	var drawing Drawing

	// Text scraped from markup may carry surrounding whitespace, which
	// strconv.Atoi rejects, so trim before converting.
	atoi := func(s string) (int, error) {
		return strconv.Atoi(strings.TrimSpace(s))
	}

	nums := doc.Select("div.game-ball-group div.white-balls")
	if len(nums) != 5 {
		return nil, fmt.Errorf("expected 5 white balls, got %d", len(nums))
	}

	for i, num := range nums {
		txt, err := num.Text()
		if err != nil {
			return nil, fmt.Errorf("failed to get white ball %d: %w", i, err)
		}

		val, err := atoi(txt)
		if err != nil {
			return nil, fmt.Errorf("failed to convert white ball %d to int: %w", i, err)
		}

		drawing.Numbers[i] = val
	}

	powerball := doc.Select("div.game-ball-group div.powerball")
	if len(powerball) != 1 {
		return nil, fmt.Errorf("expected 1 powerball, got %d", len(powerball))
	}

	txt, err := powerball[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get powerball: %w", err)
	}

	val, err := atoi(txt)
	if err != nil {
		return nil, fmt.Errorf("failed to convert powerball to int: %w", err)
	}
	drawing.PowerBall = val

	powerplay := doc.Select("span.power-play span.multiplier")
	if len(powerplay) != 1 {
		return nil, fmt.Errorf("expected 1 powerplay, got %d", len(powerplay))
	}

	// powerplay is in the format of "2X" or "3X" etc.; strip the marker in
	// either case before converting.
	txt, err = powerplay[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get powerplay: %w", err)
	}

	val, err = atoi(strings.ReplaceAll(strings.ReplaceAll(txt, "X", ""), "x", ""))
	if err != nil {
		return nil, fmt.Errorf("failed to convert powerplay to int: %w", err)
	}
	drawing.PowerPlay = val

	return &drawing, nil
}
// getNextDrawing scrapes the upcoming drawing's date and estimated jackpot
// from doc.
//
// The date is returned exactly as the page renders it; it is not parsed.
// The jackpot text is expected to look like "$1.5 Billion", "$100 Million"
// or "$200,000": the numeric part is extracted, scaled by the magnitude word
// (matched case-insensitively) and stored as a USD amount.
//
// An error is returned when either element is missing or duplicated, when
// its text cannot be read, or when the jackpot text holds no parsable number.
func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) {
	var nextDrawing NextDrawing

	date := doc.Select("div.next-powerball h5.title-date")
	if len(date) != 1 {
		return nil, fmt.Errorf("expected 1 date, got %d", len(date))
	}

	var err error
	nextDrawing.Date, err = date[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get date: %w", err)
	}

	jackpot := doc.Select("div.next-powerball div.game-detail-group span.game-jackpot-number")
	if len(jackpot) != 1 {
		return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot))
	}

	txt, err := jackpot[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get jackpot: %w", err)
	}

	// Keep only the digits and decimal point; this drops the currency
	// symbol, thousands separators and the magnitude word in one pass.
	digits := strings.Map(func(r rune) rune {
		if (r >= '0' && r <= '9') || r == '.' {
			return r
		}
		return -1
	}, txt)

	// Surface unparsable text as an error instead of silently reporting a
	// $0 jackpot.
	numeric, err := strconv.ParseFloat(digits, 64)
	if err != nil {
		return nil, fmt.Errorf("failed to convert jackpot %q to currency: %w", txt, err)
	}

	// Match the magnitude word regardless of case so "billion", "Billion"
	// and "BILLION" all scale the amount.
	lowered := strings.ToLower(txt)
	switch {
	case strings.Contains(lowered, "billion"):
		numeric *= 1e9
	case strings.Contains(lowered, "million"):
		numeric *= 1e6
	}

	nextDrawing.Jackpot = currency.USD.Amount(numeric)
	return &nextDrawing, nil
}
// GetCurrent opens https://www.powerball.com/ with b and scrapes both the
// most recent drawing and the next drawing from the loaded page. The page is
// closed before returning. The first error encountered is returned as-is,
// with both results nil.
func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
	c = c.validate()

	page, openErr := b.Open(ctx, "https://www.powerball.com/", extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, nil, openErr
	}
	defer deferClose(page)

	latest, drawErr := getDrawing(ctx, page)
	if drawErr != nil {
		return nil, nil, drawErr
	}

	upcoming, nextErr := getNextDrawing(ctx, page)
	if nextErr != nil {
		return nil, nil, nextErr
	}

	return latest, upcoming, nil
}
// GetCurrent fetches the latest and next drawings using DefaultConfig.
func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
	cfg := DefaultConfig
	return cfg.GetCurrent(ctx, b)
}