added archive, megamillions, and powerball site logic

This commit is contained in:
2024-12-23 03:18:50 -05:00
parent 5e924eb3f9
commit 567a9f9212
19 changed files with 1412 additions and 118 deletions

172
sites/archive/archive.go Normal file
View File

@@ -0,0 +1,172 @@
package archive
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/url"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Config holds the settings used by the archive helpers in this package.
// Empty fields are replaced with defaults by validate before each call.
type Config struct {
// Endpoint is the archive endpoint to use. If empty, https://archive.ph will be used.
Endpoint string
// Timeout will, if set, cancel any Archive call after this duration.
// If nil, the default timeout of 1 hour will be used.
Timeout *time.Duration // Timeout for the request, defaults to 1 hour
}
// validate returns a copy of the config with defaults filled in: an empty
// Endpoint becomes https://archive.ph and a nil Timeout becomes one hour.
// The receiver is a value, so the caller's Config is never modified.
func (c Config) validate() Config {
	out := c
	if out.Endpoint == "" {
		out.Endpoint = "https://archive.ph"
	}
	if out.Timeout == nil {
		fallback := time.Hour
		out.Timeout = &fallback
	}
	return out
}
// DefaultConfig is the zero-value Config used by the package-level IsArchived
// and Archive helpers; its empty fields are filled in by validate on each call.
var DefaultConfig = Config{}
// deferClose closes cl when it is non-nil. The Close error is deliberately
// discarded so the helper can be used directly in a defer statement.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// IsArchived looks up the newest archived snapshot of target by opening
// <endpoint>/newest/<target> in the browser.
//
// It returns the opened document when the page loads. When the browser reports
// extractor.ErrPageNotFound it returns (nil, nil), meaning "not archived". Any
// other failure is returned as an error. On success the document is returned
// open; closing it is left to the caller.
//
// NOTE(review): the target is appended to the URL path verbatim, so a query
// string in target will be percent-escaped by url.URL.String — confirm the
// endpoint accepts that form.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	parsedTarget, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}

	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	lookup := endpoint.JoinPath("/newest")
	lookup.Path = strings.TrimSuffix(lookup.Path, "/") + "/" + parsedTarget.String()
	lookupURL := lookup.String()

	slog.Info("checking if url is archived", "url", lookupURL, "config", c, "endpoint", endpoint)

	doc, err := b.Open(ctx, lookupURL, extractor.OpenPageOptions{})
	if err == nil {
		return doc, nil
	}

	// Open may hand back a document alongside an error; release it first.
	if doc != nil {
		_ = doc.Close()
	}
	if errors.Is(err, extractor.ErrPageNotFound) {
		return nil, nil
	}
	return nil, fmt.Errorf("failed to open url: %w", err)
}
// IsArchived is a convenience wrapper that calls DefaultConfig.IsArchived
// with the given context, browser and target.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to the configured archive endpoint and waits for the
// snapshot to be produced.
//
// It opens the endpoint, types target into the submission form, clicks submit,
// and then polls the document URL every five seconds. The archive is treated
// as finished once the document has left the endpoint host, or its path no
// longer starts with /wip/ or /submit. The whole call is bounded by c.Timeout
// (one hour by default, see validate).
//
// On completion the open document is returned together with the result of
// WaitForNetworkIdle; closing it is left to the caller. If the context is
// cancelled or the timeout expires before the archive finishes, the document
// is closed and the context error is returned.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	const pollInterval = 5 * time.Second

	c = c.validate()

	if c.Timeout != nil {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
		defer cancel()
		slog.Info("setting timeout", "timeout", *c.Timeout)
	}

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}

	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
	if err != nil {
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	if err := doc.SelectFirst("input[name='url']").Type(u.String()); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url: %w", err)
	}

	if err := doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click(); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit: %w", err)
	}

	// Give the submission time to navigate before polling, but stop waiting
	// as soon as the context is cancelled.
	select {
	case <-ctx.Done():
		_ = doc.Close()
		return nil, ctx.Err()
	case <-time.After(pollInterval):
	}

	// One ticker for the whole wait; creating a ticker per iteration would
	// leave every one of them running.
	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	// The endpoint redirects to the archived page when it is done. While the
	// path still starts with /wip/ or /submit on the endpoint host, the
	// archive is still in progress.
	for {
		select {
		case <-ctx.Done():
			slog.Info("context done")
			_ = doc.Close()
			return nil, ctx.Err()
		case <-ticker.C:
		}

		current, err := url.Parse(doc.URL())
		if err != nil {
			// An unparsable URL is treated as transient; try again next tick.
			continue
		}
		slog.Debug("checking url", "url", current.String())

		onEndpoint := current.Hostname() == endpoint.Hostname()
		pending := strings.HasPrefix(current.Path, "/wip/") || strings.HasPrefix(current.Path, "/submit")
		if !onEndpoint || !pending {
			return doc, doc.WaitForNetworkIdle(nil)
		}
	}
}
// Archive is a convenience wrapper that calls DefaultConfig.Archive with the
// given context, browser and target.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
return DefaultConfig.Archive(ctx, b, target)
}

View File

@@ -0,0 +1,129 @@
package main
import (
"context"
"fmt"
"os"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
"github.com/urfave/cli/v3"
)
// ArchiveFlags is the list of CLI flags that configure the archive package.
type ArchiveFlags []cli.Flag
// Flags are the archive-specific command line flags. Neither flag sets a
// Value, so an omitted flag reads as "" and ToConfig leaves the corresponding
// archive.Config field at its default. DefaultText only controls what the help
// output shows, so it must match the defaults the archive package applies.
var Flags = ArchiveFlags{
	&cli.StringFlag{
		Name:        "endpoint",
		Usage:       "Archive endpoint to use",
		DefaultText: "https://archive.ph",
	},
	&cli.StringFlag{
		Name:  "timeout",
		Usage: "Timeout for requests",
		// The archive package falls back to one hour when no timeout is set.
		DefaultText: "1h",
	},
}
// ToConfig builds an archive.Config from the parsed command. It starts from
// archive.DefaultConfig and overrides Endpoint and Timeout when the
// corresponding flag is non-empty. It panics if the timeout flag is not a
// valid duration string.
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
	cfg := archive.DefaultConfig

	endpoint := cmd.String("endpoint")
	if endpoint != "" {
		cfg.Endpoint = endpoint
	}

	raw := cmd.String("timeout")
	if raw == "" {
		return cfg
	}

	timeout, err := time.ParseDuration(raw)
	if err != nil {
		panic(err)
	}
	cfg.Timeout = &timeout
	return cfg
}
// main runs the "archive" command: it looks up the newest archived copy of the
// URL given as the first argument, submits the URL for archiving when none
// exists, and prints the readability extraction of the resulting page.
func main() {
	var flags []cli.Flag
	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	app := &cli.Command{
		Name:  "archive",
		Usage: "Archive a website",
		// Register the combined list so the browser flags are accepted too,
		// not only the archive-specific ones.
		Flags: flags,
		Action: func(ctx context.Context, cmd *cli.Command) error {
			target := cmd.Args().First()
			if target == "" {
				return fmt.Errorf("usage: archive <url>")
			}

			b, err := browser.FromCommand(ctx, cmd)
			if err != nil {
				return err
			}

			// Honor --endpoint and --timeout instead of the package defaults.
			cfg := Flags.ToConfig(ctx, cmd)

			doc, err := cfg.IsArchived(ctx, b, target)
			if err != nil {
				return err
			}

			if doc == nil {
				fmt.Println("Not archived")

				doc, err = cfg.Archive(ctx, b, target)
				if err != nil {
					return err
				}
				if doc == nil {
					return fmt.Errorf("failed to archive")
				}
			}

			defer func(doc extractor.Document) {
				fmt.Println("Closing document", doc.URL())
				if err := doc.Close(); err != nil {
					fmt.Println("failed to close document", err)
				}
			}(doc)

			fmt.Println("Archived at ", doc.URL())

			article, err := extractor.Readability(ctx, doc)
			if err != nil {
				return err
			}

			// Only preview the content; guard the slice so short content
			// does not panic.
			preview := article.Content
			if len(preview) > 32 {
				preview = preview[:32]
			}

			fmt.Println("Title:", article.Title)
			fmt.Println("Byline:", article.Byline)
			fmt.Println("Site:", article.SiteName)
			fmt.Println("Published:", article.PublishedTime)
			fmt.Println("Excerpt:", article.Excerpt)
			fmt.Println("Length:", article.Length)
			fmt.Println("Lang:", article.Lang)
			fmt.Println("Content:", preview+"...")
			fmt.Println("TextContent:", article.TextContent)
			return nil
		},
	}

	// Report failures as a normal CLI error rather than a panic stack trace.
	if err := app.Run(context.Background(), os.Args); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

View File

@@ -0,0 +1,60 @@
package main
import (
"context"
"fmt"
"os"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/megamillions"
)
// MegaMillionsFlags is the list of CLI flags specific to the megamillions command.
type MegaMillionsFlags []cli.Flag
// Flags is the megamillions-specific flag set; it is currently empty.
var Flags = MegaMillionsFlags{}
// ToConfig returns the megamillions.Config for the parsed command. No flags
// are consulted; the result is a copy of megamillions.DefaultConfig.
func (f MegaMillionsFlags) ToConfig(_ *cli.Command) megamillions.Config {
	return megamillions.DefaultConfig
}
// main runs the "megamillions" command: it builds a browser from the command
// line flags, fetches the current and next drawing, and prints both. Any error
// from running the command causes a panic.
func main() {
	var flags []cli.Flag
	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	action := func(ctx context.Context, cmd *cli.Command) error {
		b, err := browser.FromCommand(ctx, cmd)
		if err != nil {
			return err
		}

		draw, next, err := Flags.ToConfig(cmd).GetCurrent(ctx, b)
		if err != nil {
			return err
		}

		fmt.Printf("Drawing: %+v\n", draw)
		fmt.Printf("Next Drawing: %+v\n", next)
		return nil
	}

	app := &cli.Command{
		Name:   "megamillions",
		Usage:  "Get MegaMillions information",
		Flags:  flags,
		Action: action,
	}

	if err := app.Run(context.Background(), os.Args); err != nil {
		panic(err)
	}
}

View File

@@ -0,0 +1,252 @@
package megamillions
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"golang.org/x/text/currency"
)
// Config configures the Mega Millions scraper. It currently has no fields.
type Config struct{}
// DefaultConfig is the Config used by the package-level GetCurrent helper.
var DefaultConfig = Config{}
// validate returns the config unchanged; Config has no fields to default.
func (c Config) validate() Config {
return c
}
// Drawing is the result of a Mega Millions drawing as scraped by getDrawing.
type Drawing struct {
// Date is the draw date, converted from the page's .NET tick count.
Date time.Time
// Numbers holds the five main numbers, indexed by the page's winNum1..winNum5 classes.
Numbers [5]int
// MegaBall is the number read from the element with the winNumMB class.
MegaBall int
// Megaplier is the multiplier parsed from text such as "2X".
Megaplier int
}
// NextDrawing describes the upcoming drawing as scraped by getNextDrawing.
type NextDrawing struct {
// Date is the next draw date exactly as the page displays it (unparsed text).
Date string
// Jackpot is the estimated jackpot as a USD amount.
Jackpot currency.Amount
}
// deferClose closes cl when it is non-nil and discards the Close error, so it
// can be used directly in a defer statement.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// netTicksToTime converts a .NET DateTime tick count (100-nanosecond intervals
// since 0001-01-01T00:00:00) into a time.Time.
//
// The Unix epoch offset is subtracted while the value is still in ticks, and
// the result is split into seconds and nanoseconds: multiplying a present-day
// tick count by 100 would overflow int64, and the offset is itself a tick
// count rather than a nanosecond duration.
func netTicksToTime(t int64) time.Time {
	const (
		ticksPerSecond = 10_000_000
		// unixEpochTicks is the tick count at 1970-01-01T00:00:00.
		unixEpochTicks = 621_355_968_000_000_000
	)

	sinceEpoch := t - unixEpochTicks
	return time.Unix(sinceEpoch/ticksPerSecond, (sinceEpoch%ticksPerSecond)*100)
}
// getDrawing extracts the most recent drawing from an opened Mega Millions
// page.
//
// The draw date comes from the data-playdateticks attribute (a .NET tick
// count) of span#lastestDate. The balls come from "ul.numbers li.ball"
// elements, each identified by a winNum1..winNum5 or winNumMB class. The
// megaplier comes from "span.megaplier span.winNumMP", whose text looks like
// "2X".
//
// An error is returned when an expected element is missing or duplicated, when
// a value is not numeric, or when a ball carries none of the known classes.
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
	var drawing Drawing

	// NOTE(review): "lastestDate" is spelled this way in the selector,
	// presumably to match the page's own id — verify before "correcting" it.
	date := doc.Select("span#lastestDate")
	if len(date) != 1 {
		return nil, fmt.Errorf("expected 1 date, got %d", len(date))
	}

	txt, err := date[0].Attr("data-playdateticks")
	if err != nil {
		return nil, fmt.Errorf("failed to get date: %w", err)
	}

	ticks, err := strconv.ParseInt(strings.TrimSpace(txt), 10, 64)
	if err != nil {
		return nil, fmt.Errorf("failed to parse date: %w", err)
	}
	drawing.Date = netTicksToTime(ticks)

	err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error {
		classes, err := n.Attr("class")
		if err != nil {
			return err
		}

		txt, err := n.Text()
		if err != nil {
			return err
		}

		// Element text may carry surrounding whitespace.
		val, err := strconv.Atoi(strings.TrimSpace(txt))
		if err != nil {
			return err
		}

		switch {
		case strings.Contains(classes, "winNum1"):
			drawing.Numbers[0] = val
		case strings.Contains(classes, "winNum2"):
			drawing.Numbers[1] = val
		case strings.Contains(classes, "winNum3"):
			drawing.Numbers[2] = val
		case strings.Contains(classes, "winNum4"):
			drawing.Numbers[3] = val
		case strings.Contains(classes, "winNum5"):
			drawing.Numbers[4] = val
		case strings.Contains(classes, "winNumMB"):
			drawing.MegaBall = val
		default:
			return fmt.Errorf("unknown li.ball class: %s", classes)
		}
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to get numbers: %w", err)
	}

	megaplier := doc.Select("span.megaplier span.winNumMP")
	if len(megaplier) != 1 {
		return nil, fmt.Errorf("expected 1 megaplier, got %d", len(megaplier))
	}

	// megaplier is in the format of "2X" or "3X" etc.
	txt, err = megaplier[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get megaplier: %w", err)
	}

	multiplier := strings.TrimSpace(strings.NewReplacer("X", "", "x", "").Replace(txt))
	val, err := strconv.Atoi(multiplier)
	if err != nil {
		return nil, fmt.Errorf("failed to convert megaplier to int: %w", err)
	}
	drawing.Megaplier = val

	return &drawing, nil
}
// getNextDrawing extracts the upcoming drawing from an opened Mega Millions
// page.
//
// The date is the text of "div.nextEstGroup span.nextDrawDate", stored as-is.
// The jackpot is the text of "div.nextEstGroup span.nextEstVal", converted to
// a USD amount by parseJackpotAmount.
//
// An error is returned when either element is missing or duplicated, when its
// text cannot be read, or when the jackpot text holds no parsable number.
func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) {
	var nextDrawing NextDrawing

	date := doc.Select("div.nextEstGroup span.nextDrawDate")
	if len(date) != 1 {
		return nil, fmt.Errorf("expected 1 date, got %d", len(date))
	}

	var err error
	nextDrawing.Date, err = date[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get date: %w", err)
	}

	jackpot := doc.Select("div.nextEstGroup span.nextEstVal")
	if len(jackpot) != 1 {
		return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot))
	}

	txt, err := jackpot[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get jackpot: %w", err)
	}

	dollars, err := parseJackpotAmount(txt)
	if err != nil {
		return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
	}
	nextDrawing.Jackpot = currency.USD.Amount(dollars)

	return &nextDrawing, nil
}

// parseJackpotAmount converts jackpot text such as "$1.5 Billion",
// "$100 million" or "$200,000" into a dollar amount. The "billion" and
// "million" suffixes are matched case-insensitively. Text without a parsable
// number is an error rather than a silent zero.
func parseJackpotAmount(txt string) (float64, error) {
	// Keep only digits and decimal points; this drops "$", "," and words.
	var digits strings.Builder
	for _, r := range txt {
		if (r >= '0' && r <= '9') || r == '.' {
			digits.WriteRune(r)
		}
	}

	// Drop a trailing period so text ending a sentence still parses.
	numeric, err := strconv.ParseFloat(strings.TrimRight(digits.String(), "."), 64)
	if err != nil {
		return 0, fmt.Errorf("no numeric amount in %q: %w", txt, err)
	}

	lower := strings.ToLower(txt)
	switch {
	case strings.Contains(lower, "billion"):
		return numeric * 1_000_000_000, nil
	case strings.Contains(lower, "million"):
		return numeric * 1_000_000, nil
	default:
		return numeric, nil
	}
}
// GetCurrent opens https://www.megamillions.com/ in the given browser and
// returns the most recent drawing together with the next drawing. The page is
// closed before returning. Any error from opening the page or from either
// extraction step is returned unchanged, with both results nil.
func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
	c = c.validate()

	page, openErr := b.Open(ctx, "https://www.megamillions.com/", extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, nil, openErr
	}
	defer deferClose(page)

	latest, err := getDrawing(ctx, page)
	if err != nil {
		return nil, nil, err
	}

	upcoming, err := getNextDrawing(ctx, page)
	if err != nil {
		return nil, nil, err
	}

	return latest, upcoming, nil
}
// GetCurrent is a convenience wrapper that calls DefaultConfig.GetCurrent
// with the given context and browser.
func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
return DefaultConfig.GetCurrent(ctx, b)
}

View File

@@ -0,0 +1,60 @@
package main
import (
"context"
"fmt"
"os"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/powerball"
)
// PowerballFlags is the list of CLI flags specific to the powerball command.
type PowerballFlags []cli.Flag
// Flags is the powerball-specific flag set; it is currently empty.
var Flags = PowerballFlags{}
// ToConfig returns the powerball.Config for the parsed command. No flags are
// consulted; the result is a copy of powerball.DefaultConfig.
func (f PowerballFlags) ToConfig(_ *cli.Command) powerball.Config {
	return powerball.DefaultConfig
}
// main runs the "powerball" command: it builds a browser from the command
// line flags, fetches the current and next drawing, and prints both. Any error
// from running the command causes a panic.
func main() {
	var flags []cli.Flag
	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	action := func(ctx context.Context, cmd *cli.Command) error {
		b, err := browser.FromCommand(ctx, cmd)
		if err != nil {
			return err
		}

		draw, next, err := Flags.ToConfig(cmd).GetCurrent(ctx, b)
		if err != nil {
			return err
		}

		fmt.Printf("Drawing: %+v\n", draw)
		fmt.Printf("Next Drawing: %+v\n", next)
		return nil
	}

	app := &cli.Command{
		Name:   "powerball",
		Usage:  "Get Powerball information",
		Flags:  flags,
		Action: action,
	}

	if err := app.Run(context.Background(), os.Args); err != nil {
		panic(err)
	}
}

View File

@@ -0,0 +1,216 @@
package powerball
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"golang.org/x/text/currency"
)
// Config configures the Powerball scraper. It currently has no fields.
type Config struct {
}
// DefaultConfig is the Config used by the package-level GetCurrent helper.
var DefaultConfig = Config{}
// validate returns the config unchanged; Config has no fields to default.
func (c Config) validate() Config {
return c
}
// Drawing is the result of a Powerball drawing as scraped by getDrawing.
type Drawing struct {
// Date is the draw date.
// NOTE(review): getDrawing in this package does not assign Date, so it stays
// the zero time — confirm whether that is intended.
Date time.Time
// Numbers holds the five white balls in the order the page lists them.
Numbers [5]int
// PowerBall is the number read from the powerball element.
PowerBall int
// PowerPlay is the multiplier parsed from text such as "2X".
PowerPlay int
}
// NextDrawing describes the upcoming drawing as scraped by getNextDrawing.
type NextDrawing struct {
// Date is the next draw date exactly as the page displays it (unparsed text).
Date string
// Jackpot is the estimated jackpot as a USD amount.
Jackpot currency.Amount
}
// deferClose closes cl when it is non-nil and discards the Close error, so it
// can be used directly in a defer statement.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
// getDrawing extracts the most recent drawing from an opened Powerball page.
//
// The five white balls come from "div.game-ball-group div.white-balls", the
// powerball from "div.game-ball-group div.powerball", and the power play
// multiplier from "span.power-play span.multiplier", whose text looks like
// "2X". Drawing.Date is not populated by this function.
//
// An error is returned when an element count differs from what is expected or
// when a value is not numeric.
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
	var drawing Drawing

	nums := doc.Select("div.game-ball-group div.white-balls")
	if len(nums) != 5 {
		return nil, fmt.Errorf("expected 5 white balls, got %d", len(nums))
	}

	for i, num := range nums {
		txt, err := num.Text()
		if err != nil {
			return nil, fmt.Errorf("failed to get white ball %d: %w", i, err)
		}

		// Element text may carry surrounding whitespace.
		val, err := strconv.Atoi(strings.TrimSpace(txt))
		if err != nil {
			return nil, fmt.Errorf("failed to convert white ball %d to int: %w", i, err)
		}
		drawing.Numbers[i] = val
	}

	powerball := doc.Select("div.game-ball-group div.powerball")
	if len(powerball) != 1 {
		return nil, fmt.Errorf("expected 1 powerball, got %d", len(powerball))
	}

	txt, err := powerball[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get powerball: %w", err)
	}

	val, err := strconv.Atoi(strings.TrimSpace(txt))
	if err != nil {
		return nil, fmt.Errorf("failed to convert powerball to int: %w", err)
	}
	drawing.PowerBall = val

	powerplay := doc.Select("span.power-play span.multiplier")
	if len(powerplay) != 1 {
		return nil, fmt.Errorf("expected 1 powerplay, got %d", len(powerplay))
	}

	// powerplay is in the format of "2X" or "3X" etc.
	txt, err = powerplay[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get powerplay: %w", err)
	}

	multiplier := strings.TrimSpace(strings.NewReplacer("X", "", "x", "").Replace(txt))
	val, err = strconv.Atoi(multiplier)
	if err != nil {
		return nil, fmt.Errorf("failed to convert powerplay to int: %w", err)
	}
	drawing.PowerPlay = val

	return &drawing, nil
}
// getNextDrawing extracts the upcoming drawing from an opened Powerball page.
//
// The date is the text of "div.next-powerball h5.title-date", stored as-is.
// The jackpot is the text of
// "div.next-powerball div.game-detail-group span.game-jackpot-number",
// converted to a USD amount by parseJackpotAmount.
//
// An error is returned when either element is missing or duplicated, when its
// text cannot be read, or when the jackpot text holds no parsable number.
func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) {
	var nextDrawing NextDrawing

	date := doc.Select("div.next-powerball h5.title-date")
	if len(date) != 1 {
		return nil, fmt.Errorf("expected 1 date, got %d", len(date))
	}

	var err error
	nextDrawing.Date, err = date[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get date: %w", err)
	}

	jackpot := doc.Select("div.next-powerball div.game-detail-group span.game-jackpot-number")
	if len(jackpot) != 1 {
		return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot))
	}

	txt, err := jackpot[0].Text()
	if err != nil {
		return nil, fmt.Errorf("failed to get jackpot: %w", err)
	}

	dollars, err := parseJackpotAmount(txt)
	if err != nil {
		return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
	}
	nextDrawing.Jackpot = currency.USD.Amount(dollars)

	return &nextDrawing, nil
}

// parseJackpotAmount converts jackpot text such as "$1.5 Billion",
// "$100 million" or "$200,000" into a dollar amount. The "billion" and
// "million" suffixes are matched case-insensitively. Text without a parsable
// number is an error rather than a silent zero.
func parseJackpotAmount(txt string) (float64, error) {
	// Keep only digits and decimal points; this drops "$", "," and words.
	var digits strings.Builder
	for _, r := range txt {
		if (r >= '0' && r <= '9') || r == '.' {
			digits.WriteRune(r)
		}
	}

	// Drop a trailing period so text ending a sentence still parses.
	numeric, err := strconv.ParseFloat(strings.TrimRight(digits.String(), "."), 64)
	if err != nil {
		return 0, fmt.Errorf("no numeric amount in %q: %w", txt, err)
	}

	lower := strings.ToLower(txt)
	switch {
	case strings.Contains(lower, "billion"):
		return numeric * 1_000_000_000, nil
	case strings.Contains(lower, "million"):
		return numeric * 1_000_000, nil
	default:
		return numeric, nil
	}
}
// GetCurrent opens https://www.powerball.com/ in the given browser and returns
// the most recent drawing together with the next drawing. The page is closed
// before returning. Any error from opening the page or from either extraction
// step is returned unchanged, with both results nil.
func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
	c = c.validate()

	page, openErr := b.Open(ctx, "https://www.powerball.com/", extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, nil, openErr
	}
	defer deferClose(page)

	latest, err := getDrawing(ctx, page)
	if err != nil {
		return nil, nil, err
	}

	upcoming, err := getNextDrawing(ctx, page)
	if err != nil {
		return nil, nil, err
	}

	return latest, upcoming, nil
}
// GetCurrent is a convenience wrapper that calls DefaultConfig.GetCurrent
// with the given context and browser.
func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) {
return DefaultConfig.GetCurrent(ctx, b)
}