added archive, megamillions, and powerball site logic

This commit is contained in:
2024-12-23 03:18:50 -05:00
parent 5e924eb3f9
commit 567a9f9212
19 changed files with 1412 additions and 118 deletions

172
sites/archive/archive.go Normal file
View File

@@ -0,0 +1,172 @@
package archive
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/url"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
type Config struct {
// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
Endpoint string
// Timeout will, if set, cancel any Archive call after this duration.
// If nil, the default timeout of 1 hour will be used.
Timeout *time.Duration // Timeout for the request, defaults to 1 hour
}
// validate validates the config and sets default values if necessary.
func (c Config) validate() Config {
if c.Timeout == nil {
def := 1 * time.Hour
c.Timeout = &def
}
if c.Endpoint == "" {
c.Endpoint = "https://archive.ph"
}
return c
}
var DefaultConfig = Config{}
func deferClose(cl io.Closer) {
if cl != nil {
_ = cl.Close()
}
}
// IsArchived checks whether target has already been archived at the configured
// endpoint. It returns an open Document for the newest archived snapshot, or
// (nil, nil) when the endpoint reports the page is not archived
// (extractor.ErrPageNotFound). The caller is responsible for closing a
// non-nil Document.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
c = c.validate()
u, err := url.Parse(target)
if err != nil {
return nil, fmt.Errorf("invalid url: %w", err)
}
endpoint, err := url.Parse(c.Endpoint)
if err != nil {
return nil, fmt.Errorf("invalid endpoint: %w", err)
}
// Build <endpoint>/newest/<target-url>. The target is appended verbatim
// (not JoinPath-escaped) because archive services expect the raw url,
// scheme and all, as the remainder of the path.
uri := endpoint.JoinPath("/newest")
uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()
slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)
doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
if err != nil {
if doc != nil {
_ = doc.Close()
}
// Not-found is the "not archived" signal, not an error.
if errors.Is(err, extractor.ErrPageNotFound) {
return nil, nil
}
return nil, fmt.Errorf("failed to open url: %w", err)
}
return doc, nil
}
// IsArchived checks whether target is archived using DefaultConfig.
// See Config.IsArchived for the return semantics.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to the configured archive endpoint and waits for the
// service to finish archiving, returning the Document for the resulting page.
// The whole operation is bounded by c.Timeout (default 1 hour); on context
// cancellation during the wait the current Document is still returned. The
// caller is responsible for closing a non-nil Document.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	// validate guarantees a non-nil Timeout, so bound the operation
	// unconditionally.
	ctx, cancel := context.WithTimeout(ctx, *c.Timeout)
	defer cancel()
	slog.Info("setting timeout", "timeout", *c.Timeout)

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
	if err != nil {
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	// Fill the submission form with the target url and submit it.
	if err := doc.SelectFirst("input[name='url']").Type(u.String()); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url: %w", err)
	}
	if err := doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click(); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit: %w", err)
	}

	// Give the submission a moment to start processing.
	time.Sleep(5 * time.Second)

	// Bail out early if the context expired during the sleep, closing the
	// document so it is not leaked (the original code returned without closing).
	if err := ctx.Err(); err != nil {
		_ = doc.Close()
		return nil, err
	}

	// Now we wait for the service to archive the page and redirect us to the
	// archived copy. While work is in progress the document url stays on the
	// endpoint host under /wip/ or /submit, so poll the url on a fixed ticker
	// until it changes or the context is done. The ticker is created once and
	// stopped on return (previously a new ticker was allocated per iteration
	// and never stopped, leaking timers).
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			slog.Info("context done while waiting for archive")
			return doc, doc.WaitForNetworkIdle(nil)
		case <-ticker.C:
			archivedURL, err := url.Parse(doc.URL())
			if err != nil {
				// Transient/unparseable url; try again next tick.
				continue
			}
			slog.Debug("checking archive progress", "url", archivedURL.String())
			// Done once we have left the endpoint host or the in-progress paths.
			if archivedURL.Hostname() != endpoint.Hostname() ||
				(!strings.HasPrefix(archivedURL.Path, "/wip/") && !strings.HasPrefix(archivedURL.Path, "/submit")) {
				return doc, doc.WaitForNetworkIdle(nil)
			}
		}
	}
}
// Archive archives target using DefaultConfig.
// See Config.Archive for the behavior and return semantics.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
return DefaultConfig.Archive(ctx, b, target)
}

View File

@@ -0,0 +1,129 @@
package main
import (
"context"
"fmt"
"os"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
"github.com/urfave/cli/v3"
)
// ArchiveFlags is the set of cli flags that configure the archive client.
type ArchiveFlags []cli.Flag

// Flags holds the archive-specific command-line flags: the archive endpoint
// and the request timeout. They are translated into an archive.Config by
// ArchiveFlags.ToConfig.
var Flags = ArchiveFlags{
&cli.StringFlag{
Name: "endpoint",
Usage: "Archive endpoint to use",
DefaultText: "https://archive.ph",
},
&cli.StringFlag{
Name: "timeout",
Usage: "Timeout for requests",
DefaultText: "10s",
},
}
// ToConfig builds an archive.Config from the parsed command-line flags,
// starting from archive.DefaultConfig. Flags left empty keep the defaults.
// An unparseable --timeout value panics.
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
	cfg := archive.DefaultConfig

	if endpoint := cmd.String("endpoint"); endpoint != "" {
		cfg.Endpoint = endpoint
	}

	if raw := cmd.String("timeout"); raw != "" {
		dur, err := time.ParseDuration(raw)
		if err != nil {
			panic(err)
		}
		cfg.Timeout = &dur
	}

	return cfg
}
// main runs the archive CLI: it checks whether the url given as the first
// argument is already archived, archives it if not, and prints a Readability
// extraction of the archived page.
func main() {
	// Register BOTH flag sets on the command. Previously only Flags was
	// passed (Flags: Flags), so the browser flags read by
	// browser.FromCommand were never registered.
	var flags []cli.Flag
	flags = append(flags, browser.Flags...)
	flags = append(flags, Flags...)

	// Named cmd rather than cli to avoid shadowing the cli package.
	cmd := &cli.Command{
		Name:  "archive",
		Usage: "Archive a website",
		Flags: flags,
		Action: func(ctx context.Context, cmd *cli.Command) error {
			target := cmd.Args().First()
			if target == "" {
				return fmt.Errorf("usage: archive <url>")
			}

			b, err := browser.FromCommand(ctx, cmd)
			if err != nil {
				return err
			}

			// Honor the --endpoint/--timeout flags. Previously the
			// package-level archive.IsArchived/archive.Archive were called,
			// which always used DefaultConfig and ignored the flags.
			cfg := Flags.ToConfig(ctx, cmd)

			doc, err := cfg.IsArchived(ctx, b, target)
			if err != nil {
				return err
			}
			if doc == nil {
				fmt.Println("Not archived")
				doc, err = cfg.Archive(ctx, b, target)
				if err != nil {
					return err
				}
				if doc == nil {
					return fmt.Errorf("failed to archive")
				}
			}
			defer func(doc extractor.Document) {
				fmt.Println("Closing document", doc.URL())
				if err := doc.Close(); err != nil {
					fmt.Println("failed to close document", err)
				}
			}(doc)

			fmt.Println("Archived at ", doc.URL())

			article, err := extractor.Readability(ctx, doc)
			if err != nil {
				return err
			}
			fmt.Println("Title:", article.Title)
			fmt.Println("Byline:", article.Byline)
			fmt.Println("Site:", article.SiteName)
			fmt.Println("Published:", article.PublishedTime)
			fmt.Println("Excerpt:", article.Excerpt)
			fmt.Println("Length:", article.Length)
			fmt.Println("Lang:", article.Lang)
			// Guard the preview slice: Content may be shorter than 32 bytes,
			// and article.Content[:32] would panic on short content.
			preview := article.Content
			if len(preview) > 32 {
				preview = preview[:32] + "..."
			}
			fmt.Println("Content:", preview)
			fmt.Println("TextContent:", article.TextContent)
			return nil
		},
	}

	if err := cmd.Run(context.Background(), os.Args); err != nil {
		panic(err)
	}
}