added archive, megamillions, and powerball site logic
sites/archive/archive.go (new file, 172 lines)
@@ -0,0 +1,172 @@
package archive

import (
    "context"
    "errors"
    "fmt"
    "io"
    "log/slog"
    "net/url"
    "strings"
    "time"

    "gitea.stevedudenhoeffer.com/steve/go-extractor"
)
type Config struct {
    // Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
    Endpoint string

    // Timeout will, if set, cancel any Archive call after this duration.
    // If nil, the default timeout of 1 hour will be used.
    Timeout *time.Duration // Timeout for the request, defaults to 1 hour
}

// validate validates the config and sets default values if necessary.
func (c Config) validate() Config {
    if c.Timeout == nil {
        def := 1 * time.Hour
        c.Timeout = &def
    }

    if c.Endpoint == "" {
        c.Endpoint = "https://archive.ph"
    }

    return c
}

var DefaultConfig = Config{}
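Both fields are optional: the zero value used by DefaultConfig resolves, via validate(), to the archive.ph endpoint and a one-hour timeout. A minimal caller-side sketch of overriding both, where the mirror URL and the ten-minute duration are illustrative values only, not defaults taken from this commit:

    // hypothetical values: a compatible mirror and a shorter deadline
    timeout := 10 * time.Minute
    cfg := archive.Config{
        Endpoint: "https://archive.today",
        Timeout:  &timeout,
    }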
func deferClose(cl io.Closer) {
    if cl != nil {
        _ = cl.Close()
    }
}
// IsArchived checks whether a url is archived. It returns the archived document if it is, or nil if it is not.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
    c = c.validate()
    u, err := url.Parse(target)
    if err != nil {
        return nil, fmt.Errorf("invalid url: %w", err)
    }

    endpoint, err := url.Parse(c.Endpoint)
    if err != nil {
        return nil, fmt.Errorf("invalid endpoint: %w", err)
    }

    uri := endpoint.JoinPath("/newest")
    uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()

    slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)

    doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
    if err != nil {
        if doc != nil {
            _ = doc.Close()
        }
        if errors.Is(err, extractor.ErrPageNotFound) {
            return nil, nil
        }
        return nil, fmt.Errorf("failed to open url: %w", err)
    }

    return doc, nil
}

// IsArchived checks whether target is archived, using DefaultConfig.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
    return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits the target url to the configured archive endpoint and waits for the archived copy to become available.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
    c = c.validate()

    var cancel context.CancelFunc
    if c.Timeout != nil {
        ctx, cancel = context.WithTimeout(ctx, *c.Timeout)
        slog.Info("setting timeout", "timeout", *c.Timeout)
        defer cancel()
    }

    u, err := url.Parse(target)
    if err != nil {
        return nil, fmt.Errorf("invalid url: %w", err)
    }

    endpoint, err := url.Parse(c.Endpoint)
    if err != nil {
        return nil, fmt.Errorf("invalid endpoint: %w", err)
    }

    doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
    if err != nil {
        if doc != nil {
            _ = doc.Close()
        }
        return nil, fmt.Errorf("failed to open url: %w", err)
    }

    err = doc.SelectFirst("input[name='url']").Type(u.String())
    if err != nil {
        _ = doc.Close()
        return nil, fmt.Errorf("failed to type url: %w", err)
    }

    err = doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click()
    if err != nil {
        _ = doc.Close()
        return nil, fmt.Errorf("failed to click submit: %w", err)
    }

    // wait for the page to load
    time.Sleep(5 * time.Second)

    select {
    case <-ctx.Done():
        fmt.Println("context already done before entering the loop:", ctx.Err())
        return nil, ctx.Err()
    default:
        fmt.Println("context not done yet")
        // proceed with the loop
    }

    // now we are waiting for archive.ph to archive the page and redirect us to the archived page
    // the way we can tell this is happening is by checking the url of the page periodically:
    // if the page path still starts with /wip/ then we are still waiting
    // also periodically refresh the page just in case
    ticker := time.NewTicker(5 * time.Second)
    defer ticker.Stop()

    keepGoing := true
    for keepGoing {
        select {
        case <-ctx.Done():
            slog.Info("context done")
            keepGoing = false

        case <-ticker.C:
            archivedUrl, err := url.Parse(doc.URL())
            if err != nil {
                continue
            }

            fmt.Println("checking url:", archivedUrl.String())
            // if the url is no longer on the endpoint host, or the path no longer starts with /wip/ or /submit, then we are done
            if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
                keepGoing = false
            }
        }
    }

    return doc, doc.WaitForNetworkIdle(nil)
}
// Archive archives target using DefaultConfig.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
    return DefaultConfig.Archive(ctx, b, target)
}
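The intended flow is to probe with IsArchived first and only submit via Archive when nothing is found, which is what the command below does with DefaultConfig and its CLI flags. A rough sketch using the cfg from the sketch above; b is assumed to be an already-constructed extractor.Browser, the target URL is a placeholder, and error handling is abbreviated:

    doc, err := cfg.IsArchived(ctx, b, "https://example.com")
    if err == nil && doc == nil {
        // not archived yet: submit it and wait until the page leaves /wip/ or /submit
        doc, err = cfg.Archive(ctx, b, "https://example.com")
    }
    if err != nil {
        return err
    }
    defer doc.Close()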
sites/archive/cmd/archive/main.go (new file, 129 lines)
@@ -0,0 +1,129 @@
package main

import (
    "context"
    "fmt"
    "os"
    "time"

    "gitea.stevedudenhoeffer.com/steve/go-extractor"

    "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
    "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"

    "github.com/urfave/cli/v3"
)
type ArchiveFlags []cli.Flag

var Flags = ArchiveFlags{
    &cli.StringFlag{
        Name:        "endpoint",
        Usage:       "Archive endpoint to use",
        DefaultText: "https://archive.ph",
    },
    &cli.StringFlag{
        Name:        "timeout",
        Usage:       "Timeout for requests",
        DefaultText: "1h",
    },
}
// ToConfig builds an archive.Config from the parsed command-line flags, starting from DefaultConfig.
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
    c := archive.DefaultConfig

    if e := cmd.String("endpoint"); e != "" {
        c.Endpoint = e
    }

    if t := cmd.String("timeout"); t != "" {
        d, err := time.ParseDuration(t)
        if err != nil {
            panic(err)
        }
        c.Timeout = &d
    }

    return c
}
func main() {
    var flags []cli.Flag

    flags = append(flags, browser.Flags...)
    flags = append(flags, Flags...)

    app := &cli.Command{
        Name:  "archive",
        Usage: "Archive a website",
        Flags: flags,
        Action: func(ctx context.Context, cmd *cli.Command) error {
            target := cmd.Args().First()
            if target == "" {
                return fmt.Errorf("usage: archive <url>")
            }

            b, err := browser.FromCommand(ctx, cmd)
            if err != nil {
                return err
            }

            cfg := Flags.ToConfig(ctx, cmd)

            doc, err := cfg.IsArchived(ctx, b, target)
            if err != nil {
                return err
            }

            if doc == nil {
                fmt.Println("Not archived")

                doc, err = cfg.Archive(ctx, b, target)
                if err != nil {
                    return err
                }

                if doc == nil {
                    return fmt.Errorf("failed to archive")
                }
            }

            defer func(doc extractor.Document) {
                fmt.Println("Closing document", doc.URL())
                err := doc.Close()
                if err != nil {
                    fmt.Println("failed to close document", err)
                }
            }(doc)

            fmt.Println("Archived at", doc.URL())

            article, err := extractor.Readability(ctx, doc)
            if err != nil {
                return err
            }

            fmt.Println("Title:", article.Title)
            fmt.Println("Byline:", article.Byline)
            fmt.Println("Site:", article.SiteName)
            fmt.Println("Published:", article.PublishedTime)
            fmt.Println("Excerpt:", article.Excerpt)
            fmt.Println("Length:", article.Length)
            fmt.Println("Lang:", article.Lang)
            fmt.Println("Content:", article.Content[:min(32, len(article.Content))]+"...")
            fmt.Println("TextContent:", article.TextContent)
            return nil
        },
    }

    err := app.Run(context.Background(), os.Args)
    if err != nil {
        panic(err)
    }
}