package archive

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/url"
	"strings"
	"time"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)

type Config struct {
	// Endpoint is the archive endpoint to use. If empty, https://archive.ph
	// will be used.
	Endpoint string

	// Timeout will, if set, cancel any Archive call after this duration.
	// If nil, a default timeout of 1 hour is used.
	Timeout *time.Duration
}

// validate returns a copy of the config with default values applied: a 1-hour
// timeout and the https://archive.ph endpoint.
func (c Config) validate() Config {
	if c.Timeout == nil {
		def := 1 * time.Hour
		c.Timeout = &def
	}

	if c.Endpoint == "" {
		c.Endpoint = "https://archive.ph"
	}

	return c
}
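
// A minimal configuration sketch (illustrative only; the browser value is
// assumed to come from the go-extractor package):
//
//	timeout := 10 * time.Minute
//	cfg := Config{
//		Endpoint: "https://archive.ph",
//		Timeout:  &timeout,
//	}
//	doc, err := cfg.IsArchived(ctx, browser, "https://example.com")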

// DefaultConfig is the zero-value Config; validate fills in the archive.ph
// endpoint and 1-hour timeout when it is used.
var DefaultConfig = Config{}

// deferClose closes cl if it is non-nil, discarding any error. It is intended
// for use in defer statements.
func deferClose(cl io.Closer) {
	if cl != nil {
		_ = cl.Close()
	}
}

// IsArchived checks whether target has already been archived. It returns the
// archived document if a snapshot exists, or a nil document and nil error if
// it does not.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}

	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	// Build e.g. https://archive.ph/newest/https://example.com. The target is
	// appended to the path verbatim rather than via JoinPath, which would
	// escape the nested url.
	uri := endpoint.JoinPath("/newest")
	uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String()

	slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint)

	doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{})
	if err != nil {
		if doc != nil {
			_ = doc.Close()
		}
		if errors.Is(err, extractor.ErrPageNotFound) {
			// No snapshot exists for this url.
			return nil, nil
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	return doc, nil
}
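
// Illustrative call pattern (names are hypothetical): a nil document together
// with a nil error means no snapshot of the target exists yet.
//
//	doc, err := cfg.IsArchived(ctx, browser, target)
//	if err != nil {
//		// the lookup itself failed
//	} else if doc == nil {
//		// target has not been archived
//	} else {
//		defer doc.Close()
//		// use the archived document
//	}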

// IsArchived checks whether target is archived, using DefaultConfig.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}

// Archive submits target to the configured archive endpoint and waits until
// the snapshot is ready (or ctx expires), then returns the archived document.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	// validate guarantees a non-nil Timeout.
	ctx, cancel := context.WithTimeout(ctx, *c.Timeout)
	defer cancel()
	slog.Info("setting timeout", "timeout", *c.Timeout)

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}

	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{})
	if err != nil {
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	// Fill the submission form with the target url and submit it.
	err = doc.SelectFirst("input[name='url']").Type(u.String())
	if err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url: %w", err)
	}

	err = doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click()
	if err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit: %w", err)
	}

	// Give the submission a moment to start loading, bailing out early if the
	// context is already done.
	select {
	case <-ctx.Done():
		_ = doc.Close()
		return nil, ctx.Err()
	case <-time.After(5 * time.Second):
	}

	// Now wait for the endpoint to archive the page and redirect us to the
	// archived copy. We can tell by polling the page url: while the path
	// starts with /wip/ or /submit, the archive is still in progress.
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

wait:
	for {
		select {
		case <-ctx.Done():
			slog.Info("context done while waiting for archive", "err", ctx.Err())
			break wait

		case <-ticker.C:
			archivedUrl, err := url.Parse(doc.URL())
			if err != nil {
				continue
			}

			slog.Debug("checking archive progress", "url", archivedUrl.String())

			// Once the page has left the endpoint host, or its path no longer
			// starts with /wip/ or /submit, the archive is ready.
			if archivedUrl.Hostname() != endpoint.Hostname() ||
				(!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
				break wait
			}
		}
	}

	// Let any outstanding requests settle before handing the document back.
	return doc, doc.WaitForNetworkIdle(nil)
}

// Archive archives target, using DefaultConfig.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}
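
// Putting the pieces together: a caller might look for an existing snapshot
// before requesting a new one. A sketch, assuming ctx and b are supplied by
// the caller (archiveOnce is a hypothetical helper, not part of this package):
//
//	func archiveOnce(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
//		doc, err := IsArchived(ctx, b, target)
//		if err != nil {
//			return nil, err
//		}
//		if doc != nil {
//			// a snapshot already exists; reuse it
//			return doc, nil
//		}
//		return Archive(ctx, b, target)
//	}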