Files
go-extractor/sites/archive/archive.go
T
steve 3b38637e56
CI / test (push) Successful in 2m6s
CI / vet (push) Successful in 1m21s
CI / build (push) Successful in 2m13s
feat(archive): keep page open on captcha-status errors so callers can promote
Adds OpenPageOptions.AllowNonOKStatus. When set, openPage no longer closes
the page on non-2xx (other than 404) and Open returns both a usable Document
and ErrInvalidStatusCode. archive.IsArchived and Archive opt in, so callers
can PromoteToInteractive the captcha page, hand it to a human solver, and
demote back to extract content from the same browser instance — avoiding
the cf_clearance fingerprint-binding issue that re-challenges any fresh
retry browser.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-28 00:29:39 +00:00

192 lines
5.2 KiB
Go

package archive
import (
"context"
"errors"
"fmt"
"log/slog"
"net/url"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
type Config struct {
// Endpoint is the archive endpoint to use. If empty, archive.ph will be used.
Endpoint string
// Timeout will, if set, cancel any Archive call after this duration.
// If nil, the default timeout of 1 hour will be used.
Timeout *time.Duration // Timeout for the request, defaults to 1 hour
}
// validate validates the config and sets default values if necessary.
func (c Config) validate() Config {
if c.Timeout == nil {
def := 1 * time.Hour
c.Timeout = &def
}
if c.Endpoint == "" {
c.Endpoint = "https://archive.ph"
}
return c
}
// DefaultConfig is the zero-value Config used by the package-level
// IsArchived and Archive helpers; validate() fills in the archive.ph
// endpoint and 1 hour timeout at call time.
var DefaultConfig = Config{}
// IsArchived checks whether target has an archived copy at the configured
// endpoint. It returns the archived document if one exists, or a nil
// document (and nil error) if the target is not archived.
//
// On ErrInvalidStatusCode (e.g. archive.ph answering 403 with a Cloudflare
// captcha) the open page is returned together with the wrapped error so the
// caller can promote it to an InteractiveBrowser and have a human solve the
// challenge on the same browser instance.
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	parsed, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	// Build "<endpoint>/newest/<target>". The target is appended to the raw
	// path rather than via JoinPath so its own scheme and slashes survive.
	lookup := endpoint.JoinPath("/newest")
	lookup.Path = strings.TrimSuffix(lookup.Path, "/") + "/" + parsed.String()

	slog.Info("checking if url is archived", "url", lookup.String(), "config", c, "endpoint", endpoint)

	doc, err := b.Open(ctx, lookup.String(), extractor.OpenPageOptions{AllowNonOKStatus: true})
	if err == nil {
		return doc, nil
	}

	// A 404 simply means the target is not archived — not an error.
	if errors.Is(err, extractor.ErrPageNotFound) {
		if doc != nil {
			_ = doc.Close()
		}
		return nil, nil
	}

	// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
	// captcha) the page is kept open by AllowNonOKStatus so the caller
	// can promote it to an InteractiveBrowser and let a human solve
	// the challenge. Return both the doc and the wrapped error.
	if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
		return doc, fmt.Errorf("failed to open url: %w", err)
	}

	if doc != nil {
		_ = doc.Close()
	}
	return nil, fmt.Errorf("failed to open url: %w", err)
}
// IsArchived checks whether target is archived using DefaultConfig.
// See Config.IsArchived for the meaning of the returned document/error pair.
func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.IsArchived(ctx, b, target)
}
// Archive submits target to the configured archive endpoint and waits for
// the service to finish archiving it, returning the resulting document.
// The whole call is bounded by c.Timeout (defaulting to 1 hour).
//
// The endpoint page is opened with AllowNonOKStatus, so on
// ErrInvalidStatusCode (e.g. archive.ph answering 403 with a Cloudflare
// captcha) both the still-open document and the wrapped error are returned,
// letting the caller promote the page to an InteractiveBrowser, have a human
// solve the challenge, and continue on the same browser instance.
//
// On success the caller owns the returned document and must Close it.
func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	c = c.validate()

	// validate() guarantees Timeout is non-nil.
	ctx, cancel := context.WithTimeout(ctx, *c.Timeout)
	slog.Info("setting timeout", "timeout", *c.Timeout)
	defer cancel()

	u, err := url.Parse(target)
	if err != nil {
		return nil, fmt.Errorf("invalid url: %w", err)
	}
	endpoint, err := url.Parse(c.Endpoint)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoint: %w", err)
	}

	doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{AllowNonOKStatus: true})
	if err != nil {
		// On ErrInvalidStatusCode (e.g. archive.ph 403 with Cloudflare
		// captcha) the page is kept open by AllowNonOKStatus so the caller
		// can promote it. Return both the doc and the wrapped error.
		if errors.Is(err, extractor.ErrInvalidStatusCode) && doc != nil {
			return doc, fmt.Errorf("failed to open url: %w", err)
		}
		if doc != nil {
			_ = doc.Close()
		}
		return nil, fmt.Errorf("failed to open url: %w", err)
	}

	// Fill in the submission form with the target url and submit it.
	urlInput := doc.SelectFirst("input[name='url']")
	if urlInput == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to find url input element")
	}
	if err := urlInput.Type(u.String()); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to type url: %w", err)
	}
	submitBtn := doc.SelectFirst("form#submiturl input[type=\"submit\"]")
	if submitBtn == nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to find submit button")
	}
	if err := submitBtn.Click(); err != nil {
		_ = doc.Close()
		return nil, fmt.Errorf("failed to click submit: %w", err)
	}

	// wait for the page to load, but respect context cancellation
	select {
	case <-ctx.Done():
		slog.Debug("context done during initial wait", "err", ctx.Err())
		_ = doc.Close()
		return nil, ctx.Err()
	case <-time.After(5 * time.Second):
	}

	// Now we are waiting for archive.ph to archive the page and redirect us
	// to the archived copy. We detect this by polling the page url: while the
	// path starts with /wip/ or /submit (on the endpoint host) the archive is
	// still in progress.
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			// Fix: previously a timeout here fell through the loop and
			// returned the still-unarchived document as if it succeeded.
			// Handle cancellation the same way as the initial wait above.
			slog.Info("context done while waiting for archive", "err", ctx.Err())
			_ = doc.Close()
			return nil, ctx.Err()
		case <-ticker.C:
			archivedURL, err := url.Parse(doc.URL())
			if err != nil {
				continue
			}
			slog.Debug("checking url", "url", archivedURL.String())
			// Done once we have been redirected off the endpoint host, or off
			// the /wip/ and /submit paths.
			if archivedURL.Hostname() != endpoint.Hostname() ||
				(!strings.HasPrefix(archivedURL.Path, "/wip/") && !strings.HasPrefix(archivedURL.Path, "/submit")) {
				return doc, doc.WaitForNetworkIdle(nil)
			}
		}
	}
}
// Archive archives target using DefaultConfig.
// See Config.Archive for the submission/polling behavior and the meaning of
// the returned document/error pair.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	return DefaultConfig.Archive(ctx, b, target)
}