package archive import ( "context" "errors" "fmt" "io" "log/slog" "net/url" "strings" "time" "gitea.stevedudenhoeffer.com/steve/go-extractor" ) type Config struct { // Endpoint is the archive endpoint to use. If empty, archive.ph will be used. Endpoint string // Timeout will, if set, cancel any Archive call after this duration. // If nil, the default timeout of 1 hour will be used. Timeout *time.Duration // Timeout for the request, defaults to 1 hour } // validate validates the config and sets default values if necessary. func (c Config) validate() Config { if c.Timeout == nil { def := 1 * time.Hour c.Timeout = &def } if c.Endpoint == "" { c.Endpoint = "https://archive.ph" } return c } var DefaultConfig = Config{} func deferClose(cl io.Closer) { if cl != nil { _ = cl.Close() } } // IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not. func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { c = c.validate() u, err := url.Parse(target) if err != nil { return nil, fmt.Errorf("invalid url: %w", err) } endpoint, err := url.Parse(c.Endpoint) if err != nil { return nil, fmt.Errorf("invalid endpoint: %w", err) } uri := endpoint.JoinPath("/newest") uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String() slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint) doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{}) if err != nil { if doc != nil { _ = doc.Close() } if errors.Is(err, extractor.ErrPageNotFound) { return nil, nil } return nil, fmt.Errorf("failed to open url: %w", err) } return doc, nil } func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { return DefaultConfig.IsArchived(ctx, b, target) } func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { c = c.validate() var cancel 
context.CancelFunc if c.Timeout != nil { ctx, cancel = context.WithTimeout(ctx, *c.Timeout) slog.Info("setting timeout", "timeout", *c.Timeout) defer cancel() } u, err := url.Parse(target) if err != nil { return nil, fmt.Errorf("invalid url: %w", err) } endpoint, err := url.Parse(c.Endpoint) if err != nil { return nil, fmt.Errorf("invalid endpoint: %w", err) } doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{}) if err != nil { if doc != nil { _ = doc.Close() } return nil, fmt.Errorf("failed to open url: %w", err) } err = doc.SelectFirst("input[name='url']").Type(u.String()) if err != nil { _ = doc.Close() return nil, fmt.Errorf("failed to type url: %w", err) } err = doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click() if err != nil { _ = doc.Close() return nil, fmt.Errorf("failed to click submit: %w", err) } // wait for the page to load time.Sleep(5 * time.Second) select { case <-ctx.Done(): fmt.Println("context already done before entering the loop:", ctx.Err()) return nil, ctx.Err() default: fmt.Println("context not done yet") // Proceed with the loop } // now we are waiting for archive.ph to archive the page and redirect us to the archived page // the way we can tell this is happening is by checking the url of the page periodically // if the page path starts with /wip/ then we are still waiting // also periodically refresh the page just in case keepGoing := true for keepGoing { select { case <-ctx.Done(): slog.Info("context done") keepGoing = false case <-time.NewTicker(5 * time.Second).C: archivedUrl, err := url.Parse(doc.URL()) if err != nil { continue } fmt.Println("checking url:", archivedUrl.String()) // if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) { keepGoing = false break } } } return doc, doc.WaitForNetworkIdle(nil) } 
// Archive submits target for archiving using DefaultConfig and returns the
// resulting document. It is shorthand for DefaultConfig.Archive.
func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
	cfg := DefaultConfig
	return cfg.Archive(ctx, b, target)
}