130 lines
2.4 KiB
Go
130 lines
2.4 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive"
|
|
|
|
"github.com/urfave/cli/v3"
|
|
)
|
|
|
|
// ArchiveFlags is a list of cli flags. It carries the ToConfig method, which
// turns the parsed flag values of a command into an archive.Config.
type ArchiveFlags []cli.Flag
|
|
|
|
var Flags = ArchiveFlags{
|
|
&cli.StringFlag{
|
|
Name: "endpoint",
|
|
Usage: "Archive endpoint to use",
|
|
DefaultText: "https://archive.ph",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "timeout",
|
|
Usage: "Timeout for requests",
|
|
DefaultText: "10s",
|
|
},
|
|
}
|
|
|
|
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
|
|
c := archive.DefaultConfig
|
|
|
|
if e := cmd.String("endpoint"); e != "" {
|
|
c.Endpoint = e
|
|
}
|
|
|
|
if t := cmd.String("timeout"); t != "" {
|
|
d, err := time.ParseDuration(t)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
c.Timeout = &d
|
|
}
|
|
|
|
return c
|
|
}
|
|
|
|
func main() {
|
|
|
|
var flags []cli.Flag
|
|
|
|
flags = append(flags, browser.Flags...)
|
|
flags = append(flags, Flags...)
|
|
|
|
cli := &cli.Command{
|
|
Name: "archive",
|
|
Usage: "Archive a website",
|
|
Flags: Flags,
|
|
Action: func(ctx context.Context, cli *cli.Command) error {
|
|
|
|
target := cli.Args().First()
|
|
|
|
if target == "" {
|
|
return fmt.Errorf("usage: archive <url>")
|
|
}
|
|
|
|
b, err := browser.FromCommand(ctx, cli)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
doc, err := archive.IsArchived(ctx, b, target)
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if doc == nil {
|
|
fmt.Println("Not archived")
|
|
|
|
doc, err = archive.Archive(ctx, b, target)
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if doc == nil {
|
|
return fmt.Errorf("failed to archive")
|
|
}
|
|
}
|
|
|
|
defer func(doc extractor.Document) {
|
|
fmt.Println("Closing document", doc.URL())
|
|
err := doc.Close()
|
|
if err != nil {
|
|
fmt.Println("failed to close document", err)
|
|
}
|
|
}(doc)
|
|
|
|
fmt.Println("Archived at ", doc.URL())
|
|
|
|
article, err := extractor.Readability(ctx, doc)
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fmt.Println("Title:", article.Title)
|
|
fmt.Println("Byline:", article.Byline)
|
|
fmt.Println("Site:", article.SiteName)
|
|
fmt.Println("Published:", article.PublishedTime)
|
|
fmt.Println("Excerpt:", article.Excerpt)
|
|
fmt.Println("Length:", article.Length)
|
|
fmt.Println("Lang:", article.Lang)
|
|
fmt.Println("Content:", article.Content[:32]+"...")
|
|
fmt.Println("TextContent:", article.TextContent)
|
|
return nil
|
|
},
|
|
}
|
|
|
|
err := cli.Run(context.Background(), os.Args)
|
|
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
}
|