added archive, megamillions, and powerball site logic

This commit is contained in:
2024-12-23 03:18:50 -05:00
parent 5e924eb3f9
commit 567a9f9212
19 changed files with 1412 additions and 118 deletions

79
cmd/browser/main.go Normal file
View File

@@ -0,0 +1,79 @@
package main
import (
"context"
"fmt"
"io"
"os"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
)
// deferClose closes closer and discards whatever error Close reports.
//
// It exists so cleanup can be written as `defer deferClose(x)` without an
// unchecked-error warning at every call site.
func deferClose(closer io.Closer) {
	// Best-effort cleanup: there is nothing useful to do with a Close error here.
	_ = closer.Close()
}
// main runs the "browser" command: it opens the URL given as the first
// positional argument in a browser configured from browser.Flags, runs
// extractor.Readability on the loaded page and prints the extracted article
// fields to stdout.
//
// Any error is written to stderr and the process exits with status 1.
func main() {
	cmd := &cli.Command{
		Name:  "browser",
		Flags: browser.Flags,
		Usage: "<url>",
		// The command parameter is named c rather than cli so that it does not
		// shadow the imported cli package inside the closure.
		Action: func(ctx context.Context, c *cli.Command) error {
			target := c.Args().First()
			if target == "" {
				return fmt.Errorf("no url specified")
			}

			b, err := browser.FromCommand(ctx, c)
			if err != nil {
				return fmt.Errorf("creating browser: %w", err)
			}
			defer deferClose(b)

			// Now open the user specified url.
			doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
			if err != nil {
				return fmt.Errorf("opening %q: %w", target, err)
			}
			defer deferClose(doc)

			article, err := extractor.Readability(ctx, doc)
			if err != nil {
				return fmt.Errorf("extracting article from %q: %w", target, err)
			}

			fmt.Println("Title:", article.Title)
			fmt.Println("Byline:", article.Byline)
			fmt.Println("Site:", article.SiteName)
			fmt.Println("Published:", article.PublishedTime)
			fmt.Println("Excerpt:", article.Excerpt)
			fmt.Println("Length:", article.Length)
			fmt.Println("Lang:", article.Lang)
			// Only a short preview of the content is printed.
			fmt.Println("Content:", truncateRunes(article.Content, 32))
			fmt.Println("TextContent:", article.TextContent)

			return nil
		},
	}

	if err := cmd.Run(context.Background(), os.Args); err != nil {
		// Report the failure as a plain message; a panic would bury ordinary
		// user errors (such as a missing url) under a goroutine stack trace.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// truncateRunes returns s unchanged when it holds at most limit characters,
// and otherwise its first limit characters followed by "...".
//
// The cut is made on a character boundary, so a multi-byte UTF-8 sequence is
// never split in half the way a plain byte slice s[:limit] could split it.
func truncateRunes(s string, limit int) string {
	seen := 0
	// Ranging over a string yields the byte offset at which each character starts.
	for offset := range s {
		if seen == limit {
			return s[:offset] + "..."
		}
		seen++
	}
	return s
}

View File

@@ -0,0 +1,76 @@
package browser
import (
"context"
"time"
"github.com/urfave/cli/v3"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// BrowserFlags is a list of command-line flags describing how a browser
// should be configured.
type BrowserFlags []cli.Flag

// Flags are the browser options read by FromCommand. Attach them to a
// cli.Command so users can pick the user agent, request timeout, browser
// engine, cookies file and window visibility.
//
// Defaults are supplied through Value so that they are the values actually
// returned by the command when a flag is omitted. DefaultText only changes
// what the help output displays and leaves the flag value empty.
var Flags = BrowserFlags{
	&cli.StringFlag{
		Name:    "user-agent",
		Aliases: []string{"ua"},
		Usage:   "User-Agent to use for requests",
		Value:   "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
	},
	&cli.StringFlag{
		Name:    "timeout",
		Aliases: []string{"t"},
		Usage:   "Timeout for requests",
		// Parsed by FromCommand with time.ParseDuration.
		Value: "30s",
	},
	&cli.StringFlag{
		Name:    "browser",
		Aliases: []string{"b"},
		Usage:   "Browser to use, one of: chromium, firefox, webkit",
		Value:   "firefox",
	},
	&cli.StringFlag{
		Name:    "cookies-file",
		Aliases: []string{"c"},
		Usage:   "cookies.txt file to load cookies from",
	},
	&cli.BoolFlag{
		Name:        "visible",
		Usage:       "If set, the browser will be visible, if not set, the browser will be headless",
		DefaultText: "false",
	},
}
// FromCommand builds an extractor.Browser from the flag values held by cmd.
//
// The "user-agent", "timeout", "browser" and "cookies-file" flags are only
// copied into the browser options when they are non-empty; "visible" is always
// copied. The context argument is currently unused.
//
// An error from time.ParseDuration (bad "timeout" value) or from
// extractor.LoadCookiesFile is returned unchanged, as is the result of
// extractor.NewPlayWrightBrowser.
func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error) {
	options := extractor.PlayWrightBrowserOptions{}

	userAgent := cmd.String("user-agent")
	if userAgent != "" {
		options.UserAgent = userAgent
	}

	rawTimeout := cmd.String("timeout")
	if rawTimeout != "" {
		timeout, err := time.ParseDuration(rawTimeout)
		if err != nil {
			return nil, err
		}
		options.Timeout = &timeout
	}

	engine := cmd.String("browser")
	if engine != "" {
		options.Browser = extractor.PlayWrightBrowserSelection(engine)
	}

	cookiesPath := cmd.String("cookies-file")
	if cookiesPath != "" {
		jar, err := extractor.LoadCookiesFile(cookiesPath)
		if err != nil {
			return nil, err
		}
		options.CookieJar = jar
	}

	options.ShowBrowser = cmd.Bool("visible")

	return extractor.NewPlayWrightBrowser(options)
}