diff --git a/browser.go b/browser.go index 8e27b77..06742ad 100644 --- a/browser.go +++ b/browser.go @@ -5,8 +5,12 @@ import ( "io" ) +type OpenPageOptions struct { + Referer string +} + type Browser interface { io.Closer - Open(ctx context.Context, url string) (Document, error) + Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) } diff --git a/cmd/browser/main.go b/cmd/browser/main.go new file mode 100644 index 0000000..3eb6ebc --- /dev/null +++ b/cmd/browser/main.go @@ -0,0 +1,79 @@ +package main + +import ( + "context" + "fmt" + "io" + "os" + + "github.com/urfave/cli/v3" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" + "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" +) + +func deferClose(cl io.Closer) { + _ = cl.Close() +} +func main() { + cmd := &cli.Command{ + Name: "browser", + Flags: browser.Flags, + Usage: "", + Action: func(ctx context.Context, cli *cli.Command) error { + target := cli.Args().First() + if target == "" { + return fmt.Errorf("no url specified") + } + + b, err := browser.FromCommand(ctx, cli) + if err != nil { + return err + } + + defer deferClose(b) + + // now open the user specified url + doc, err := b.Open(ctx, target, extractor.OpenPageOptions{}) + if err != nil { + return err + } + + defer deferClose(doc) + + article, err := extractor.Readability(ctx, doc) + + if err != nil { + return err + } + + content := "" + + if article.Content != "" { + + if len(article.Content) > 32 { + content = article.Content[:32] + "..." + } else { + content = article.Content + } + } + + fmt.Println("Title:", article.Title) + fmt.Println("Byline:", article.Byline) + fmt.Println("Site:", article.SiteName) + fmt.Println("Published:", article.PublishedTime) + fmt.Println("Excerpt:", article.Excerpt) + fmt.Println("Length:", article.Length) + fmt.Println("Lang:", article.Lang) + fmt.Println("Content:", content) + fmt.Println("TextContent:", article.TextContent) + return nil + }, + } + + err := cmd.Run(context.Background(), os.Args) + + if err != nil { + panic(err) + } +} diff --git a/cmd/browser/pkg/browser/flags.go b/cmd/browser/pkg/browser/flags.go new file mode 100644 index 0000000..a4672e0 --- /dev/null +++ b/cmd/browser/pkg/browser/flags.go @@ -0,0 +1,76 @@ +package browser + +import ( + "context" + "time" + + "github.com/urfave/cli/v3" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" +) + +type BrowserFlags []cli.Flag + +var Flags = BrowserFlags{ + &cli.StringFlag{ + Name: "user-agent", + Aliases: []string{"ua"}, + Usage: "User-Agent to use for requests", + DefaultText: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", + }, + &cli.StringFlag{ + Name: "timeout", + Aliases: []string{"t"}, + Usage: "Timeout for requests", + DefaultText: "30s", + }, + &cli.StringFlag{ + Name: "browser", + Aliases: []string{"b"}, + Usage: "Browser to use, one of: chromium, firefox, webkit", + DefaultText: "firefox", + }, + &cli.StringFlag{ + Name: "cookies-file", + Aliases: []string{"c"}, + Usage: "cookies.txt file to load cookies from", + DefaultText: "", + }, + &cli.BoolFlag{ + Name: "visible", + Usage: "If set, the browser will be visible, if not set, the browser will be headless", + DefaultText: "false", + }, +} + +func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error) { + var opts extractor.PlayWrightBrowserOptions + + if ua := cmd.String("user-agent"); ua != "" { + opts.UserAgent = ua + } + + if to := cmd.String("timeout"); to != "" { + d, err := time.ParseDuration(to) + if err 
!= nil { + return nil, err + } + opts.Timeout = &d + } + + if b := cmd.String("browser"); b != "" { + opts.Browser = extractor.PlayWrightBrowserSelection(b) + } + + if cf := cmd.String("cookies-file"); cf != "" { + cookies, err := extractor.LoadCookiesFile(cf) + if err != nil { + return nil, err + } + opts.CookieJar = cookies + } + + opts.ShowBrowser = cmd.Bool("visible") + + return extractor.NewPlayWrightBrowser(opts) +} diff --git a/cookiejar.go b/cookiejar.go index 90af1a3..f08aa0f 100644 --- a/cookiejar.go +++ b/cookiejar.go @@ -1,20 +1,58 @@ package extractor import ( + "net/url" + "strings" "time" ) type Cookie struct { - Name string - Value string - Domain string + Host string Path string Expires time.Time Secure bool HttpOnly bool + Name string + Value string } + +func (c Cookie) IsTargetMatch(target string) (bool, error) { + u, err := url.Parse(target) + if err != nil { + return false, err + } + + // the host of the cookie is the same as the host of the target + // if the cookie host starts with a dot, that means it matches any subdomain + if c.Host == u.Host || strings.HasPrefix(c.Host, ".") && strings.HasSuffix(u.Host, c.Host) { + if c.Path != "" { + if !strings.HasPrefix(u.Path, c.Path) { + return false, nil + } + + // if the cookie path is a prefix of the target path, then it's a match + // so now these would both match: + // cookie path: /foo + // target path: /foo/bar + // cookie path: /foo + // target path: /foosball + // because foseball is not an actual match, we need to check to see that either the path is an exact match + // or that the next character in the target path is a slash + + if len(u.Path) > len(c.Path) && u.Path[len(c.Path)] != '/' { + return false, nil + } + + return true, nil + } + } + + return false, nil +} + type CookieJar interface { GetAll() ([]Cookie, error) + Get(url string) ([]Cookie, error) Set(cookie Cookie) error Delete(cookie Cookie) error } @@ -29,6 +67,10 @@ func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) { return r.Jar.GetAll() } +func (r ReadOnlyCookieJar) Get(url string) ([]Cookie, error) { + return r.Jar.Get(url) +} + func (r ReadOnlyCookieJar) Set(_ Cookie) error { return nil } diff --git a/cookies_txt.go b/cookies_txt.go new file mode 100644 index 0000000..112af97 --- /dev/null +++ b/cookies_txt.go @@ -0,0 +1,106 @@ +package extractor + +import ( + "bufio" + "io" + "os" + "strconv" + "strings" + "time" +) + +type staticCookieJar []Cookie + +// GetAll will return all cookies in the jar. +func (s *staticCookieJar) GetAll() ([]Cookie, error) { + return *s, nil +} + +// Get will, given a URL, return all cookies that are valid for that URL. +func (s *staticCookieJar) Get(target string) ([]Cookie, error) { + var validCookies []Cookie + + for _, cookie := range *s { + if match, err := cookie.IsTargetMatch(target); err != nil { + return nil, err + } else if match { + validCookies = append(validCookies, cookie) + } + } + + return validCookies, nil +} + +func (s *staticCookieJar) Set(cookie Cookie) error { + // see if the cookie already exists + for i, c := range *s { + if c.Name == cookie.Name && c.Host == cookie.Host && c.Path == cookie.Path { + (*s)[i] = cookie + return nil + } + } + + *s = append(*s, cookie) + return nil +} + +func (s *staticCookieJar) Delete(cookie Cookie) error { + for i, c := range *s { + if c.Name == cookie.Name && c.Host == cookie.Host && c.Path == cookie.Path { + *s = append((*s)[:i], (*s)[i+1:]...) + return nil + } + } + + return nil +} + +// LoadCookiesFile loads cookies from a file, in the format of cookies.txt. 
+func LoadCookiesFile(path string) (CookieJar, error) { + fp, err := os.Open(path) + if err != nil { + return nil, err + } + + defer func(cl io.Closer) { + _ = cl.Close() + }(fp) + + var cookies staticCookieJar + + scanner := bufio.NewScanner(fp) + + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue + } + + if line[0] == '#' { + continue + } + + parts := strings.Split(line, "\t") + + if len(parts) < 7 { + continue + } + + expiry, err := strconv.ParseInt(parts[4], 10, 64) + if err != nil { + expiry = time.Now().Add(180 * 24 * time.Hour).Unix() // Default expiry + } + + cookies = append(cookies, Cookie{ + Host: parts[0], + HttpOnly: strings.ToLower(parts[1]) == "true", + Path: parts[2], + Secure: strings.ToLower(parts[3]) == "true", + Name: parts[5], + Expires: time.Unix(expiry, 0), + Value: parts[6], + }) + } + + return &cookies, nil +} diff --git a/document.go b/document.go index fdedd77..48f6b3a 100644 --- a/document.go +++ b/document.go @@ -1,25 +1,27 @@ package extractor import ( + "fmt" "io" + "log/slog" + "time" "github.com/playwright-community/playwright-go" ) type Document interface { io.Closer + Node + URL() string + Refresh() error Content() (string, error) - Text() (string, error) - Screenshot() ([]byte, error) - Select(selector string) Documents - SelectFirst(selector string) Document - - ForEach(selector string, fn func(Document) error) error + WaitForNetworkIdle(timeout *time.Duration) error } type document struct { + node pw *playwright.Playwright browser playwright.Browser page playwright.Page @@ -35,64 +37,62 @@ func newDocument(pw *playwright.Playwright, browser playwright.Browser, page pla } root2 := page.Locator("html") - return document{ + + res := &document{ + node: node{ + locator: root2, + }, pw: pw, browser: browser, page: page, - locator: root2, root: root, - }, nil + } + + slog.Info("new document", "url", page.URL(), "root", root, "locator", root2) + + return res, nil } -func (p document) Close() error { - return p.page.Close() +func (d *document) Close() error { + return d.page.Close() } -func (p document) Content() (string, error) { - return p.locator.TextContent() +func (d *document) URL() string { + return d.page.URL() } -func (p document) Text() (string, error) { - return p.locator.InnerText() +func (d *document) Content() (string, error) { + return d.page.Content() } -func (p document) Screenshot() ([]byte, error) { - return p.locator.Screenshot() -} - -func (d document) Select(selector string) Documents { - - elements, err := d.locator.Locator(selector).All() +func (d *document) Refresh() error { + resp, err := d.page.Reload() if err != nil { - return nil + return fmt.Errorf("failed to reload page: %w", err) } - res := make(Documents, len(elements)) - for i, el := range elements { - res[i] = document{ - pw: d.pw, - browser: d.browser, - page: d.page, - locator: el, - } - } - - return res -} - -func (d document) SelectFirst(selector string) Document { - return d.Select(selector)[0] -} - -func (d document) ForEach(selector string, fn func(Document) error) error { - - e := d.Select(selector) - - for _, el := range e { - err := fn(el) - if err != nil { - return err - } + if resp.Status() != 200 { + return fmt.Errorf("invalid status code: %d", resp.Status()) } return nil } + +func (d *document) WaitForNetworkIdle(timeout *time.Duration) error { + + var f *float64 = nil + if timeout == nil { + t := 30 * time.Second + timeout = &t + } + + if timeout != nil { + ms := float64(timeout.Milliseconds()) + f = &ms + } + + err := 
d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ + State: playwright.LoadStateNetworkidle, + Timeout: f, + }) + return err +} diff --git a/documents.go b/documents.go deleted file mode 100644 index 735aef6..0000000 --- a/documents.go +++ /dev/null @@ -1,32 +0,0 @@ -package extractor - -type Documents []Document - -func (d Documents) Select(selector string) Documents { - var res Documents - - for _, doc := range d { - res = append(res, doc.Select(selector)...) - } - - return res -} - -func (d Documents) First() Document { - return d[0] -} - -func (d Documents) ExtractText() ([]string, error) { - var res []string - - for _, doc := range d { - text, err := doc.Text() - if err != nil { - return nil, err - } - - res = append(res, text) - } - - return res, nil -} diff --git a/extractor.go b/extractor.go deleted file mode 100644 index a112fe6..0000000 --- a/extractor.go +++ /dev/null @@ -1,7 +0,0 @@ -package extractor - -import "context" - -type Extractor interface { - Extract(ctx context.Context, src Source) (Article, error) -} diff --git a/go.mod b/go.mod index f73bdb9..94a389d 100644 --- a/go.mod +++ b/go.mod @@ -15,6 +15,7 @@ require ( github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect + github.com/urfave/cli/v3 v3.0.0-beta1 // indirect golang.org/x/net v0.32.0 // indirect golang.org/x/text v0.21.0 // indirect ) diff --git a/node.go b/node.go new file mode 100644 index 0000000..ecfba2b --- /dev/null +++ b/node.go @@ -0,0 +1,81 @@ +package extractor + +import ( + "github.com/playwright-community/playwright-go" +) + +type Node interface { + Content() (string, error) + Text() (string, error) + Attr(name string) (string, error) + Screenshot() ([]byte, error) + + Type(input string) error + Click() error + + Select(selector string) Nodes + SelectFirst(selector string) Node + + ForEach(selector string, fn func(Node) error) error +} + +type node struct { + locator playwright.Locator +} + +func (n node) Type(input string) error { + return n.locator.Type(input) +} + +func (n node) Click() error { + return n.locator.Click() +} + +func (n node) Content() (string, error) { + return n.locator.TextContent() +} + +func (n node) Text() (string, error) { + return n.locator.InnerText() +} + +func (n node) Attr(name string) (string, error) { + return n.locator.GetAttribute(name) +} + +func (n node) Screenshot() ([]byte, error) { + return n.locator.Screenshot() +} + +func (n node) Select(selector string) Nodes { + elements, err := n.locator.Locator(selector).All() + if err != nil { + return nil + } + + var nodes Nodes + for _, element := range elements { + nodes = append(nodes, node{locator: element}) + } + + return nodes +} + +func (n node) SelectFirst(selector string) Node { + return n.Select(selector).First() +} + +func (n node) ForEach(selector string, fn func(Node) error) error { + elements, err := n.locator.Locator(selector).All() + if err != nil { + return err + } + + for _, element := range elements { + if err := fn(node{locator: element}); err != nil { + return err + } + } + + return nil +} diff --git a/nodes.go b/nodes.go new file mode 100644 index 0000000..b88b5ec --- /dev/null +++ b/nodes.go @@ -0,0 +1,32 @@ +package extractor + +type Nodes []Node + +func (n Nodes) Select(selector string) Nodes { + var res Nodes + + for _, node := range n { + res = append(res, node.Select(selector)...) 
+ } + + return res +} + +func (d Nodes) First() Node { + return d[0] +} + +func (d Nodes) ExtractText() ([]string, error) { + var res []string + + for _, doc := range d { + text, err := doc.Text() + if err != nil { + return nil, err + } + + res = append(res, text) + } + + return res, nil +} diff --git a/playwright.go b/playwright.go index 779868d..123d672 100644 --- a/playwright.go +++ b/playwright.go @@ -25,6 +25,7 @@ type PlayWrightBrowserSelection string var ( ErrInvalidBrowserSelection = errors.New("invalid browser selection") + ErrPageNotFound = errors.New("page not found") ErrInvalidStatusCode = errors.New("invalid status code") ) @@ -35,20 +36,22 @@ const ( ) type PlayWrightBrowserOptions struct { - UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3" + UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0" Browser PlayWrightBrowserSelection // If unset defaults to Firefox. Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout // CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the // browser into the cookie jar for each request. CookieJar + + ShowBrowser bool // If false, browser will be headless } func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie { return playwright.OptionalCookie{ Name: cookie.Name, Value: cookie.Value, - Domain: playwright.String(cookie.Domain), + Domain: playwright.String(cookie.Host), Path: playwright.String(cookie.Path), Expires: playwright.Float(float64(cookie.Expires.Unix())), HttpOnly: playwright.Bool(cookie.HttpOnly), @@ -59,7 +62,7 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie { return Cookie{ Name: cookie.Name, Value: cookie.Value, - Domain: cookie.Domain, + Host: cookie.Domain, Path: cookie.Path, Expires: time.Unix(int64(cookie.Expires), 0), HttpOnly: cookie.HttpOnly, @@ -69,7 +72,7 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie { func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) { var thirtySeconds = 30 * time.Second opt := PlayWrightBrowserOptions{ - UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3", + UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", Browser: PlayWrightBrowserSelectionFirefox, Timeout: &thirtySeconds, } @@ -87,16 +90,23 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) { if o.CookieJar != nil { opt.CookieJar = o.CookieJar } - } - - err := playwright.Install() - if err != nil { - return nil, err + opt.ShowBrowser = o.ShowBrowser } pw, err := playwright.Run() + if err != nil { - return nil, err + err = playwright.Install() + + if err != nil { + return nil, err + } + + pw, err = playwright.Run() + + if err != nil { + return nil, err + } } var bt playwright.BrowserType @@ -116,7 +126,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) { } browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{ - Headless: playwright.Bool(true), + Headless: playwright.Bool(!opt.ShowBrowser), }) if err != nil { return nil, err @@ -175,21 +185,26 @@ func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page return nil } -func (b playWrightBrowser) openPage(_ context.Context, target string) 
(playwright.Page, error) { +func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenPageOptions) (playwright.Page, error) { page, err := b.ctx.NewPage() if err != nil { return nil, err } - opts := playwright.PageGotoOptions{ + pwOpts := playwright.PageGotoOptions{ WaitUntil: playwright.WaitUntilStateLoad, } if b.timeout > 0 { var ms = float64(b.timeout.Milliseconds()) - opts.Timeout = &ms + pwOpts.Timeout = &ms } - resp, err := page.Goto(target, opts) + + if opts.Referer != "" { + pwOpts.Referer = playwright.String(opts.Referer) + } + + resp, err := page.Goto(target, pwOpts) if err != nil { return nil, err } @@ -197,6 +212,14 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request()) if resp.Status() != 200 { + time.Sleep(999 * time.Hour * 24) + time.Sleep(25 * time.Second) + + _ = page.Close() + + if resp.Status() == 404 { + return nil, ErrPageNotFound + } slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request()) return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status()) } @@ -204,13 +227,12 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh return page, nil } -func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) { +func (b playWrightBrowser) Open(ctx context.Context, url string, opts OpenPageOptions) (Document, error) { - page, err := b.openPage(ctx, url) + page, err := b.openPage(ctx, url, opts) if err != nil { return nil, err } - defer page.Close() err = b.updateCookies(ctx, page) if err != nil { diff --git a/readability.go b/readability.go index 68ff195..74aff38 100644 --- a/readability.go +++ b/readability.go @@ -1,25 +1,26 @@ package extractor import ( + "bytes" "context" "net/url" "github.com/go-shiori/go-readability" ) -type Readability struct { - Extractor -} +func Readability(_ context.Context, doc Document) (Article, error) { + data, err := doc.Content() + if err != nil { + return Article{}, err + } -var _ Extractor = Readability{} - -func (r Readability) Extract(_ context.Context, src Source) (Article, error) { - u, err := url.Parse(src.URL()) + u, err := url.Parse(doc.URL()) if err != nil { return Article{}, err } - a, err := readability.FromReader(src.Reader(), u) + + a, err := readability.FromReader(bytes.NewBufferString(data), u) if err != nil { return Article{}, err diff --git a/sites/archive/archive.go b/sites/archive/archive.go new file mode 100644 index 0000000..e7f4cd3 --- /dev/null +++ b/sites/archive/archive.go @@ -0,0 +1,172 @@ +package archive + +import ( + "context" + "errors" + "fmt" + "io" + "log/slog" + "net/url" + "strings" + "time" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" +) + +type Config struct { + // Endpoint is the archive endpoint to use. If empty, archive.ph will be used. + Endpoint string + + // Timeout will, if set, cancel any Archive call after this duration. + // If nil, the default timeout of 1 hour will be used. + Timeout *time.Duration // Timeout for the request, defaults to 1 hour +} + +// validate validates the config and sets default values if necessary. 
+func (c Config) validate() Config { + + if c.Timeout == nil { + def := 1 * time.Hour + c.Timeout = &def + } + + if c.Endpoint == "" { + c.Endpoint = "https://archive.ph" + } + + return c +} + +var DefaultConfig = Config{} + +func deferClose(cl io.Closer) { + if cl != nil { + _ = cl.Close() + } +} + +// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not. +func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { + c = c.validate() + u, err := url.Parse(target) + if err != nil { + return nil, fmt.Errorf("invalid url: %w", err) + } + + endpoint, err := url.Parse(c.Endpoint) + if err != nil { + return nil, fmt.Errorf("invalid endpoint: %w", err) + } + + uri := endpoint.JoinPath("/newest") + + uri.Path = strings.TrimSuffix(uri.Path, "/") + "/" + u.String() + + slog.Info("checking if url is archived", "url", uri.String(), "config", c, "endpoint", endpoint) + + doc, err := b.Open(ctx, uri.String(), extractor.OpenPageOptions{}) + + if err != nil { + if doc != nil { + _ = doc.Close() + } + if errors.Is(err, extractor.ErrPageNotFound) { + return nil, nil + } + return nil, fmt.Errorf("failed to open url: %w", err) + } + + return doc, nil +} + +func IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { + return DefaultConfig.IsArchived(ctx, b, target) +} + +func (c Config) Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { + c = c.validate() + var cancel context.CancelFunc + + if c.Timeout != nil { + ctx, cancel = context.WithTimeout(ctx, *c.Timeout) + slog.Info("setting timeout", "timeout", *c.Timeout) + defer cancel() + } + u, err := url.Parse(target) + if err != nil { + return nil, fmt.Errorf("invalid url: %w", err) + } + + endpoint, err := url.Parse(c.Endpoint) + if err != nil { + return nil, fmt.Errorf("invalid endpoint: %w", err) + } + + doc, err := b.Open(ctx, c.Endpoint, extractor.OpenPageOptions{}) + + if err != nil { + if doc != nil { + _ = doc.Close() + } + return nil, fmt.Errorf("failed to open url: %w", err) + } + + err = doc.SelectFirst("input[name='url']").Type(u.String()) + + if err != nil { + _ = doc.Close() + return nil, fmt.Errorf("failed to type url: %w", err) + } + + err = doc.SelectFirst("form#submiturl input[type=\"submit\"]").Click() + + if err != nil { + _ = doc.Close() + return nil, fmt.Errorf("failed to click submit: %w", err) + } + + // wait for the page to load + time.Sleep(5 * time.Second) + + select { + case <-ctx.Done(): + fmt.Println("context already done before entering the loop:", ctx.Err()) + return nil, ctx.Err() + default: + fmt.Println("context not done yet") + // Proceed with the loop + } + // now we are waiting for archive.ph to archive the page and redirect us to the archived page + // the way we can tell this is happening is by checking the url of the page periodically + // if the page path starts with /wip/ then we are still waiting + // also periodically refresh the page just in case + + keepGoing := true + for keepGoing { + select { + case <-ctx.Done(): + slog.Info("context done") + keepGoing = false + + case <-time.NewTicker(5 * time.Second).C: + archivedUrl, err := url.Parse(doc.URL()) + + if err != nil { + continue + } + + fmt.Println("checking url:", archivedUrl.String()) + // if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done + if archivedUrl.Hostname() != endpoint.Hostname() || 
(!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) { + keepGoing = false + break + } + } + } + + return doc, doc.WaitForNetworkIdle(nil) +} + +func Archive(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) { + return DefaultConfig.Archive(ctx, b, target) +} diff --git a/sites/archive/cmd/archive/main.go b/sites/archive/cmd/archive/main.go new file mode 100644 index 0000000..8a44897 --- /dev/null +++ b/sites/archive/cmd/archive/main.go @@ -0,0 +1,129 @@ +package main + +import ( + "context" + "fmt" + "os" + "time" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" + + "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" + "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/archive" + + "github.com/urfave/cli/v3" +) + +type ArchiveFlags []cli.Flag + +var Flags = ArchiveFlags{ + &cli.StringFlag{ + Name: "endpoint", + Usage: "Archive endpoint to use", + DefaultText: "https://archive.ph", + }, + &cli.StringFlag{ + Name: "timeout", + Usage: "Timeout for requests", + DefaultText: "10s", + }, +} + +func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config { + c := archive.DefaultConfig + + if e := cmd.String("endpoint"); e != "" { + c.Endpoint = e + } + + if t := cmd.String("timeout"); t != "" { + d, err := time.ParseDuration(t) + if err != nil { + panic(err) + } + c.Timeout = &d + } + + return c +} + +func main() { + + var flags []cli.Flag + + flags = append(flags, browser.Flags...) + flags = append(flags, Flags...) + + cli := &cli.Command{ + Name: "archive", + Usage: "Archive a website", + Flags: Flags, + Action: func(ctx context.Context, cli *cli.Command) error { + + target := cli.Args().First() + + if target == "" { + return fmt.Errorf("usage: archive ") + } + + b, err := browser.FromCommand(ctx, cli) + if err != nil { + return err + } + + doc, err := archive.IsArchived(ctx, b, target) + + if err != nil { + return err + } + + if doc == nil { + fmt.Println("Not archived") + + doc, err = archive.Archive(ctx, b, target) + + if err != nil { + return err + } + + if doc == nil { + return fmt.Errorf("failed to archive") + } + } + + defer func(doc extractor.Document) { + fmt.Println("Closing document", doc.URL()) + err := doc.Close() + if err != nil { + fmt.Println("failed to close document", err) + } + }(doc) + + fmt.Println("Archived at ", doc.URL()) + + article, err := extractor.Readability(ctx, doc) + + if err != nil { + return err + } + + fmt.Println("Title:", article.Title) + fmt.Println("Byline:", article.Byline) + fmt.Println("Site:", article.SiteName) + fmt.Println("Published:", article.PublishedTime) + fmt.Println("Excerpt:", article.Excerpt) + fmt.Println("Length:", article.Length) + fmt.Println("Lang:", article.Lang) + fmt.Println("Content:", article.Content[:32]+"...") + fmt.Println("TextContent:", article.TextContent) + return nil + }, + } + + err := cli.Run(context.Background(), os.Args) + + if err != nil { + panic(err) + } + +} diff --git a/sites/megamillions/cmd/megamillions.go b/sites/megamillions/cmd/megamillions.go new file mode 100644 index 0000000..81cde2b --- /dev/null +++ b/sites/megamillions/cmd/megamillions.go @@ -0,0 +1,60 @@ +package main + +import ( + "context" + "fmt" + "os" + + "github.com/urfave/cli/v3" + + "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" + "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/megamillions" +) + +type MegaMillionsFlags []cli.Flag + +var Flags = MegaMillionsFlags{} + +func (f 
MegaMillionsFlags) ToConfig(_ *cli.Command) megamillions.Config { + c := megamillions.DefaultConfig + return c +} + +func main() { + var flags []cli.Flag + + flags = append(flags, browser.Flags...) + flags = append(flags, Flags...) + + cli := &cli.Command{ + Name: "megamillions", + Usage: "Get MegaMillions information", + Flags: flags, + + Action: func(ctx context.Context, cli *cli.Command) error { + b, err := browser.FromCommand(ctx, cli) + + if err != nil { + return err + } + + draw, next, err := Flags.ToConfig(cli).GetCurrent(ctx, b) + + if err != nil { + return err + } + + fmt.Printf("Drawing: %+v\n", draw) + fmt.Printf("Next Drawing: %+v\n", next) + + return nil + }, + } + + err := cli.Run(context.Background(), os.Args) + + if err != nil { + panic(err) + } + +} diff --git a/sites/megamillions/megamillions.go b/sites/megamillions/megamillions.go new file mode 100644 index 0000000..4e9d414 --- /dev/null +++ b/sites/megamillions/megamillions.go @@ -0,0 +1,252 @@ +package megamillions + +import ( + "context" + "fmt" + "io" + "strconv" + "strings" + "time" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" + + "golang.org/x/text/currency" +) + +type Config struct{} + +var DefaultConfig = Config{} + +func (c Config) validate() Config { + return c +} + +type Drawing struct { + Date time.Time + Numbers [5]int + MegaBall int + Megaplier int +} + +type NextDrawing struct { + Date string + Jackpot currency.Amount +} + +func deferClose(cl io.Closer) { + if cl != nil { + _ = cl.Close() + } +} + +func netTicksToTime(t int64) time.Time { + return time.Unix(0, t*100).Add(-621355968000000000) +} + +func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) { + var drawing Drawing + + // the drawdate is stored as a .net ticks value in the data-playdateticks attribute of a + // span with the id of "lastestDate" + + date := doc.Select("span#lastestDate") + if len(date) != 1 { + return nil, fmt.Errorf("expected 1 date, got %d", len(date)) + } + + txt, err := date[0].Attr("data-playdateticks") + if err != nil { + return nil, fmt.Errorf("failed to get date: %w", err) + } + + ticks, err := strconv.ParseInt(txt, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to parse date: %w", err) + } + + fmt.Println("ticks", ticks) + drawing.Date = netTicksToTime(ticks) + + err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error { + classes, err := n.Attr("class") + + if err != nil { + return err + } + + txt, err := n.Text() + + if err != nil { + return err + } + + val, err := strconv.Atoi(txt) + + if err != nil { + return err + } + + if strings.Contains(classes, "winNum1") { + drawing.Numbers[0] = val + return nil + } + + if strings.Contains(classes, "winNum2") { + drawing.Numbers[1] = val + return nil + } + + if strings.Contains(classes, "winNum3") { + drawing.Numbers[2] = val + return nil + } + + if strings.Contains(classes, "winNum4") { + drawing.Numbers[3] = val + return nil + } + + if strings.Contains(classes, "winNum5") { + drawing.Numbers[4] = val + return nil + } + + if strings.Contains(classes, "winNumMB") { + drawing.MegaBall = val + return nil + } + return fmt.Errorf("unknown li.ball class: %s", classes) + }) + if err != nil { + return nil, fmt.Errorf("failed to get numbers: %w", err) + } + + megaplier := doc.Select("span.megaplier span.winNumMP") + + if len(megaplier) != 1 { + return nil, fmt.Errorf("expected 1 megaplier, got %d", len(megaplier)) + } + + // megaplier is in the format of "2X" or "3X" etc. 
+ + txt, err = megaplier[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get megaplier: %w", err) + } + + val, err := strconv.Atoi(strings.ReplaceAll(strings.ReplaceAll(txt, "X", ""), "x", "")) + + if err != nil { + return nil, fmt.Errorf("failed to convert megaplier to int: %w", err) + } + drawing.Megaplier = val + + return &drawing, nil +} + +func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) { + var nextDrawing NextDrawing + + date := doc.Select("div.nextEstGroup span.nextDrawDate") + if len(date) != 1 { + return nil, fmt.Errorf("expected 1 date, got %d", len(date)) + } + + var err error + nextDrawing.Date, err = date[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get date: %w", err) + } + + jackpot := doc.Select("div.nextEstGroup span.nextEstVal") + + if len(jackpot) != 1 { + return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot)) + } + + txt, err := jackpot[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get jackpot: %w", err) + } + + // jackpot is in the format of "$1.5 billion", "$100 million", or "$200,000" etc + + // make one filter to only get the numeric part of the jackpot + + numericOnly := func(in string) float64 { + var out string + for _, r := range in { + if r >= '0' && r <= '9' { + out += string(r) + } + + if r == '.' { + out += string(r) + } + } + + val, err := strconv.ParseFloat(out, 64) + + if err != nil { + return 0 + } + + return val + } + + numeric := numericOnly(txt) + + set := false + if strings.Contains(txt, "Billion") { + amt := currency.USD.Amount(numeric * 1000000000) + nextDrawing.Jackpot = amt + set = true + } else if strings.Contains(txt, "Million") { + amt := currency.USD.Amount(numeric * 1000000) + nextDrawing.Jackpot = amt + set = true + } else { + amt := currency.USD.Amount(numeric) + nextDrawing.Jackpot = amt + set = true + } + + if !set { + return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err) + } + + return &nextDrawing, nil +} + +func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) { + c = c.validate() + + doc, err := b.Open(ctx, "https://www.megamillions.com/", extractor.OpenPageOptions{}) + + if err != nil { + return nil, nil, err + } + + defer deferClose(doc) + + d, err := getDrawing(ctx, doc) + + if err != nil { + return nil, nil, err + } + + nd, err := getNextDrawing(ctx, doc) + + if err != nil { + return nil, nil, err + } + + return d, nd, nil +} + +func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) { + return DefaultConfig.GetCurrent(ctx, b) +} diff --git a/sites/powerball/cmd/powerball.go b/sites/powerball/cmd/powerball.go new file mode 100644 index 0000000..5801add --- /dev/null +++ b/sites/powerball/cmd/powerball.go @@ -0,0 +1,60 @@ +package main + +import ( + "context" + "fmt" + "os" + + "github.com/urfave/cli/v3" + + "gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser" + "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/powerball" +) + +type PowerballFlags []cli.Flag + +var Flags = PowerballFlags{} + +func (f PowerballFlags) ToConfig(_ *cli.Command) powerball.Config { + c := powerball.DefaultConfig + return c +} + +func main() { + var flags []cli.Flag + + flags = append(flags, browser.Flags...) + flags = append(flags, Flags...) 
+ + cli := &cli.Command{ + Name: "powerball", + Usage: "Get Powerball information", + Flags: flags, + + Action: func(ctx context.Context, cli *cli.Command) error { + b, err := browser.FromCommand(ctx, cli) + + if err != nil { + return err + } + + draw, next, err := Flags.ToConfig(cli).GetCurrent(ctx, b) + + if err != nil { + return err + } + + fmt.Printf("Drawing: %+v\n", draw) + fmt.Printf("Next Drawing: %+v\n", next) + + return nil + }, + } + + err := cli.Run(context.Background(), os.Args) + + if err != nil { + panic(err) + } + +} diff --git a/sites/powerball/powerball.go b/sites/powerball/powerball.go new file mode 100644 index 0000000..1fd047c --- /dev/null +++ b/sites/powerball/powerball.go @@ -0,0 +1,216 @@ +package powerball + +import ( + "context" + "fmt" + "io" + "strconv" + "strings" + "time" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" + + "golang.org/x/text/currency" +) + +type Config struct { +} + +var DefaultConfig = Config{} + +func (c Config) validate() Config { + return c +} + +type Drawing struct { + Date time.Time + Numbers [5]int + PowerBall int + PowerPlay int +} + +type NextDrawing struct { + Date string + Jackpot currency.Amount +} + +func deferClose(cl io.Closer) { + if cl != nil { + _ = cl.Close() + } +} + +func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) { + var drawing Drawing + + nums := doc.Select("div.game-ball-group div.white-balls") + + if len(nums) != 5 { + return nil, fmt.Errorf("expected 5 white balls, got %d", len(nums)) + } + + for i, num := range nums { + txt, err := num.Text() + + if err != nil { + return nil, fmt.Errorf("failed to get white ball %d: %w", i, err) + } + + val, err := strconv.Atoi(txt) + + if err != nil { + return nil, fmt.Errorf("failed to convert white ball %d to int: %w", i, err) + } + drawing.Numbers[i] = val + } + + powerball := doc.Select("div.game-ball-group div.powerball") + + if len(powerball) != 1 { + return nil, fmt.Errorf("expected 1 powerball, got %d", len(powerball)) + } + + txt, err := powerball[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get powerball: %w", err) + } + + val, err := strconv.Atoi(txt) + + if err != nil { + return nil, fmt.Errorf("failed to convert powerball to int: %w", err) + } + + drawing.PowerBall = val + + powerplay := doc.Select("span.power-play span.multiplier") + + if len(powerplay) != 1 { + return nil, fmt.Errorf("expected 1 powerplay, got %d", len(powerplay)) + } + + // powerplay is in the format of "2X" or "3X" etc. 
+ + txt, err = powerplay[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get powerplay: %w", err) + } + + val, err = strconv.Atoi(strings.ReplaceAll(strings.ReplaceAll(txt, "X", ""), "x", "")) + + if err != nil { + return nil, fmt.Errorf("failed to convert powerplay to int: %w", err) + } + drawing.PowerPlay = val + + return &drawing, nil +} + +func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, error) { + var nextDrawing NextDrawing + + date := doc.Select("div.next-powerball h5.title-date") + + if len(date) != 1 { + return nil, fmt.Errorf("expected 1 date, got %d", len(date)) + } + + var err error + nextDrawing.Date, err = date[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get date: %w", err) + } + + jackpot := doc.Select("div.next-powerball div.game-detail-group span.game-jackpot-number") + + if len(jackpot) != 1 { + return nil, fmt.Errorf("expected 1 jackpot, got %d", len(jackpot)) + } + + txt, err := jackpot[0].Text() + + if err != nil { + return nil, fmt.Errorf("failed to get jackpot: %w", err) + } + + // jackpot is in the format of "$1.5 billion", "$100 million", or "$200,000" etc + + // make one filter to only get the numeric part of the jackpot + + numericOnly := func(in string) float64 { + var out string + for _, r := range in { + if r >= '0' && r <= '9' { + out += string(r) + } + + if r == '.' { + out += string(r) + } + } + + val, err := strconv.ParseFloat(out, 64) + + if err != nil { + return 0 + } + + return val + } + + numeric := numericOnly(txt) + + set := false + if strings.Contains(txt, "Billion") { + amt := currency.USD.Amount(numeric * 1000000000) + nextDrawing.Jackpot = amt + set = true + } else if strings.Contains(txt, "Million") { + amt := currency.USD.Amount(numeric * 1000000) + nextDrawing.Jackpot = amt + set = true + } else { + amt := currency.USD.Amount(numeric) + nextDrawing.Jackpot = amt + set = true + } + + if !set { + return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err) + } + + return &nextDrawing, nil +} + +func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) { + c = c.validate() + + doc, err := b.Open(ctx, "https://www.powerball.com/", extractor.OpenPageOptions{}) + + if err != nil { + return nil, nil, err + } + + defer deferClose(doc) + + d, err := getDrawing(ctx, doc) + + if err != nil { + return nil, nil, err + } + + nd, err := getNextDrawing(ctx, doc) + + if err != nil { + return nil, nil, err + } + + return d, nd, nil +} + +func GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing, *NextDrawing, error) { + return DefaultConfig.GetCurrent(ctx, b) +}
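
The Browser.Open signature now takes an OpenPageOptions value whose only field, Referer, is forwarded to Playwright's Goto call when set. A minimal end-to-end sketch against the new API (the URL and referer are placeholders, and error handling is abbreviated):

    package main

    import (
        "context"
        "fmt"

        "gitea.stevedudenhoeffer.com/steve/go-extractor"
    )

    func main() {
        ctx := context.Background()

        // Empty options fall back to the defaults above: headless Firefox,
        // the Firefox user agent string, and a 30 second timeout.
        b, err := extractor.NewPlayWrightBrowser(extractor.PlayWrightBrowserOptions{})
        if err != nil {
            panic(err)
        }
        defer b.Close()

        doc, err := b.Open(ctx, "https://example.com/article", extractor.OpenPageOptions{
            Referer: "https://example.com/", // optional; OpenPageOptions{} also works
        })
        if err != nil {
            panic(err)
        }
        defer doc.Close()

        // A nil timeout uses the 30 second default inside WaitForNetworkIdle.
        if err := doc.WaitForNetworkIdle(nil); err != nil {
            panic(err)
        }

        article, err := extractor.Readability(ctx, doc)
        if err != nil {
            panic(err)
        }

        fmt.Println(article.Title, article.Byline, article.SiteName)
    }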
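
Cookie.IsTargetMatch applies the matching rules spelled out in its comments: a Host with a leading dot matches any subdomain, and a non-empty Path must be a prefix of the target path that ends on a path-segment boundary. A short illustration with made-up hosts and paths, reusing the import from the sketch above:

    c := extractor.Cookie{Host: ".example.com", Path: "/foo", Name: "session", Value: "abc"}

    // true: "app.example.com" ends with ".example.com" and "/foo" is a full
    // segment prefix of "/foo/bar".
    ok, _ := c.IsTargetMatch("https://app.example.com/foo/bar")

    // false: "/foo" is a string prefix of "/foosball" but not a segment prefix.
    nope, _ := c.IsTargetMatch("https://app.example.com/foosball")

    fmt.Println(ok, nope) // true false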
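
LoadCookiesFile reads tab-separated cookies.txt lines and maps the columns in the order the loop indexes them: host, http-only flag, path, secure flag, unix expiry, name, value; blank lines and lines starting with '#' are skipped, and an unparsable expiry falls back to roughly 180 days from now. Wiring a loaded jar into the Playwright browser looks like this (the file path is a placeholder):

    jar, err := extractor.LoadCookiesFile("/path/to/cookies.txt")
    if err != nil {
        panic(err)
    }

    // The embedded CookieJar field seeds the browser context with these cookies
    // and writes cookies back into the jar after each request.
    b, err := extractor.NewPlayWrightBrowser(extractor.PlayWrightBrowserOptions{
        CookieJar: jar,
    })
    if err != nil {
        panic(err)
    }
    defer b.Close()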
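
Document now embeds the Node interface from node.go, so selection and extraction go through Select, SelectFirst, ForEach, Attr, Text, Type and Click. A small helper sketch that collects link targets from an open Document (the selector and names are illustrative):

    // collectLinks returns the href attribute of every matching anchor.
    func collectLinks(doc extractor.Document) ([]string, error) {
        var links []string

        err := doc.ForEach("a[href]", func(n extractor.Node) error {
            href, err := n.Attr("href")
            if err != nil {
                return err
            }
            links = append(links, href)
            return nil
        })

        return links, err
    }

Note that Nodes.First indexes the slice directly, so SelectFirst on a selector with no matches will panic; Select plus a length check is the safer path when the element may be absent.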
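
Config.Archive polls the page URL every five seconds until the document has been redirected off the archive host's /wip/ and /submit paths, or the context is cancelled. The same exit conditions can be expressed with a single ticker that is stopped when the loop ends; a sketch, reusing the ctx, doc and endpoint variables from that function:

    ticker := time.NewTicker(5 * time.Second)
    defer ticker.Stop()

    for keepGoing := true; keepGoing; {
        select {
        case <-ctx.Done():
            keepGoing = false
        case <-ticker.C:
            archivedUrl, err := url.Parse(doc.URL())
            if err != nil {
                continue
            }
            // Done once we are no longer on the endpoint's /wip/ or /submit pages.
            if archivedUrl.Hostname() != endpoint.Hostname() ||
                (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
                keepGoing = false
            }
        }
    }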
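
getDrawing in megamillions.go reads the draw date from the data-playdateticks attribute, which carries a .NET ticks value: 100-nanosecond intervals counted from 0001-01-01 UTC, with the Unix epoch falling at tick 621355968000000000. For reference, a common way to write that conversion in Go (a standalone sketch, not the netTicksToTime helper above):

    // ticksToTime converts .NET ticks (100 ns units since 0001-01-01 UTC) to a
    // time.Time by shifting to the Unix epoch before scaling to nanoseconds.
    func ticksToTime(ticks int64) time.Time {
        const unixEpochTicks = 621355968000000000
        return time.Unix(0, (ticks-unixEpochTicks)*100).UTC()
    }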