added archive, megamillions, and powerball site logic

This commit is contained in:
2024-12-23 03:18:50 -05:00
parent 5e924eb3f9
commit 567a9f9212
19 changed files with 1412 additions and 118 deletions

View File

@@ -1,25 +1,27 @@
package extractor
import (
"fmt"
"io"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
// Document is a handle on a browser page that can be closed, reloaded,
// inspected, and queried with selectors.
type Document interface {
// Close is provided through io.Closer.
io.Closer
// Node is embedded as well; its method set is declared outside this view.
Node
// URL returns the address of the page as a string.
URL() string
// Refresh reloads the page and returns an error if the reload fails.
// NOTE(review): the implementation in this file also rejects any response
// status other than 200 - confirm that is the intended contract.
Refresh() error
// Content returns the document's content as a string.
Content() (string, error)
// Text returns the document's text as a string.
Text() (string, error)
// Screenshot returns an image of the document as raw bytes.
Screenshot() ([]byte, error)
// Select returns the Documents matched by selector.
Select(selector string) Documents
// SelectFirst returns the first Document matched by selector.
SelectFirst(selector string) Document
// ForEach calls fn for each Document matched by selector and returns the
// first error reported by fn, if any.
ForEach(selector string, fn func(Document) error) error
// WaitForNetworkIdle waits for the page to become network-idle. timeout is
// optional; the implementation in this file substitutes 30 seconds when it
// is nil.
WaitForNetworkIdle(timeout *time.Duration) error
}
type document struct {
node
pw *playwright.Playwright
browser playwright.Browser
page playwright.Page
@@ -35,64 +37,62 @@ func newDocument(pw *playwright.Playwright, browser playwright.Browser, page pla
}
root2 := page.Locator("html")
return document{
res := &document{
node: node{
locator: root2,
},
pw: pw,
browser: browser,
page: page,
locator: root2,
root: root,
}, nil
}
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
return res, nil
}
func (p document) Close() error {
return p.page.Close()
func (d *document) Close() error {
return d.page.Close()
}
func (p document) Content() (string, error) {
return p.locator.TextContent()
func (d *document) URL() string {
return d.page.URL()
}
func (p document) Text() (string, error) {
return p.locator.InnerText()
func (d *document) Content() (string, error) {
return d.page.Content()
}
// Screenshot asks the document's locator for a screenshot and hands back the
// resulting bytes together with any error the locator reported.
func (d document) Screenshot() ([]byte, error) {
	shot, err := d.locator.Screenshot()
	return shot, err
}
func (d document) Select(selector string) Documents {
elements, err := d.locator.Locator(selector).All()
func (d *document) Refresh() error {
resp, err := d.page.Reload()
if err != nil {
return nil
return fmt.Errorf("failed to reload page: %w", err)
}
res := make(Documents, len(elements))
for i, el := range elements {
res[i] = document{
pw: d.pw,
browser: d.browser,
page: d.page,
locator: el,
}
}
return res
}
// SelectFirst returns the first Document matched by selector.
//
// It returns nil when Select yields no Documents (an empty or nil result),
// instead of panicking with an index-out-of-range error. Callers should check
// the result for nil before using it.
func (d document) SelectFirst(selector string) Document {
	matches := d.Select(selector)
	if len(matches) == 0 {
		// Return a literal nil so the caller sees a nil interface value.
		return nil
	}
	return matches[0]
}
func (d document) ForEach(selector string, fn func(Document) error) error {
e := d.Select(selector)
for _, el := range e {
err := fn(el)
if err != nil {
return err
}
if resp.Status() != 200 {
return fmt.Errorf("invalid status code: %d", resp.Status())
}
return nil
}
// WaitForNetworkIdle blocks until the page reaches the network-idle load
// state or the timeout elapses.
//
// timeout is the maximum time to wait; when it is nil, 30 seconds is used.
// The pointer is only read. The error from page.WaitForLoadState is returned
// unchanged.
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
	const defaultTimeout = 30 * time.Second

	wait := defaultTimeout
	if timeout != nil {
		wait = *timeout
	}

	// The options struct takes the timeout as a *float64 in milliseconds.
	ms := float64(wait.Milliseconds())
	return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
		State:   playwright.LoadStateNetworkidle,
		Timeout: &ms,
	})
}