added archive, megamillions, and powerball site logic
This commit is contained in:
98
document.go
98
document.go
@@ -1,25 +1,27 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
type Document interface {
|
||||
io.Closer
|
||||
Node
|
||||
|
||||
URL() string
|
||||
Refresh() error
|
||||
Content() (string, error)
|
||||
Text() (string, error)
|
||||
Screenshot() ([]byte, error)
|
||||
|
||||
Select(selector string) Documents
|
||||
SelectFirst(selector string) Document
|
||||
|
||||
ForEach(selector string, fn func(Document) error) error
|
||||
WaitForNetworkIdle(timeout *time.Duration) error
|
||||
}
|
||||
|
||||
type document struct {
|
||||
node
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
page playwright.Page
|
||||
@@ -35,64 +37,62 @@ func newDocument(pw *playwright.Playwright, browser playwright.Browser, page pla
|
||||
}
|
||||
|
||||
root2 := page.Locator("html")
|
||||
return document{
|
||||
|
||||
res := &document{
|
||||
node: node{
|
||||
locator: root2,
|
||||
},
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
page: page,
|
||||
locator: root2,
|
||||
root: root,
|
||||
}, nil
|
||||
}
|
||||
|
||||
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
|
||||
|
||||
return res, nil
|
||||
}
|
||||
func (p document) Close() error {
|
||||
return p.page.Close()
|
||||
func (d *document) Close() error {
|
||||
return d.page.Close()
|
||||
}
|
||||
|
||||
func (p document) Content() (string, error) {
|
||||
return p.locator.TextContent()
|
||||
func (d *document) URL() string {
|
||||
return d.page.URL()
|
||||
}
|
||||
|
||||
func (p document) Text() (string, error) {
|
||||
return p.locator.InnerText()
|
||||
func (d *document) Content() (string, error) {
|
||||
return d.page.Content()
|
||||
}
|
||||
|
||||
func (p document) Screenshot() ([]byte, error) {
|
||||
return p.locator.Screenshot()
|
||||
}
|
||||
|
||||
func (d document) Select(selector string) Documents {
|
||||
|
||||
elements, err := d.locator.Locator(selector).All()
|
||||
func (d *document) Refresh() error {
|
||||
resp, err := d.page.Reload()
|
||||
if err != nil {
|
||||
return nil
|
||||
return fmt.Errorf("failed to reload page: %w", err)
|
||||
}
|
||||
|
||||
res := make(Documents, len(elements))
|
||||
for i, el := range elements {
|
||||
res[i] = document{
|
||||
pw: d.pw,
|
||||
browser: d.browser,
|
||||
page: d.page,
|
||||
locator: el,
|
||||
}
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (d document) SelectFirst(selector string) Document {
|
||||
return d.Select(selector)[0]
|
||||
}
|
||||
|
||||
func (d document) ForEach(selector string, fn func(Document) error) error {
|
||||
|
||||
e := d.Select(selector)
|
||||
|
||||
for _, el := range e {
|
||||
err := fn(el)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if resp.Status() != 200 {
|
||||
return fmt.Errorf("invalid status code: %d", resp.Status())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
||||
|
||||
var f *float64 = nil
|
||||
if timeout == nil {
|
||||
t := 30 * time.Second
|
||||
timeout = &t
|
||||
}
|
||||
|
||||
if timeout != nil {
|
||||
ms := float64(timeout.Milliseconds())
|
||||
f = &ms
|
||||
}
|
||||
|
||||
err := d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||
State: playwright.LoadStateNetworkidle,
|
||||
Timeout: f,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
Reference in New Issue
Block a user