99 lines
1.7 KiB
Go
99 lines
1.7 KiB
Go
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"github.com/playwright-community/playwright-go"
|
|
)
|
|
|
|
type Document interface {
|
|
io.Closer
|
|
Node
|
|
|
|
URL() string
|
|
Refresh() error
|
|
Content() (string, error)
|
|
|
|
WaitForNetworkIdle(timeout *time.Duration) error
|
|
}
|
|
|
|
type document struct {
|
|
node
|
|
pw *playwright.Playwright
|
|
browser playwright.Browser
|
|
page playwright.Page
|
|
root playwright.ElementHandle
|
|
locator playwright.Locator
|
|
}
|
|
|
|
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
|
root, err := page.QuerySelector("html")
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
root2 := page.Locator("html")
|
|
|
|
res := &document{
|
|
node: node{
|
|
locator: root2,
|
|
},
|
|
pw: pw,
|
|
browser: browser,
|
|
page: page,
|
|
root: root,
|
|
}
|
|
|
|
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
|
|
|
|
return res, nil
|
|
}
|
|
func (d *document) Close() error {
|
|
return d.page.Close()
|
|
}
|
|
|
|
func (d *document) URL() string {
|
|
return d.page.URL()
|
|
}
|
|
|
|
func (d *document) Content() (string, error) {
|
|
return d.page.Content()
|
|
}
|
|
|
|
func (d *document) Refresh() error {
|
|
resp, err := d.page.Reload()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to reload page: %w", err)
|
|
}
|
|
|
|
if resp.Status() != 200 {
|
|
return fmt.Errorf("invalid status code: %d", resp.Status())
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
|
|
|
var f *float64 = nil
|
|
if timeout == nil {
|
|
t := 30 * time.Second
|
|
timeout = &t
|
|
}
|
|
|
|
if timeout != nil {
|
|
ms := float64(timeout.Milliseconds())
|
|
f = &ms
|
|
}
|
|
|
|
err := d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
State: playwright.LoadStateNetworkidle,
|
|
Timeout: f,
|
|
})
|
|
return err
|
|
}
|