go-extractor/document.go

99 lines
1.7 KiB
Go
Raw Permalink Normal View History

package extractor
import (
"fmt"
"io"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
// Document is a handle on one loaded browser page. It combines the Node
// behaviour (declared elsewhere in this package) with page-level operations.
type Document interface {
// Close releases the document; the implementation in this file closes the
// underlying page only.
io.Closer
Node
// URL returns the page's current URL.
URL() string
// Refresh reloads the page and returns an error if the reload fails or the
// response status is not 200.
Refresh() error
// Content returns the page content as reported by the underlying page.
Content() (string, error)
// WaitForNetworkIdle waits for the page's network-idle load state. A nil
// timeout selects a default of 30 seconds.
WaitForNetworkIdle(timeout *time.Duration) error
}
// document is the playwright-backed implementation of Document. It embeds
// node, whose locator is bound to the page's "html" element by newDocument.
type document struct {
node
// pw and browser are stored by newDocument; none of the methods visible in
// this file use them.
pw *playwright.Playwright
browser playwright.Browser
// page is the playwright page every Document method delegates to.
page playwright.Page
// root is the element handle returned by page.QuerySelector("html").
root playwright.ElementHandle
// NOTE(review): newDocument never assigns this field, and it shadows the
// embedded node's locator field, so d.locator is nil unless set elsewhere.
// Confirm whether it is still needed.
locator playwright.Locator
}
// newDocument wraps an already-opened page in a Document.
//
// It resolves the page's "html" element both as an element handle (stored in
// root) and as a locator (stored in the embedded node), logs the new document
// at info level, and returns it. pw and browser are stored on the result
// unchanged.
//
// An error from the "html" query is returned wrapped with context, matching
// the error style used by Refresh; the original error stays reachable via
// errors.Is / errors.As.
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
	root, err := page.QuerySelector("html")
	if err != nil {
		return nil, fmt.Errorf("failed to query root element: %w", err)
	}

	rootLocator := page.Locator("html")
	doc := &document{
		node: node{
			locator: rootLocator,
		},
		pw:      pw,
		browser: browser,
		page:    page,
		root:    root,
	}

	slog.Info("new document", "url", page.URL(), "root", root, "locator", rootLocator)
	return doc, nil
}
// Close closes the underlying page and returns the error from that call.
// It does not touch the stored browser or playwright instance.
func (d *document) Close() error {
return d.page.Close()
}
// URL returns the URL currently reported by the underlying page.
func (d *document) URL() string {
return d.page.URL()
}
// Content returns whatever the underlying page reports as its content,
// together with any error from that call.
func (d *document) Content() (string, error) {
return d.page.Content()
}
// Refresh reloads the underlying page and validates the outcome.
//
// It returns a wrapped error when the reload itself fails, and an
// "invalid status code" error when the main response carries a status other
// than 200.
//
// A reload can succeed without producing a response object (Playwright
// documents the result as nullable). There is no status to validate in that
// case, so it is treated as success rather than dereferencing a nil response.
func (d *document) Refresh() error {
	resp, err := d.page.Reload()
	if err != nil {
		return fmt.Errorf("failed to reload page: %w", err)
	}
	if resp == nil {
		// Successful navigation with no main-resource response.
		return nil
	}
	if status := resp.Status(); status != 200 {
		return fmt.Errorf("invalid status code: %d", status)
	}
	return nil
}
// WaitForNetworkIdle asks the underlying page to wait for its network-idle
// load state and returns the error from that call.
//
// timeout is optional: nil selects a default of 30 seconds. The duration is
// passed on as a millisecond count, so sub-millisecond precision is dropped.
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
	limit := 30 * time.Second
	if timeout != nil {
		limit = *timeout
	}

	millis := float64(limit.Milliseconds())
	return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
		State:   playwright.LoadStateNetworkidle,
		Timeout: &millis,
	})
}