package extractor

import (
	"fmt"
	"io"
	"log/slog"
	"time"

	"github.com/playwright-community/playwright-go"
)

// Document is a page-level handle. It combines the Node operations
// (declared elsewhere in this package) with page-wide actions such as
// reloading, reading the full content, and waiting for the network to settle.
type Document interface {
	io.Closer
	Node
	// URL returns the URL currently reported by the underlying page.
	URL() string
	// Refresh reloads the page and reports an error unless the reload
	// produced a response with status code 200.
	Refresh() error
	// Content returns the page content as reported by the underlying page.
	Content() (string, error)
	// WaitForNetworkIdle blocks until the page reaches the "networkidle"
	// load state. A nil timeout means 30 seconds.
	WaitForNetworkIdle(timeout *time.Duration) error
}

// document is the Playwright-backed implementation of Document.
//
// The embedded node is built from a locator for the "html" element, page is
// the Playwright page every method delegates to, and root is the element
// handle returned by querying "html" at construction time. pw and browser
// are stored as passed to newDocument. The locator field is not assigned by
// newDocument.
type document struct {
	node
	pw      *playwright.Playwright
	browser playwright.Browser
	page    playwright.Page
	root    playwright.ElementHandle
	locator playwright.Locator
}

// newDocument wraps page in a Document rooted at its "html" element.
//
// It returns an error if querying the "html" element fails. On success it
// logs the page URL together with the resolved root handle and locator.
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
	root, err := page.QuerySelector("html")
	if err != nil {
		return nil, err
	}

	htmlLocator := page.Locator("html")
	res := &document{
		node: node{
			locator: htmlLocator,
		},
		pw:      pw,
		browser: browser,
		page:    page,
		root:    root,
	}

	slog.Info("new document", "url", page.URL(), "root", root, "locator", htmlLocator)
	return res, nil
}

// Close closes the underlying page.
func (d *document) Close() error {
	return d.page.Close()
}

// URL returns the URL currently reported by the underlying page.
func (d *document) URL() string {
	return d.page.URL()
}

// Content returns the page content as reported by the underlying page.
func (d *document) Content() (string, error) {
	return d.page.Content()
}

// Refresh reloads the page.
//
// It returns an error if the reload itself fails, if the reload yields no
// response, or if the response status code is anything other than 200.
func (d *document) Refresh() error {
	resp, err := d.page.Reload()
	if err != nil {
		return fmt.Errorf("failed to reload page: %w", err)
	}
	// Guard against a nil response so that resp.Status() below cannot
	// dereference nil and panic.
	if resp == nil {
		return fmt.Errorf("reload returned no response")
	}
	if status := resp.Status(); status != 200 {
		return fmt.Errorf("invalid status code: %d", status)
	}
	return nil
}

// WaitForNetworkIdle waits until the page reaches the "networkidle" load
// state.
//
// timeout bounds the wait; when it is nil a default of 30 seconds is used.
// The duration is handed to Playwright as a millisecond count. The caller's
// value is never modified.
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
	const defaultTimeout = 30 * time.Second

	wait := defaultTimeout
	if timeout != nil {
		wait = *timeout
	}
	ms := float64(wait.Milliseconds())

	return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
		State:   playwright.LoadStateNetworkidle,
		Timeout: &ms,
	})
}