Files
go-extractor/document.go
Steve Dudenhoeffer e7b7e78796
Some checks failed
CI / vet (push) Failing after 15s
CI / build (push) Failing after 30s
CI / test (push) Failing after 36s
fix: bug fixes, test coverage, and CI workflow
- Fix Nodes.First() panic on empty slice (return nil)
- Fix ticker leak in archive.go (create once, defer Stop)
- Fix cookie path matching for empty and root paths
- Fix lost query params in google.go (u.Query().Set was discarded)
- Fix type assertion panic in useragents.go
- Fix dropped date parse error in powerball.go
- Remove unreachable dead code in megamillions.go and powerball.go
- Simplify document.go WaitForNetworkIdle, remove unused root field
- Remove debug fmt.Println calls across codebase
- Replace panic(err) with stderr+exit in all cmd/ programs
- Fix duckduckgo cmd: remove useless defer, return error on bad safesearch
- Fix archive cmd: ToConfig returns error instead of panicking
- Add 39+ unit tests across 6 new test files
- Add Gitea Actions CI workflow (build, test, vet in parallel)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 11:14:19 -05:00

84 lines
1.5 KiB
Go

package extractor
import (
"fmt"
"io"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
// Document is a loaded page that can be queried as a Node and closed when the
// caller is done with it. The descriptions below reflect the document
// implementation in this file.
type Document interface {
io.Closer
Node
// URL reports the page's current URL.
URL() string
// Refresh reloads the page and returns an error if the reload fails or the
// response status is not 200.
Refresh() error
// Content returns the page content as reported by the underlying page.
Content() (string, error)
// WaitForNetworkIdle blocks until the page reaches the network-idle load
// state. A nil timeout selects a 30-second default.
WaitForNetworkIdle(timeout *time.Duration) error
}
// document is the Document implementation returned by newDocument; it wraps a
// Playwright page.
type document struct {
// node is embedded; newDocument sets its locator to the page's "html" locator.
node
// pw and browser are stored by newDocument; none of the methods visible in
// this file read them.
pw *playwright.Playwright
browser playwright.Browser
// page backs Close, URL, Content, Refresh and WaitForNetworkIdle.
page playwright.Page
// NOTE(review): newDocument never assigns this field, and because it is
// declared directly on document it shadows the embedded node.locator for
// selectors such as d.locator. Confirm whether anything outside this file
// relies on it.
locator playwright.Locator
}
// newDocument wraps an already-open Playwright page in a Document.
//
// The embedded node is rooted at the page's "html" locator, and the supplied
// Playwright and browser handles are stored alongside the page. The creation
// is logged at info level with the page URL and the root locator. The returned
// error is always nil.
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
	root := page.Locator("html")

	doc := &document{
		node:    node{locator: root},
		pw:      pw,
		browser: browser,
		page:    page,
	}

	slog.Info("new document", "url", page.URL(), "locator", root)

	return doc, nil
}
// Close closes the underlying Playwright page and returns whatever error that
// produces. It does not touch the stored browser or Playwright handles.
func (d *document) Close() error {
return d.page.Close()
}
// URL reports the URL of the underlying page, as returned by page.URL.
func (d *document) URL() string {
return d.page.URL()
}
// Content returns the result of page.Content for the underlying page, passing
// its error through unchanged.
func (d *document) Content() (string, error) {
return d.page.Content()
}
// Refresh reloads the underlying page and validates the main response status.
//
// It returns a wrapped error when the reload itself fails, and an
// "invalid status code" error when the response status is anything other than
// 200. If the reload succeeds but yields no response object, there is no
// status to validate and Refresh reports success.
func (d *document) Refresh() error {
	resp, err := d.page.Reload()
	if err != nil {
		return fmt.Errorf("failed to reload page: %w", err)
	}

	// Reload can succeed without a main-resource response; calling Status on
	// a nil Response would panic, so treat that case as nothing to check.
	if resp == nil {
		return nil
	}

	if status := resp.Status(); status != 200 {
		return fmt.Errorf("invalid status code: %d", status)
	}

	return nil
}
// WaitForNetworkIdle blocks until the page reaches Playwright's network-idle
// load state or the timeout elapses.
//
// A nil timeout selects a 30-second default. The duration is handed to
// Playwright as a whole number of milliseconds, so any sub-millisecond part is
// truncated.
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
	const defaultTimeout = 30 * time.Second

	wait := defaultTimeout
	if timeout != nil {
		wait = *timeout
	}

	timeoutMS := float64(wait.Milliseconds())
	opts := playwright.PageWaitForLoadStateOptions{
		State:   playwright.LoadStateNetworkidle,
		Timeout: &timeoutMS,
	}

	return d.page.WaitForLoadState(opts)
}