When RemoveHidden is true, JavaScript is evaluated on the live page to remove all elements with computed display:none before readability extraction. This defends against anti-scraping honeypots that embed prompt injections in hidden DOM elements. The implementation uses an optional pageEvaluator interface so that the concrete document (backed by Playwright) supports it while the Document interface remains unchanged. Closes #62 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
87 lines
1.6 KiB
Go
87 lines
1.6 KiB
Go
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"github.com/playwright-community/playwright-go"
|
|
)
|
|
|
|
type Document interface {
|
|
io.Closer
|
|
Node
|
|
|
|
URL() string
|
|
Refresh() error
|
|
Content() (string, error)
|
|
|
|
WaitForNetworkIdle(timeout *time.Duration) error
|
|
}
|
|
|
|
type document struct {
|
|
node
|
|
pw *playwright.Playwright
|
|
browser playwright.Browser
|
|
page playwright.Page
|
|
}
|
|
|
|
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
|
locator := page.Locator("html")
|
|
|
|
res := &document{
|
|
node: node{
|
|
locator: locator,
|
|
},
|
|
pw: pw,
|
|
browser: browser,
|
|
page: page,
|
|
}
|
|
|
|
slog.Info("new document", "url", page.URL(), "locator", locator)
|
|
|
|
return res, nil
|
|
}
|
|
func (d *document) Close() error {
|
|
return d.page.Close()
|
|
}
|
|
|
|
func (d *document) URL() string {
|
|
return d.page.URL()
|
|
}
|
|
|
|
func (d *document) Content() (string, error) {
|
|
return d.page.Content()
|
|
}
|
|
|
|
func (d *document) Refresh() error {
|
|
resp, err := d.page.Reload()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to reload page: %w", err)
|
|
}
|
|
|
|
if resp != nil && resp.Status() != 200 {
|
|
return fmt.Errorf("invalid status code: %d", resp.Status())
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (d *document) PageEvaluate(expression string) (interface{}, error) {
|
|
return d.page.Evaluate(expression)
|
|
}
|
|
|
|
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
|
if timeout == nil {
|
|
t := 30 * time.Second
|
|
timeout = &t
|
|
}
|
|
|
|
ms := float64(timeout.Milliseconds())
|
|
return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
State: playwright.LoadStateNetworkidle,
|
|
Timeout: &ms,
|
|
})
|
|
}
|