Changed the browser API to return pages that can be acted on, rather than only their contents
This commit is contained in:
parent
23334991b1
commit
5e924eb3f9
17
browser.go
17
browser.go
@ -5,23 +5,8 @@ import (
|
||||
"io"
|
||||
)
|
||||
|
||||
type ScreenshotStyle string
|
||||
|
||||
const (
|
||||
ScreenshotStyleFullPage ScreenshotStyle = "full"
|
||||
ScreenshotStyleViewport ScreenshotStyle = "viewport"
|
||||
)
|
||||
|
||||
type ScreenshotOptions struct {
|
||||
Style ScreenshotStyle
|
||||
Width int
|
||||
Height int
|
||||
}
|
||||
|
||||
type Browser interface {
|
||||
io.Closer
|
||||
|
||||
Open(ctx context.Context, url string) (Source, error)
|
||||
Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error)
|
||||
OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error)
|
||||
Open(ctx context.Context, url string) (Document, error)
|
||||
}
|
||||
|
18
cookiejar.go
18
cookiejar.go
@ -18,3 +18,21 @@ type CookieJar interface {
|
||||
Set(cookie Cookie) error
|
||||
Delete(cookie Cookie) error
|
||||
}
|
||||
|
||||
// ReadOnlyCookieJar is a wrapper for CookieJar that allows only read operations on cookies, but all
|
||||
// write operations are no-ops.
|
||||
type ReadOnlyCookieJar struct {
|
||||
Jar CookieJar
|
||||
}
|
||||
|
||||
func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) {
|
||||
return r.Jar.GetAll()
|
||||
}
|
||||
|
||||
func (r ReadOnlyCookieJar) Set(_ Cookie) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r ReadOnlyCookieJar) Delete(_ Cookie) error {
|
||||
return nil
|
||||
}
|
||||
|
98
document.go
Normal file
98
document.go
Normal file
@ -0,0 +1,98 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"io"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
type Document interface {
|
||||
io.Closer
|
||||
|
||||
Content() (string, error)
|
||||
Text() (string, error)
|
||||
Screenshot() ([]byte, error)
|
||||
|
||||
Select(selector string) Documents
|
||||
SelectFirst(selector string) Document
|
||||
|
||||
ForEach(selector string, fn func(Document) error) error
|
||||
}
|
||||
|
||||
type document struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
page playwright.Page
|
||||
root playwright.ElementHandle
|
||||
locator playwright.Locator
|
||||
}
|
||||
|
||||
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
||||
root, err := page.QuerySelector("html")
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
root2 := page.Locator("html")
|
||||
return document{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
page: page,
|
||||
locator: root2,
|
||||
root: root,
|
||||
}, nil
|
||||
}
|
||||
func (p document) Close() error {
|
||||
return p.page.Close()
|
||||
}
|
||||
|
||||
func (p document) Content() (string, error) {
|
||||
return p.locator.TextContent()
|
||||
}
|
||||
|
||||
func (p document) Text() (string, error) {
|
||||
return p.locator.InnerText()
|
||||
}
|
||||
|
||||
func (p document) Screenshot() ([]byte, error) {
|
||||
return p.locator.Screenshot()
|
||||
}
|
||||
|
||||
func (d document) Select(selector string) Documents {
|
||||
|
||||
elements, err := d.locator.Locator(selector).All()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
res := make(Documents, len(elements))
|
||||
for i, el := range elements {
|
||||
res[i] = document{
|
||||
pw: d.pw,
|
||||
browser: d.browser,
|
||||
page: d.page,
|
||||
locator: el,
|
||||
}
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (d document) SelectFirst(selector string) Document {
|
||||
return d.Select(selector)[0]
|
||||
}
|
||||
|
||||
func (d document) ForEach(selector string, fn func(Document) error) error {
|
||||
|
||||
e := d.Select(selector)
|
||||
|
||||
for _, el := range e {
|
||||
err := fn(el)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
32
documents.go
Normal file
32
documents.go
Normal file
@ -0,0 +1,32 @@
|
||||
package extractor
|
||||
|
||||
// Documents is an ordered collection of Document values that can be queried
// as a group.
type Documents []Document
|
||||
|
||||
func (d Documents) Select(selector string) Documents {
|
||||
var res Documents
|
||||
|
||||
for _, doc := range d {
|
||||
res = append(res, doc.Select(selector)...)
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (d Documents) First() Document {
|
||||
return d[0]
|
||||
}
|
||||
|
||||
func (d Documents) ExtractText() ([]string, error) {
|
||||
var res []string
|
||||
|
||||
for _, doc := range d {
|
||||
text, err := doc.Text()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res = append(res, text)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
@ -11,6 +11,7 @@ import (
|
||||
)
|
||||
|
||||
type playWrightBrowser struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
ctx playwright.BrowserContext
|
||||
userAgent string
|
||||
@ -148,6 +149,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
}
|
||||
|
||||
return playWrightBrowser{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
userAgent: opt.UserAgent,
|
||||
timeout: *opt.Timeout,
|
||||
@ -192,7 +194,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Info("opened page", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||
|
||||
if resp.Status() != 200 {
|
||||
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
||||
@ -202,7 +204,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
|
||||
return page, nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error) {
|
||||
func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) {
|
||||
|
||||
page, err := b.openPage(ctx, url)
|
||||
if err != nil {
|
||||
@ -210,86 +212,12 @@ func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error)
|
||||
}
|
||||
defer page.Close()
|
||||
|
||||
text, err := page.Content()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
err = b.updateCookies(ctx, page)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return source{
|
||||
sourceUrl: url,
|
||||
content: text,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) getScreenshot(_ context.Context, page playwright.Page, opts ScreenshotOptions) ([]byte, error) {
|
||||
var pwOpts playwright.PageScreenshotOptions
|
||||
|
||||
if opts.Style == "" {
|
||||
opts.Style = ScreenshotStyleFullPage
|
||||
}
|
||||
|
||||
if opts.Style == ScreenshotStyleFullPage {
|
||||
pwOpts.FullPage = playwright.Bool(true)
|
||||
} else if opts.Style == ScreenshotStyleViewport {
|
||||
pwOpts.FullPage = playwright.Bool(false)
|
||||
|
||||
if opts.Width > 0 || opts.Height > 0 {
|
||||
pwOpts.Clip = &playwright.Rect{
|
||||
Width: float64(opts.Width),
|
||||
Height: float64(opts.Height),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return page.Screenshot(pwOpts)
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error) {
|
||||
page, err := b.openPage(ctx, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer page.Close()
|
||||
|
||||
err = b.updateCookies(ctx, page)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return b.getScreenshot(ctx, page, opts)
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error) {
|
||||
page, err := b.openPage(ctx, url)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
defer page.Close()
|
||||
|
||||
text, err := page.Content()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
screenshot, err := b.getScreenshot(ctx, page, opts)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
err = b.updateCookies(ctx, page)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return source{
|
||||
sourceUrl: url,
|
||||
content: text,
|
||||
}, screenshot, nil
|
||||
return newDocument(b.pw, b.browser, page)
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) Close() error {
|
||||
|
Loading…
x
Reference in New Issue
Block a user