From 5e924eb3f9d1f051ec894737aa21f3ce4eecff00 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Tue, 17 Dec 2024 23:16:13 -0500
Subject: [PATCH] changed browser api to return pages that can be acted on, not strictly contents

---
Review notes (text in this section is not part of the commit):
 * Open no longer defers page.Close(); the returned Document owns the page
   and releases it in Document.Close. The page is closed on error paths only.
 * SelectFirst and Documents.First return nil instead of panicking when there
   is nothing to return.
 * Line breaks and indentation were rebuilt from a whitespace-collapsed copy
   of the patch, and the index blob ids below predate these revisions.

 browser.go    |  19 ++------
 cookiejar.go  |  21 ++++++++
 document.go   | 126 ++++++++++++++++++++++++++++++++++++++++++++++
 documents.go  |  42 ++++++++++++++++
 playwright.go |  92 ++++++-------------------------
 5 files changed, 206 insertions(+), 94 deletions(-)
 create mode 100644 document.go
 create mode 100644 documents.go

diff --git a/browser.go b/browser.go
index 73a91bf..8e27b77 100644
--- a/browser.go
+++ b/browser.go
@@ -5,23 +5,10 @@ import (
 	"io"
 )
 
-type ScreenshotStyle string
-
-const (
-	ScreenshotStyleFullPage ScreenshotStyle = "full"
-	ScreenshotStyleViewport ScreenshotStyle = "viewport"
-)
-
-type ScreenshotOptions struct {
-	Style  ScreenshotStyle
-	Width  int
-	Height int
-}
-
 type Browser interface {
 	io.Closer
 
-	Open(ctx context.Context, url string) (Source, error)
-	Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error)
-	OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error)
+	// Open loads url and returns a Document for the resulting page. The
+	// caller must Close the Document to release the page.
+	Open(ctx context.Context, url string) (Document, error)
 }
diff --git a/cookiejar.go b/cookiejar.go
index 7f8d31d..90af1a3 100644
--- a/cookiejar.go
+++ b/cookiejar.go
@@ -18,3 +18,24 @@ type CookieJar interface {
 	Set(cookie Cookie) error
 	Delete(cookie Cookie) error
 }
+
+// ReadOnlyCookieJar wraps a CookieJar so that cookies can be read through it
+// while every write operation is a silent no-op.
+type ReadOnlyCookieJar struct {
+	Jar CookieJar
+}
+
+// GetAll returns the cookies held by the wrapped jar.
+func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) {
+	return r.Jar.GetAll()
+}
+
+// Set discards the cookie and reports success; the wrapped jar is untouched.
+func (r ReadOnlyCookieJar) Set(_ Cookie) error {
+	return nil
+}
+
+// Delete is a no-op that reports success; the wrapped jar is untouched.
+func (r ReadOnlyCookieJar) Delete(_ Cookie) error {
+	return nil
+}
diff --git a/document.go b/document.go
new file mode 100644
index 0000000..fdedd77
--- /dev/null
+++ b/document.go
@@ -0,0 +1,126 @@
+package extractor
+
+import (
+	"io"
+
+	"github.com/playwright-community/playwright-go"
+)
+
+// Document is a handle on an opened page, or on a single element inside one,
+// that can be read, captured and queried further.
+type Document interface {
+	// Close closes the page backing the document.
+	io.Closer
+
+	// Content returns the text content of the document.
+	Content() (string, error)
+	// Text returns the inner text of the document.
+	Text() (string, error)
+	// Screenshot returns a screenshot of the document as raw bytes.
+	Screenshot() ([]byte, error)
+
+	// Select returns one Document per element matching selector.
+	Select(selector string) Documents
+	// SelectFirst returns the first element matching selector, or nil when
+	// nothing matches.
+	SelectFirst(selector string) Document
+
+	// ForEach calls fn for every element matching selector and stops at the
+	// first error fn returns.
+	ForEach(selector string, fn func(Document) error) error
+}
+
+// document is the Playwright-backed implementation of Document.
+type document struct {
+	pw      *playwright.Playwright
+	browser playwright.Browser
+	page    playwright.Page
+	root    playwright.ElementHandle // only set on the document returned by newDocument
+	locator playwright.Locator       // scope every read and query runs against
+}
+
+// newDocument wraps page in a Document scoped to its <html> element.
+func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
+	root, err := page.QuerySelector("html")
+	if err != nil {
+		return nil, err
+	}
+
+	return document{
+		pw:      pw,
+		browser: browser,
+		page:    page,
+		locator: page.Locator("html"),
+		root:    root,
+	}, nil
+}
+
+// Close closes the underlying page. Documents produced by Select share that
+// page, so closing any one of them closes it for all.
+func (d document) Close() error {
+	return d.page.Close()
+}
+
+// Content returns the locator's text content.
+//
+// NOTE(review): this is Locator.TextContent, i.e. text rather than HTML
+// markup; confirm that is what callers of Content expect.
+func (d document) Content() (string, error) {
+	return d.locator.TextContent()
+}
+
+// Text returns the locator's inner text.
+func (d document) Text() (string, error) {
+	return d.locator.InnerText()
+}
+
+// Screenshot returns the bytes of a screenshot taken of the locator.
+func (d document) Screenshot() ([]byte, error) {
+	return d.locator.Screenshot()
+}
+
+// Select returns one Document per element under d that matches selector. The
+// interface has no error return, so a failed lookup yields nil, which is
+// indistinguishable from "no matches".
+func (d document) Select(selector string) Documents {
+	elements, err := d.locator.Locator(selector).All()
+	if err != nil {
+		return nil
+	}
+
+	res := make(Documents, len(elements))
+	for i, el := range elements {
+		res[i] = document{
+			pw:      d.pw,
+			browser: d.browser,
+			page:    d.page,
+			locator: el,
+		}
+	}
+
+	return res
+}
+
+// SelectFirst returns the first element matching selector, or nil when the
+// selector matches nothing (or the lookup fails), instead of panicking on an
+// empty result.
+func (d document) SelectFirst(selector string) Document {
+	matches := d.Select(selector)
+	if len(matches) == 0 {
+		return nil
+	}
+
+	return matches[0]
+}
+
+// ForEach calls fn on every element matching selector, in order, and returns
+// the first error fn reports.
+func (d document) ForEach(selector string, fn func(Document) error) error {
+	for _, el := range d.Select(selector) {
+		if err := fn(el); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
diff --git a/documents.go b/documents.go
new file mode 100644
index 0000000..735aef6
--- /dev/null
+++ b/documents.go
@@ -0,0 +1,42 @@
+package extractor
+
+// Documents is an ordered collection of Document values.
+type Documents []Document
+
+// Select runs selector against every document and concatenates the matches
+// in order.
+func (d Documents) Select(selector string) Documents {
+	var res Documents
+
+	for _, doc := range d {
+		res = append(res, doc.Select(selector)...)
+	}
+
+	return res
+}
+
+// First returns the first document, or nil when the collection is empty.
+func (d Documents) First() Document {
+	if len(d) == 0 {
+		return nil
+	}
+
+	return d[0]
+}
+
+// ExtractText returns the Text of every document, in order. It stops at the
+// first failure and returns that error with a nil slice.
+func (d Documents) ExtractText() ([]string, error) {
+	var res []string
+
+	for _, doc := range d {
+		text, err := doc.Text()
+		if err != nil {
+			return nil, err
+		}
+
+		res = append(res, text)
+	}
+
+	return res, nil
+}
diff --git a/playwright.go b/playwright.go
index 7840fe6..779868d 100644
--- a/playwright.go
+++ b/playwright.go
@@ -11,6 +11,7 @@ import (
 )
 
 type playWrightBrowser struct {
+	pw        *playwright.Playwright
 	browser   playwright.Browser
 	ctx       playwright.BrowserContext
 	userAgent string
@@ -148,6 +149,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
 	}
 
 	return playWrightBrowser{
+		pw:        pw,
 		browser:   browser,
 		userAgent: opt.UserAgent,
 		timeout:   *opt.Timeout,
@@ -192,7 +194,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
 		return nil, err
 	}
 
-	slog.Info("opened page", "url", target, "status", resp.Status(), "request", resp.Request())
+	slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
 
 	if resp.Status() != 200 {
 		slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
@@ -202,7 +204,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
 	return page, nil
 }
 
-func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error) {
+func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) {
 	page, err := b.openPage(ctx, url)
 
 	if err != nil {
@@ -210,86 +212,20 @@ func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error)
 	}
-	defer page.Close()
 
-	text, err := page.Content()
-	if err != nil {
-		return nil, err
-	}
-
-	err = b.updateCookies(ctx, page)
-	if err != nil {
-		return nil, err
-	}
-
-	return source{
-		sourceUrl: url,
-		content:   text,
-	}, nil
-}
-
-func (b playWrightBrowser) getScreenshot(_ context.Context, page playwright.Page, opts ScreenshotOptions) ([]byte, error) {
-	var pwOpts playwright.PageScreenshotOptions
-
-	if opts.Style == "" {
-		opts.Style = ScreenshotStyleFullPage
-	}
-
-	if opts.Style == ScreenshotStyleFullPage {
-		pwOpts.FullPage = playwright.Bool(true)
-	} else if opts.Style == ScreenshotStyleViewport {
-		pwOpts.FullPage = playwright.Bool(false)
-
-		if opts.Width > 0 || opts.Height > 0 {
-			pwOpts.Clip = &playwright.Rect{
-				Width:  float64(opts.Width),
-				Height: float64(opts.Height),
-			}
-		}
-	}
-
-	return page.Screenshot(pwOpts)
-}
-
-func (b playWrightBrowser) Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error) {
-	page, err := b.openPage(ctx, url)
-	if err != nil {
-		return nil, err
-	}
-	defer page.Close()
-
 	err = b.updateCookies(ctx, page)
 	if err != nil {
+		// Best-effort cleanup: the page never reaches the caller on this path.
+		_ = page.Close()
 		return nil, err
 	}
 
-	return b.getScreenshot(ctx, page, opts)
-}
-
-func (b playWrightBrowser) OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error) {
-	page, err := b.openPage(ctx, url)
-	if err != nil {
-		return nil, nil, err
-	}
-	defer page.Close()
-
-	text, err := page.Content()
-	if err != nil {
-		return nil, nil, err
-	}
-
-	screenshot, err := b.getScreenshot(ctx, page, opts)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	err = b.updateCookies(ctx, page)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return source{
-		sourceUrl: url,
-		content:   text,
-	}, screenshot, nil
+	doc, err := newDocument(b.pw, b.browser, page)
+	if err != nil {
+		_ = page.Close()
+		return nil, err
+	}
+
+	// The Document now owns the page; Document.Close releases it.
+	return doc, nil
 }
 
 func (b playWrightBrowser) Close() error {