changed browser api to return pages that can be acted on, not strictly contents

This commit is contained in:
Steve Dudenhoeffer 2024-12-17 23:16:13 -05:00
parent 23334991b1
commit 5e924eb3f9
5 changed files with 154 additions and 93 deletions

View File

@ -5,23 +5,8 @@ import (
"io"
)
// ScreenshotStyle names how much of a page a screenshot should capture.
type ScreenshotStyle string
const (
// ScreenshotStyleFullPage asks for the screenshot to be taken with
// Playwright's FullPage option enabled.
ScreenshotStyleFullPage ScreenshotStyle = "full"
// ScreenshotStyleViewport asks for the screenshot to be taken with
// Playwright's FullPage option disabled.
ScreenshotStyleViewport ScreenshotStyle = "viewport"
)
// ScreenshotOptions describes how a screenshot should be taken.
type ScreenshotOptions struct {
// Style selects the capture mode. getScreenshot treats the empty value as
// ScreenshotStyleFullPage.
Style ScreenshotStyle
// Width and Height size the clip rectangle for viewport-style screenshots;
// getScreenshot only applies them when at least one of the two is positive.
Width int
Height int
}
// Browser opens URLs and hands back something the caller can work with.
// It embeds io.Closer.
//
// NOTE(review): two Open declarations appear below (one returning Source, one
// returning Document). Go does not allow duplicate method names in a single
// interface, so this looks like the old and new lines of a diff shown
// together; confirm which signatures are actually current.
type Browser interface {
io.Closer
Open(ctx context.Context, url string) (Source, error)
Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error)
OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error)
Open(ctx context.Context, url string) (Document, error)
}

View File

@ -18,3 +18,21 @@ type CookieJar interface {
Set(cookie Cookie) error
Delete(cookie Cookie) error
}
// ReadOnlyCookieJar wraps a CookieJar so that cookies can be read but never
// modified: GetAll is forwarded to Jar, while Set and Delete report success
// without touching it.
type ReadOnlyCookieJar struct {
	// Jar is the underlying jar that reads are delegated to.
	Jar CookieJar
}

// GetAll returns the cookies held by the wrapped jar. A wrapper with no Jar
// (for example the zero value) reports no cookies instead of panicking on a
// nil interface call.
func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) {
	if r.Jar == nil {
		return nil, nil
	}
	return r.Jar.GetAll()
}

// Set is a deliberate no-op: the cookie is discarded, the wrapped jar is left
// untouched, and nil is always returned.
func (r ReadOnlyCookieJar) Set(_ Cookie) error {
	return nil
}

// Delete is a deliberate no-op: the wrapped jar is left untouched and nil is
// always returned.
func (r ReadOnlyCookieJar) Delete(_ Cookie) error {
	return nil
}

98
document.go Normal file
View File

@ -0,0 +1,98 @@
package extractor
import (
"io"
"github.com/playwright-community/playwright-go"
)
// Document is what Browser.Open returns: a handle that can be read, captured,
// and queried for nested documents. It embeds io.Closer.
type Document interface {
io.Closer
// Content returns the document's content as a string.
Content() (string, error)
// Text returns the document's text as a string.
Text() (string, error)
// Screenshot returns an image of the document as raw bytes.
Screenshot() ([]byte, error)
// Select returns the documents matching selector within this one.
Select(selector string) Documents
// SelectFirst returns a single document matching selector within this one.
SelectFirst(selector string) Document
// ForEach calls fn for the documents matching selector; the document
// implementation in this file stops at, and returns, the first error fn
// reports.
ForEach(selector string, fn func(Document) error) error
}
// document is the Playwright-backed Document implementation.
type document struct {
// pw, browser and page are the Playwright handles given to newDocument;
// Select copies them unchanged into every document it produces.
pw *playwright.Playwright
browser playwright.Browser
page playwright.Page
// root is the element handle newDocument obtains for the "html" selector.
// Documents created by Select do not set it.
root playwright.ElementHandle
// locator is what Content, Text, Screenshot and Select operate on.
locator playwright.Locator
}
// newDocument wraps page in a Document rooted at the page's "html" element.
//
// It resolves "html" twice: once as an element handle (stored in root) and
// once as a locator (stored in locator). An error from the element-handle
// query is returned as-is with a nil Document.
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
	htmlHandle, err := page.QuerySelector("html")
	if err != nil {
		return nil, err
	}

	doc := document{
		pw:      pw,
		browser: browser,
		page:    page,
		root:    htmlHandle,
		locator: page.Locator("html"),
	}
	return doc, nil
}
// Close closes the underlying Playwright page. Documents produced by Select
// carry the same page value, so closing any one of them closes it for all.
func (d document) Close() error {
	return d.page.Close()
}
// Content returns the result of TextContent on the document's locator.
//
// NOTE(review): the method name suggests page markup, but this yields text
// content rather than HTML; confirm that is the intended contract.
func (d document) Content() (string, error) {
	return d.locator.TextContent()
}
// Text returns the result of InnerText on the document's locator.
func (d document) Text() (string, error) {
	return d.locator.InnerText()
}
// Screenshot captures the element addressed by the document's locator and
// returns the image bytes, using Playwright's default screenshot options.
func (d document) Screenshot() ([]byte, error) {
	return d.locator.Screenshot()
}
// Select resolves selector beneath this document's locator and wraps every
// match in its own document that shares this document's Playwright handles.
//
// The interface leaves no room for an error, so a failure to enumerate the
// matches is reported as a nil result. When nothing matches, the result is an
// empty, non-nil Documents.
func (d document) Select(selector string) Documents {
	matches, err := d.locator.Locator(selector).All()
	if err != nil {
		return nil
	}

	found := make(Documents, 0, len(matches))
	for _, match := range matches {
		found = append(found, document{
			pw:      d.pw,
			browser: d.browser,
			page:    d.page,
			locator: match,
		})
	}
	return found
}
// SelectFirst returns the first document matching selector beneath this one.
//
// It returns nil when nothing matches or when Select could not enumerate the
// matches (Select reports that as a nil result). Indexing the result
// unconditionally would panic in both of those cases.
func (d document) SelectFirst(selector string) Document {
	matches := d.Select(selector)
	if len(matches) == 0 {
		return nil
	}
	return matches[0]
}
// ForEach invokes fn once for each document matching selector, in the order
// Select returns them. Iteration stops at the first non-nil error from fn,
// which is returned unchanged; nil is returned when every call succeeds or
// nothing matches.
func (d document) ForEach(selector string, fn func(Document) error) error {
	for _, match := range d.Select(selector) {
		if err := fn(match); err != nil {
			return err
		}
	}
	return nil
}

32
documents.go Normal file
View File

@ -0,0 +1,32 @@
package extractor
// Documents is an ordered collection of Document values, such as the result
// of a Select call.
type Documents []Document

// Select applies selector to every document in d and returns all of the
// matches concatenated in the order of d. The result is nil when d is empty
// or nothing matches.
func (d Documents) Select(selector string) Documents {
	var matches Documents
	for _, doc := range d {
		matches = append(matches, doc.Select(selector)...)
	}
	return matches
}

// First returns the first document in d, or nil when d is empty. Indexing an
// empty or nil collection unconditionally would panic.
func (d Documents) First() Document {
	if len(d) == 0 {
		return nil
	}
	return d[0]
}

// ExtractText collects Text() from every document in d, preserving order.
// The first error aborts the walk and is returned with a nil slice.
func (d Documents) ExtractText() ([]string, error) {
	var texts []string
	for _, doc := range d {
		text, err := doc.Text()
		if err != nil {
			return nil, err
		}
		texts = append(texts, text)
	}
	return texts, nil
}

View File

@ -11,6 +11,7 @@ import (
)
type playWrightBrowser struct {
pw *playwright.Playwright
browser playwright.Browser
ctx playwright.BrowserContext
userAgent string
@ -148,6 +149,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
}
return playWrightBrowser{
pw: pw,
browser: browser,
userAgent: opt.UserAgent,
timeout: *opt.Timeout,
@ -192,7 +194,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
return nil, err
}
slog.Info("opened page", "url", target, "status", resp.Status(), "request", resp.Request())
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
if resp.Status() != 200 {
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
@ -202,7 +204,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
return page, nil
}
func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error) {
func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) {
page, err := b.openPage(ctx, url)
if err != nil {
@ -210,86 +212,12 @@ func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error)
}
defer page.Close()
text, err := page.Content()
if err != nil {
return nil, err
}
err = b.updateCookies(ctx, page)
if err != nil {
return nil, err
}
return source{
sourceUrl: url,
content: text,
}, nil
}
// getScreenshot captures page according to opts and returns the image bytes.
//
// An empty opts.Style is treated as ScreenshotStyleFullPage. Full-page style
// enables Playwright's FullPage option; viewport style disables it and, when
// either opts.Width or opts.Height is positive, clips the capture to a
// rectangle of that size. Any other style leaves Playwright's defaults in
// place. The context argument is unused.
func (b playWrightBrowser) getScreenshot(_ context.Context, page playwright.Page, opts ScreenshotOptions) ([]byte, error) {
	style := opts.Style
	if style == "" {
		style = ScreenshotStyleFullPage
	}

	var pwOpts playwright.PageScreenshotOptions
	switch style {
	case ScreenshotStyleFullPage:
		pwOpts.FullPage = playwright.Bool(true)
	case ScreenshotStyleViewport:
		pwOpts.FullPage = playwright.Bool(false)
		if opts.Width > 0 || opts.Height > 0 {
			pwOpts.Clip = &playwright.Rect{
				Width:  float64(opts.Width),
				Height: float64(opts.Height),
			}
		}
	}

	return page.Screenshot(pwOpts)
}
// Screenshot opens url in a new page, passes the page to updateCookies, and
// returns a screenshot taken according to opts. The page is closed before the
// function returns, whether or not it succeeds.
func (b playWrightBrowser) Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error) {
	page, err := b.openPage(ctx, url)
	if err != nil {
		return nil, err
	}
	defer page.Close()

	if err := b.updateCookies(ctx, page); err != nil {
		return nil, err
	}

	return b.getScreenshot(ctx, page, opts)
}
func (b playWrightBrowser) OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error) {
page, err := b.openPage(ctx, url)
if err != nil {
return nil, nil, err
}
defer page.Close()
text, err := page.Content()
if err != nil {
return nil, nil, err
}
screenshot, err := b.getScreenshot(ctx, page, opts)
if err != nil {
return nil, nil, err
}
err = b.updateCookies(ctx, page)
if err != nil {
return nil, nil, err
}
return source{
sourceUrl: url,
content: text,
}, screenshot, nil
return newDocument(b.pw, b.browser, page)
}
func (b playWrightBrowser) Close() error {