changed browser api to return pages that can be acted on, not strictly contents
This commit is contained in:
parent
23334991b1
commit
5e924eb3f9
17
browser.go
17
browser.go
@ -5,23 +5,8 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ScreenshotStyle string
|
|
||||||
|
|
||||||
const (
|
|
||||||
ScreenshotStyleFullPage ScreenshotStyle = "full"
|
|
||||||
ScreenshotStyleViewport ScreenshotStyle = "viewport"
|
|
||||||
)
|
|
||||||
|
|
||||||
type ScreenshotOptions struct {
|
|
||||||
Style ScreenshotStyle
|
|
||||||
Width int
|
|
||||||
Height int
|
|
||||||
}
|
|
||||||
|
|
||||||
type Browser interface {
|
type Browser interface {
|
||||||
io.Closer
|
io.Closer
|
||||||
|
|
||||||
Open(ctx context.Context, url string) (Source, error)
|
Open(ctx context.Context, url string) (Document, error)
|
||||||
Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error)
|
|
||||||
OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error)
|
|
||||||
}
|
}
|
||||||
|
18
cookiejar.go
18
cookiejar.go
@ -18,3 +18,21 @@ type CookieJar interface {
|
|||||||
Set(cookie Cookie) error
|
Set(cookie Cookie) error
|
||||||
Delete(cookie Cookie) error
|
Delete(cookie Cookie) error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ReadOnlyCookieJar is a wrapper for CookieJar that allows only read operations on cookies, but all
|
||||||
|
// write operations are no-ops.
|
||||||
|
type ReadOnlyCookieJar struct {
|
||||||
|
Jar CookieJar
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r ReadOnlyCookieJar) GetAll() ([]Cookie, error) {
|
||||||
|
return r.Jar.GetAll()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r ReadOnlyCookieJar) Set(_ Cookie) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r ReadOnlyCookieJar) Delete(_ Cookie) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
98
document.go
Normal file
98
document.go
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
|
||||||
|
"github.com/playwright-community/playwright-go"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Document interface {
|
||||||
|
io.Closer
|
||||||
|
|
||||||
|
Content() (string, error)
|
||||||
|
Text() (string, error)
|
||||||
|
Screenshot() ([]byte, error)
|
||||||
|
|
||||||
|
Select(selector string) Documents
|
||||||
|
SelectFirst(selector string) Document
|
||||||
|
|
||||||
|
ForEach(selector string, fn func(Document) error) error
|
||||||
|
}
|
||||||
|
|
||||||
|
type document struct {
|
||||||
|
pw *playwright.Playwright
|
||||||
|
browser playwright.Browser
|
||||||
|
page playwright.Page
|
||||||
|
root playwright.ElementHandle
|
||||||
|
locator playwright.Locator
|
||||||
|
}
|
||||||
|
|
||||||
|
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
||||||
|
root, err := page.QuerySelector("html")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
root2 := page.Locator("html")
|
||||||
|
return document{
|
||||||
|
pw: pw,
|
||||||
|
browser: browser,
|
||||||
|
page: page,
|
||||||
|
locator: root2,
|
||||||
|
root: root,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
func (p document) Close() error {
|
||||||
|
return p.page.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p document) Content() (string, error) {
|
||||||
|
return p.locator.TextContent()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p document) Text() (string, error) {
|
||||||
|
return p.locator.InnerText()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p document) Screenshot() ([]byte, error) {
|
||||||
|
return p.locator.Screenshot()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d document) Select(selector string) Documents {
|
||||||
|
|
||||||
|
elements, err := d.locator.Locator(selector).All()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
res := make(Documents, len(elements))
|
||||||
|
for i, el := range elements {
|
||||||
|
res[i] = document{
|
||||||
|
pw: d.pw,
|
||||||
|
browser: d.browser,
|
||||||
|
page: d.page,
|
||||||
|
locator: el,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d document) SelectFirst(selector string) Document {
|
||||||
|
return d.Select(selector)[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d document) ForEach(selector string, fn func(Document) error) error {
|
||||||
|
|
||||||
|
e := d.Select(selector)
|
||||||
|
|
||||||
|
for _, el := range e {
|
||||||
|
err := fn(el)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
32
documents.go
Normal file
32
documents.go
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
// Documents is an ordered collection of Document values that can be queried
// together.
type Documents []Document
|
||||||
|
|
||||||
|
func (d Documents) Select(selector string) Documents {
|
||||||
|
var res Documents
|
||||||
|
|
||||||
|
for _, doc := range d {
|
||||||
|
res = append(res, doc.Select(selector)...)
|
||||||
|
}
|
||||||
|
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d Documents) First() Document {
|
||||||
|
return d[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d Documents) ExtractText() ([]string, error) {
|
||||||
|
var res []string
|
||||||
|
|
||||||
|
for _, doc := range d {
|
||||||
|
text, err := doc.Text()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
res = append(res, text)
|
||||||
|
}
|
||||||
|
|
||||||
|
return res, nil
|
||||||
|
}
|
@ -11,6 +11,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type playWrightBrowser struct {
|
type playWrightBrowser struct {
|
||||||
|
pw *playwright.Playwright
|
||||||
browser playwright.Browser
|
browser playwright.Browser
|
||||||
ctx playwright.BrowserContext
|
ctx playwright.BrowserContext
|
||||||
userAgent string
|
userAgent string
|
||||||
@ -148,6 +149,7 @@ func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return playWrightBrowser{
|
return playWrightBrowser{
|
||||||
|
pw: pw,
|
||||||
browser: browser,
|
browser: browser,
|
||||||
userAgent: opt.UserAgent,
|
userAgent: opt.UserAgent,
|
||||||
timeout: *opt.Timeout,
|
timeout: *opt.Timeout,
|
||||||
@ -192,7 +194,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("opened page", "url", target, "status", resp.Status(), "request", resp.Request())
|
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||||
|
|
||||||
if resp.Status() != 200 {
|
if resp.Status() != 200 {
|
||||||
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
slog.Info("invalid status code", "status", resp.Status(), "request", resp.Request())
|
||||||
@ -202,7 +204,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string) (playwrigh
|
|||||||
return page, nil
|
return page, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error) {
|
func (b playWrightBrowser) Open(ctx context.Context, url string) (Document, error) {
|
||||||
|
|
||||||
page, err := b.openPage(ctx, url)
|
page, err := b.openPage(ctx, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -210,86 +212,12 @@ func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error)
|
|||||||
}
|
}
|
||||||
defer page.Close()
|
defer page.Close()
|
||||||
|
|
||||||
text, err := page.Content()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = b.updateCookies(ctx, page)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return source{
|
|
||||||
sourceUrl: url,
|
|
||||||
content: text,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b playWrightBrowser) getScreenshot(_ context.Context, page playwright.Page, opts ScreenshotOptions) ([]byte, error) {
|
|
||||||
var pwOpts playwright.PageScreenshotOptions
|
|
||||||
|
|
||||||
if opts.Style == "" {
|
|
||||||
opts.Style = ScreenshotStyleFullPage
|
|
||||||
}
|
|
||||||
|
|
||||||
if opts.Style == ScreenshotStyleFullPage {
|
|
||||||
pwOpts.FullPage = playwright.Bool(true)
|
|
||||||
} else if opts.Style == ScreenshotStyleViewport {
|
|
||||||
pwOpts.FullPage = playwright.Bool(false)
|
|
||||||
|
|
||||||
if opts.Width > 0 || opts.Height > 0 {
|
|
||||||
pwOpts.Clip = &playwright.Rect{
|
|
||||||
Width: float64(opts.Width),
|
|
||||||
Height: float64(opts.Height),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return page.Screenshot(pwOpts)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b playWrightBrowser) Screenshot(ctx context.Context, url string, opts ScreenshotOptions) ([]byte, error) {
|
|
||||||
page, err := b.openPage(ctx, url)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer page.Close()
|
|
||||||
|
|
||||||
err = b.updateCookies(ctx, page)
|
err = b.updateCookies(ctx, page)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return b.getScreenshot(ctx, page, opts)
|
return newDocument(b.pw, b.browser, page)
|
||||||
}
|
|
||||||
|
|
||||||
func (b playWrightBrowser) OpenAndScreenshot(ctx context.Context, url string, opts ScreenshotOptions) (Source, []byte, error) {
|
|
||||||
page, err := b.openPage(ctx, url)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
defer page.Close()
|
|
||||||
|
|
||||||
text, err := page.Content()
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
screenshot, err := b.getScreenshot(ctx, page, opts)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = b.updateCookies(ctx, page)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return source{
|
|
||||||
sourceUrl: url,
|
|
||||||
content: text,
|
|
||||||
}, screenshot, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b playWrightBrowser) Close() error {
|
func (b playWrightBrowser) Close() error {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user