2024-12-07 03:53:46 -05:00
package extractor
import (
"context"
"errors"
"fmt"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
type playWrightBrowser struct {
2024-12-17 23:16:13 -05:00
pw * playwright . Playwright
2024-12-07 03:53:46 -05:00
browser playwright . Browser
ctx playwright . BrowserContext
userAgent string
timeout time . Duration
cookieJar CookieJar
}
// Compile-time check that playWrightBrowser satisfies the Browser interface.
var _ Browser = playWrightBrowser { }
// PlayWrightBrowserSelection names the browser engine that NewPlayWrightBrowser launches.
type PlayWrightBrowserSelection string
// Sentinel errors returned by the Playwright browser; match them with errors.Is.
var (
	// ErrInvalidBrowserSelection is returned when the requested browser is not
	// one of the PlayWrightBrowserSelection constants.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")

	// ErrInvalidStatusCode is returned (wrapped, with the status appended) when
	// a navigation responds with a status other than 200.
	ErrInvalidStatusCode = errors.New("invalid status code")
)
// Browser engines that NewPlayWrightBrowser can launch.
const (
	// PlayWrightBrowserSelectionChromium selects the Chromium engine.
	PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
	// PlayWrightBrowserSelectionFirefox selects the Firefox engine.
	PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
	// PlayWrightBrowserSelectionWebKit selects the WebKit engine.
	PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
)
type PlayWrightBrowserOptions struct {
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
2024-12-09 13:51:00 -05:00
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
2024-12-07 03:53:46 -05:00
Timeout * time . Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
// browser into the cookie jar for each request.
CookieJar
}
// cookieToPlaywrightOptionalCookie converts a Cookie into the form accepted by
// playwright's BrowserContext.AddCookies.
//
// Name, Value, Domain, Path and HttpOnly are copied as-is. Expires is sent as
// whole Unix seconds. A zero Expires is treated as "no expiry" and left unset:
// converting the zero time with Unix() yields a large negative timestamp,
// which Playwright rejects (it only accepts -1 or a positive value).
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	out := playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Domain),
		Path:     playwright.String(cookie.Path),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
	}

	if !cookie.Expires.IsZero() {
		out.Expires = playwright.Float(float64(cookie.Expires.Unix()))
	}

	return out
}
// playwrightCookieToCookie converts a cookie reported by Playwright into the
// package's Cookie type.
//
// The expiry is truncated to whole seconds and interpreted as a Unix timestamp.
// NOTE(review): Playwright is documented to report -1 for session cookies,
// which becomes one second before the Unix epoch here — confirm that callers
// of the cookie jar expect this.
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	expiry := time.Unix(int64(cookie.Expires), 0)

	var out Cookie
	out.Name = cookie.Name
	out.Value = cookie.Value
	out.Domain = cookie.Domain
	out.Path = cookie.Path
	out.Expires = expiry
	out.HttpOnly = cookie.HttpOnly
	return out
}
func NewPlayWrightBrowser ( opts ... PlayWrightBrowserOptions ) ( Browser , error ) {
var thirtySeconds = 30 * time . Second
opt := PlayWrightBrowserOptions {
UserAgent : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3" ,
2024-12-09 13:51:00 -05:00
Browser : PlayWrightBrowserSelectionFirefox ,
2024-12-07 03:53:46 -05:00
Timeout : & thirtySeconds ,
}
for _ , o := range opts {
if o . UserAgent != "" {
opt . UserAgent = o . UserAgent
}
if o . Browser != "" {
opt . Browser = o . Browser
}
if o . Timeout != nil {
opt . Timeout = o . Timeout
}
if o . CookieJar != nil {
opt . CookieJar = o . CookieJar
}
}
err := playwright . Install ( )
if err != nil {
return nil , err
}
pw , err := playwright . Run ( )
if err != nil {
return nil , err
}
var bt playwright . BrowserType
switch opt . Browser {
case PlayWrightBrowserSelectionChromium :
bt = pw . Chromium
case PlayWrightBrowserSelectionFirefox :
bt = pw . Firefox
case PlayWrightBrowserSelectionWebKit :
bt = pw . WebKit
default :
return nil , ErrInvalidBrowserSelection
}
browser , err := bt . Launch ( playwright . BrowserTypeLaunchOptions {
Headless : playwright . Bool ( true ) ,
} )
if err != nil {
return nil , err
}
c , err := browser . NewContext ( playwright . BrowserNewContextOptions {
UserAgent : playwright . String ( opt . UserAgent ) ,
} )
if err != nil {
return nil , err
}
if opt . CookieJar != nil {
cookies , err := opt . CookieJar . GetAll ( )
if err != nil {
return nil , fmt . Errorf ( "error getting cookies from cookie jar: %w" , err )
}
pwCookies := make ( [ ] playwright . OptionalCookie , len ( cookies ) )
for i , cookie := range cookies {
pwCookies [ i ] = cookieToPlaywrightOptionalCookie ( cookie )
}
err = c . AddCookies ( pwCookies )
if err != nil {
return nil , fmt . Errorf ( "error adding cookies to browser: %w" , err )
}
}
return playWrightBrowser {
2024-12-17 23:16:13 -05:00
pw : pw ,
2024-12-07 03:53:46 -05:00
browser : browser ,
userAgent : opt . UserAgent ,
timeout : * opt . Timeout ,
cookieJar : opt . CookieJar ,
ctx : c ,
} , nil
}
2024-12-09 13:51:00 -05:00
func ( b playWrightBrowser ) updateCookies ( _ context . Context , page playwright . Page ) error {
if b . cookieJar != nil {
cookies , err := page . Context ( ) . Cookies ( page . URL ( ) )
for _ , cookie := range cookies {
// TODO: add support for deleting cookies from the jar which are deleted in the browser
err = b . cookieJar . Set ( playwrightCookieToCookie ( cookie ) )
if err != nil {
return fmt . Errorf ( "error setting cookie in cookie jar: %w" , err )
}
}
2024-12-07 03:53:46 -05:00
}
2024-12-09 13:51:00 -05:00
return nil
}
func ( b playWrightBrowser ) openPage ( _ context . Context , target string ) ( playwright . Page , error ) {
2024-12-07 03:53:46 -05:00
page , err := b . ctx . NewPage ( )
if err != nil {
return nil , err
}
opts := playwright . PageGotoOptions {
WaitUntil : playwright . WaitUntilStateLoad ,
}
if b . timeout > 0 {
var ms = float64 ( b . timeout . Milliseconds ( ) )
opts . Timeout = & ms
}
2024-12-09 13:51:00 -05:00
resp , err := page . Goto ( target , opts )
2024-12-07 03:53:46 -05:00
if err != nil {
return nil , err
}
2024-12-17 23:16:13 -05:00
slog . Info ( "opened document" , "url" , target , "status" , resp . Status ( ) , "request" , resp . Request ( ) )
2024-12-07 03:53:46 -05:00
if resp . Status ( ) != 200 {
2024-12-09 13:51:00 -05:00
slog . Info ( "invalid status code" , "status" , resp . Status ( ) , "request" , resp . Request ( ) )
return nil , fmt . Errorf ( "%w: %d" , ErrInvalidStatusCode , resp . Status ( ) )
2024-12-07 03:53:46 -05:00
}
2024-12-09 13:51:00 -05:00
return page , nil
}
2024-12-17 23:16:13 -05:00
func ( b playWrightBrowser ) Open ( ctx context . Context , url string ) ( Document , error ) {
2024-12-07 03:53:46 -05:00
2024-12-09 13:51:00 -05:00
page , err := b . openPage ( ctx , url )
2024-12-07 03:53:46 -05:00
if err != nil {
return nil , err
}
2024-12-09 13:51:00 -05:00
defer page . Close ( )
2024-12-07 03:53:46 -05:00
2024-12-09 13:51:00 -05:00
err = b . updateCookies ( ctx , page )
if err != nil {
return nil , err
}
2024-12-07 03:53:46 -05:00
2024-12-17 23:16:13 -05:00
return newDocument ( b . pw , b . browser , page )
2024-12-07 03:53:46 -05:00
}
// Close shuts down the browser context, the browser and finally the Playwright
// driver started by NewPlayWrightBrowser. All three steps are always
// attempted, and their errors are combined with errors.Join. The Browser must
// not be used after Close.
func (b playWrightBrowser) Close() error {
	return errors.Join(
		b.ctx.Close(),
		b.browser.Close(),
		// Without stopping the driver, its process outlives the browser.
		b.pw.Stop(),
	)
}