From cbd6682257aa6b987a01276e2bbc81ab05533ba8 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Sat, 7 Dec 2024 03:53:46 -0500
Subject: [PATCH] initial commit

---
 article.go     |  16 +++
 browser.go     |  15 +++
 cookiejar.go   |  24 +++++
 extractor.go   |   8 ++
 go.mod         |  18 ++++
 playwright.go  | 263 +++++++++++++++++++++++++++++++++++++++++++++++++
 processor.go   |   8 ++
 readability.go |  51 ++++++++++
 source.go      |  34 +++++++
 9 files changed, 437 insertions(+)
 create mode 100644 article.go
 create mode 100644 browser.go
 create mode 100644 cookiejar.go
 create mode 100644 extractor.go
 create mode 100644 go.mod
 create mode 100644 playwright.go
 create mode 100644 processor.go
 create mode 100644 readability.go
 create mode 100644 source.go

diff --git a/article.go b/article.go
new file mode 100644
--- /dev/null
+++ b/article.go
@@ -0,0 +1,16 @@
+package extractor
+
+// Article holds the fields extracted from a Source by an Extractor.
+// PublishedTime is kept as text and is empty when no publication time is known.
+type Article struct {
+	Title         string
+	Content       string
+	TextContent   string
+	Length        int
+	Excerpt       string
+	Byline        string
+	Dir           string
+	SiteName      string
+	Lang          string
+	PublishedTime string
+}
diff --git a/browser.go b/browser.go
new file mode 100644
--- /dev/null
+++ b/browser.go
@@ -0,0 +1,15 @@
+package extractor
+
+import (
+	"context"
+	"io"
+)
+
+// Browser fetches pages and hands them back as Sources. Close releases
+// whatever the implementation holds open.
+type Browser interface {
+	io.Closer
+
+	// Open loads url and returns its content as a Source.
+	Open(ctx context.Context, url string) (Source, error)
+}
diff --git a/cookiejar.go b/cookiejar.go
new file mode 100644
--- /dev/null
+++ b/cookiejar.go
@@ -0,0 +1,24 @@
+package extractor
+
+import (
+	"time"
+)
+
+// Cookie is a browser cookie in a form independent of any browser backend.
+// A zero Expires marks a session cookie with no expiry time.
+type Cookie struct {
+	Name     string
+	Value    string
+	Domain   string
+	Path     string
+	Expires  time.Time
+	Secure   bool
+	HttpOnly bool
+}
+
+// CookieJar stores cookies between browser sessions.
+type CookieJar interface {
+	GetAll() ([]Cookie, error)
+	Set(cookie Cookie) error
+	Delete(cookie Cookie) error
+}
diff --git a/extractor.go b/extractor.go
new file mode 100644
--- /dev/null
+++ b/extractor.go
@@ -0,0 +1,8 @@
+package extractor
+
+import "context"
+
+// Extractor turns a fetched Source into an Article.
+type Extractor interface {
+	Extract(ctx context.Context, src Source) (Article, error)
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..7190b00
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,18 @@
+module gitea.stevedudenhoeffer.com/steve/go-extractor
+
+go 1.23.2
+
+require github.com/playwright-community/playwright-go v0.4802.0
+
+require (
+	github.com/andybalholm/cascadia v1.3.2 // indirect
+	github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
+	github.com/deckarep/golang-set/v2 v2.6.0 // indirect
+	github.com/go-jose/go-jose/v3 v3.0.3 // indirect
+	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
+	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f // indirect
+	github.com/go-stack/stack v1.8.1 // indirect
+	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
+	golang.org/x/net v0.32.0 // indirect
+	golang.org/x/text v0.21.0 // indirect
+)
diff --git a/playwright.go b/playwright.go
new file mode 100644
--- /dev/null
+++ b/playwright.go
@@ -0,0 +1,263 @@
+package extractor
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"github.com/playwright-community/playwright-go"
+)
+
+// playWrightBrowser is the Playwright-backed implementation of Browser.
+type playWrightBrowser struct {
+	pw        *playwright.Playwright
+	browser   playwright.Browser
+	ctx       playwright.BrowserContext
+	userAgent string
+	timeout   time.Duration
+	cookieJar CookieJar
+}
+
+var _ Browser = playWrightBrowser{}
+
+// PlayWrightBrowserSelection names the browser engine to launch.
+type PlayWrightBrowserSelection string
+
+var (
+	// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser for an unknown Browser option.
+	ErrInvalidBrowserSelection = errors.New("invalid browser selection")
+
+	// ErrInvalidStatusCode is wrapped by Open when the response status is not 200.
+	ErrInvalidStatusCode = errors.New("invalid status code")
+)
+
+const (
+	PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
+	PlayWrightBrowserSelectionFirefox  PlayWrightBrowserSelection = "firefox"
+	PlayWrightBrowserSelectionWebKit   PlayWrightBrowserSelection = "webkit"
+)
+
+// PlayWrightBrowserOptions configures NewPlayWrightBrowser.
+type PlayWrightBrowserOptions struct {
+	UserAgent string                     // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
+	Browser   PlayWrightBrowserSelection // If unset defaults to Chromium
+	Timeout   *time.Duration             // If unset defaults to 30 seconds timeout. If set to 0, no timeout
+
+	// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
+	// browser into the cookie jar for each request.
+	CookieJar
+}
+
+// cookieToPlaywrightOptionalCookie converts a Cookie into the form taken by
+// BrowserContext.AddCookies. A zero Expires is left unset so the cookie is
+// added as a session cookie instead of with a bogus timestamp.
+func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
+	c := playwright.OptionalCookie{
+		Name:     cookie.Name,
+		Value:    cookie.Value,
+		Domain:   playwright.String(cookie.Domain),
+		Path:     playwright.String(cookie.Path),
+		HttpOnly: playwright.Bool(cookie.HttpOnly),
+		Secure:   playwright.Bool(cookie.Secure),
+	}
+	if !cookie.Expires.IsZero() {
+		c.Expires = playwright.Float(float64(cookie.Expires.Unix()))
+	}
+	return c
+}
+
+// playwrightCookieToCookie converts a cookie reported by Playwright into a
+// Cookie. Playwright reports session cookies with an expiry of -1; those keep
+// a zero Expires.
+func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
+	c := Cookie{
+		Name:     cookie.Name,
+		Value:    cookie.Value,
+		Domain:   cookie.Domain,
+		Path:     cookie.Path,
+		Secure:   cookie.Secure,
+		HttpOnly: cookie.HttpOnly,
+	}
+	if cookie.Expires > 0 {
+		c.Expires = time.Unix(int64(cookie.Expires), 0)
+	}
+	return c
+}
+
+// NewPlayWrightBrowser installs the Playwright driver, starts it and launches
+// a headless browser. Options are applied in order, later non-empty fields
+// overriding earlier ones. Cookies from the configured CookieJar, if any, are
+// loaded into the browser context. The caller must Close the returned Browser.
+func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
+	thirtySeconds := 30 * time.Second
+	opt := PlayWrightBrowserOptions{
+		UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3",
+		Browser:   PlayWrightBrowserSelectionChromium,
+		Timeout:   &thirtySeconds,
+	}
+
+	for _, o := range opts {
+		if o.UserAgent != "" {
+			opt.UserAgent = o.UserAgent
+		}
+		if o.Browser != "" {
+			opt.Browser = o.Browser
+		}
+		if o.Timeout != nil {
+			opt.Timeout = o.Timeout
+		}
+		if o.CookieJar != nil {
+			opt.CookieJar = o.CookieJar
+		}
+	}
+
+	if err := playwright.Install(); err != nil {
+		return nil, err
+	}
+
+	pw, err := playwright.Run()
+	if err != nil {
+		return nil, err
+	}
+
+	b := playWrightBrowser{
+		pw:        pw,
+		userAgent: opt.UserAgent,
+		timeout:   *opt.Timeout,
+		cookieJar: opt.CookieJar,
+	}
+
+	// fail releases whatever has been started so far, so a failed
+	// construction does not leave the driver or browser process running.
+	fail := func(err error) (Browser, error) {
+		_ = b.Close() // best effort; the original error is the one to report
+		return nil, err
+	}
+
+	var bt playwright.BrowserType
+	switch opt.Browser {
+	case PlayWrightBrowserSelectionChromium:
+		bt = pw.Chromium
+	case PlayWrightBrowserSelectionFirefox:
+		bt = pw.Firefox
+	case PlayWrightBrowserSelectionWebKit:
+		bt = pw.WebKit
+	default:
+		return fail(ErrInvalidBrowserSelection)
+	}
+
+	browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
+		Headless: playwright.Bool(true),
+	})
+	if err != nil {
+		return fail(err)
+	}
+	b.browser = browser
+
+	c, err := browser.NewContext(playwright.BrowserNewContextOptions{
+		UserAgent: playwright.String(opt.UserAgent),
+	})
+	if err != nil {
+		return fail(err)
+	}
+	b.ctx = c
+
+	if opt.CookieJar != nil {
+		cookies, err := opt.CookieJar.GetAll()
+		if err != nil {
+			return fail(fmt.Errorf("error getting cookies from cookie jar: %w", err))
+		}
+
+		pwCookies := make([]playwright.OptionalCookie, len(cookies))
+		for i, cookie := range cookies {
+			pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
+		}
+
+		if err := c.AddCookies(pwCookies); err != nil {
+			return fail(fmt.Errorf("error adding cookies to browser: %w", err))
+		}
+	}
+
+	return b, nil
+}
+
+// Open navigates a new page to url, waits for the load event and returns the
+// body of the main response as a Source. A status other than 200 yields an
+// error wrapping ErrInvalidStatusCode. When a cookie jar is configured, the
+// cookies that apply to the final page URL are saved into it. The page is
+// closed before Open returns.
+func (b playWrightBrowser) Open(_ context.Context, url string) (Source, error) {
+	page, err := b.ctx.NewPage()
+	if err != nil {
+		return nil, err
+	}
+	// Every call opens its own page in the shared context; close it so pages
+	// do not pile up for the lifetime of the browser.
+	defer func() { _ = page.Close() }()
+
+	// Always pass the timeout: Playwright treats 0 as "no timeout", while
+	// leaving it unset would fall back to Playwright's own default.
+	timeout := max(b.timeout, 0)
+	opts := playwright.PageGotoOptions{
+		WaitUntil: playwright.WaitUntilStateLoad,
+		Timeout:   playwright.Float(float64(timeout.Milliseconds())),
+	}
+
+	resp, err := page.Goto(url, opts)
+	if err != nil {
+		return nil, err
+	}
+	// Goto can succeed without a response (for example about:blank).
+	if resp == nil {
+		return nil, fmt.Errorf("no response received for %q", url)
+	}
+
+	slog.Info("response", "response", resp)
+
+	if status := resp.Status(); status != 200 {
+		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
+	}
+
+	text, err := resp.Text()
+	if err != nil {
+		return nil, err
+	}
+
+	if b.cookieJar != nil {
+		cookies, err := page.Context().Cookies(page.URL())
+		if err != nil {
+			return nil, fmt.Errorf("error getting cookies from browser: %w", err)
+		}
+
+		for _, cookie := range cookies {
+			// TODO: add support for deleting cookies from the jar which are deleted in the browser
+			if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
+				return nil, fmt.Errorf("error setting cookie in cookie jar: %w", err)
+			}
+		}
+	}
+
+	return source{
+		sourceUrl: url,
+		content:   text,
+	}, nil
+}
+
+// Close closes the browser context and the browser, then stops the
+// Playwright driver, joining any errors. Parts that were never started are
+// skipped.
+func (b playWrightBrowser) Close() error {
+	var errs []error
+	if b.ctx != nil {
+		errs = append(errs, b.ctx.Close())
+	}
+	if b.browser != nil {
+		errs = append(errs, b.browser.Close())
+	}
+	if b.pw != nil {
+		errs = append(errs, b.pw.Stop())
+	}
+	return errors.Join(errs...)
+}
diff --git a/processor.go b/processor.go
new file mode 100644
--- /dev/null
+++ b/processor.go
@@ -0,0 +1,8 @@
+package extractor
+
+import "context"
+
+// Processor transforms one source into another.
+type Processor interface {
+	Process(ctx context.Context, src source) (source, error)
+}
diff --git a/readability.go b/readability.go
new file mode 100644
--- /dev/null
+++ b/readability.go
@@ -0,0 +1,51 @@
+package extractor
+
+import (
+	"context"
+	"net/url"
+	"time"
+
+	"github.com/go-shiori/go-readability"
+)
+
+// Readability is an Extractor backed by go-readability.
+type Readability struct {
+	Extractor
+}
+
+var _ Extractor = Readability{}
+
+// Extract parses the content of src with go-readability, using the URL of
+// src as the document location, and maps the result onto an Article.
+// PublishedTime is formatted as RFC 3339 in UTC and left empty when the
+// document carries no publication time.
+func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
+	u, err := url.Parse(src.URL())
+	if err != nil {
+		return Article{}, err
+	}
+
+	a, err := readability.FromReader(src.Reader(), u)
+	if err != nil {
+		return Article{}, err
+	}
+
+	pubTime := ""
+	if a.PublishedTime != nil {
+		// Convert to UTC before formatting so the trailing "Z" is truthful
+		// for timestamps that carry another offset.
+		pubTime = a.PublishedTime.UTC().Format(time.RFC3339)
+	}
+
+	return Article{
+		Title:         a.Title,
+		Content:       a.Content,
+		TextContent:   a.TextContent,
+		Length:        a.Length,
+		Excerpt:       a.Excerpt,
+		Byline:        a.Byline,
+		SiteName:      a.SiteName,
+		Lang:          a.Language,
+		PublishedTime: pubTime,
+	}, nil
+}
diff --git a/source.go b/source.go
new file mode 100644
--- /dev/null
+++ b/source.go
@@ -0,0 +1,34 @@
+package extractor
+
+import (
+	"io"
+	"strings"
+)
+
+// Source is a fetched document together with the URL it was requested from.
+type Source interface {
+	// URL returns the URL the document was requested from.
+	URL() string
+	// String returns the document content.
+	String() string
+	// Reader returns a new reader over the document content.
+	Reader() io.Reader
+}
+
+// source is the in-memory Source implementation.
+type source struct {
+	sourceUrl string
+	content   string
+}
+
+func (s source) URL() string {
+	return s.sourceUrl
+}
+
+func (s source) String() string {
+	return s.content
+}
+
+func (s source) Reader() io.Reader {
+	return strings.NewReader(s.content)
+}