commit cbd6682257aa6b987a01276e2bbc81ab05533ba8
Author: Steve Dudenhoeffer
Date:   Sat Dec 7 03:53:46 2024 -0500

    initial commit

diff --git a/article.go b/article.go
new file mode 100644
index 0000000..feda664
--- /dev/null
+++ b/article.go
@@ -0,0 +1,10 @@
+package extractor
+
+// Article is the value an Extractor returns for a Source.
+type Article struct {
+	Title, Content, TextContent string
+	Length                      int
+	Excerpt, Byline, Dir        string
+	SiteName, Lang              string
+	PublishedTime               string // carried as text, not as a time.Time
+}
diff --git a/browser.go b/browser.go
new file mode 100644
index 0000000..30c61b2
--- /dev/null
+++ b/browser.go
@@ -0,0 +1,14 @@
+package extractor
+
+import (
+	"context"
+	"io"
+)
+
+// Browser takes a URL in Open and returns a Source for it.
+// It also embeds io.Closer, so every implementation provides Close.
+type Browser interface {
+	Open(ctx context.Context, url string) (Source, error)
+
+	io.Closer
+}
diff --git a/cookiejar.go b/cookiejar.go
new file mode 100644
index 0000000..7f8d31d
--- /dev/null
+++ b/cookiejar.go
@@ -0,0 +1,20 @@
+package extractor
+
+import "time"
+
+type (
+	// Cookie is the cookie representation passed to and from a CookieJar.
+	Cookie struct {
+		Name, Value, Domain, Path string
+		Expires                   time.Time
+		Secure, HttpOnly          bool
+	}
+
+	// CookieJar is the storage abstraction for cookies: GetAll yields a slice
+	// of Cookie, and Set and Delete each take the single Cookie to act on.
+	CookieJar interface {
+		GetAll() ([]Cookie, error)
+		Set(cookie Cookie) error
+		Delete(cookie Cookie) error
+	}
+)
diff --git a/extractor.go b/extractor.go
new file mode 100644
index 0000000..a112fe6
--- /dev/null
+++ b/extractor.go
@@ -0,0 +1,8 @@
+package extractor
+
+import "context"
+
+// Extractor builds an Article from a Source.
+type Extractor interface {
+	Extract(ctx context.Context, src Source) (Article, error)
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..7190b00
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,18 @@
+module gitea.stevedudenhoeffer.com/steve/go-extractor
+
+go 1.23.2
+
+require github.com/playwright-community/playwright-go v0.4802.0
+
+require (
+	github.com/andybalholm/cascadia v1.3.2 // indirect
+	github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
+	github.com/deckarep/golang-set/v2 v2.6.0 // indirect
+	github.com/go-jose/go-jose/v3 v3.0.3 // indirect
+	github.com/go-shiori/dom
v0.0.0-20230515143342-73569d674e1c // indirect
+	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
+	github.com/go-stack/stack v1.8.1 // indirect
+	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
+	golang.org/x/net v0.32.0 // indirect
+	golang.org/x/text v0.21.0 // indirect
+)
diff --git a/playwright.go b/playwright.go
new file mode 100644
index 0000000..e67db5e
--- /dev/null
+++ b/playwright.go
@@ -0,0 +1,291 @@
+package extractor
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"github.com/playwright-community/playwright-go"
+)
+
+// defaultUserAgent is sent when PlayWrightBrowserOptions.UserAgent is left empty.
+const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
+
+// playWrightBrowser is the Browser implementation backed by playwright-go.
+type playWrightBrowser struct {
+	pw        *playwright.Playwright // driver handle, kept so Close can stop it
+	browser   playwright.Browser
+	ctx       playwright.BrowserContext
+	userAgent string
+	timeout   time.Duration
+	cookieJar CookieJar
+}
+
+var _ Browser = playWrightBrowser{}
+
+// PlayWrightBrowserSelection names the browser engine NewPlayWrightBrowser launches.
+type PlayWrightBrowserSelection string
+
+var (
+	// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
+	// Browser option is not one of the PlayWrightBrowserSelection constants.
+	ErrInvalidBrowserSelection = errors.New("invalid browser selection")
+	// ErrInvalidStatusCode is wrapped by Open when the response status is not 200.
+	ErrInvalidStatusCode = errors.New("invalid status code")
+)
+
+const (
+	PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
+	PlayWrightBrowserSelectionFirefox  PlayWrightBrowserSelection = "firefox"
+	PlayWrightBrowserSelectionWebKit   PlayWrightBrowserSelection = "webkit"
+)
+
+// PlayWrightBrowserOptions configures NewPlayWrightBrowser. Fields left at
+// their zero value keep the default.
+type PlayWrightBrowserOptions struct {
+	UserAgent string                     // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
+	Browser   PlayWrightBrowserSelection // If unset defaults to Chromium
+	Timeout   *time.Duration             // If unset defaults to 30 seconds timeout. If set to 0, no timeout
+
+	// CookieJar, if set, is read once when the browser is created to seed the
+	// browser's cookies, and after every successful Open the cookies for the
+	// opened page are written back into it.
+	CookieJar
+}
+
+// cookieToPlaywrightOptionalCookie converts a Cookie into the type that
+// BrowserContext.AddCookies accepts.
+func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
+	out := playwright.OptionalCookie{
+		Name:     cookie.Name,
+		Value:    cookie.Value,
+		Domain:   playwright.String(cookie.Domain),
+		Path:     playwright.String(cookie.Path),
+		Secure:   playwright.Bool(cookie.Secure),
+		HttpOnly: playwright.Bool(cookie.HttpOnly),
+	}
+	// A zero Expires means no expiry was set. Leave the field nil rather than
+	// sending the Unix timestamp of the zero time, which lies far in the past.
+	if !cookie.Expires.IsZero() {
+		out.Expires = playwright.Float(float64(cookie.Expires.Unix()))
+	}
+	return out
+}
+
+// playwrightCookieToCookie converts a cookie reported by Playwright into a Cookie.
+func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
+	out := Cookie{
+		Name:     cookie.Name,
+		Value:    cookie.Value,
+		Domain:   cookie.Domain,
+		Path:     cookie.Path,
+		Secure:   cookie.Secure,
+		HttpOnly: cookie.HttpOnly,
+	}
+	// Playwright documents -1 as the Expires of a session cookie; keep those as
+	// the zero time so they survive a round trip through the jar unchanged.
+	if cookie.Expires > 0 {
+		out.Expires = time.Unix(int64(cookie.Expires), 0)
+	}
+	return out
+}
+
+// NewPlayWrightBrowser calls playwright.Install and playwright.Run, launches
+// the selected browser headless and returns it wrapped as a Browser.
+//
+// Every element of opts is applied in order; a field overrides the current
+// value only when it is non-empty (non-nil for Timeout and CookieJar). When a
+// CookieJar is configured its cookies are added to the new browser context.
+//
+// It returns ErrInvalidBrowserSelection for an unknown Browser value, and
+// otherwise whatever error Playwright or the cookie jar reported. On failure
+// everything started so far is shut down again.
+func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
+	thirtySeconds := 30 * time.Second
+	opt := PlayWrightBrowserOptions{
+		UserAgent: defaultUserAgent,
+		Browser:   PlayWrightBrowserSelectionChromium,
+		Timeout:   &thirtySeconds,
+	}
+
+	for _, o := range opts {
+		if o.UserAgent != "" {
+			opt.UserAgent = o.UserAgent
+		}
+		if o.Browser != "" {
+			opt.Browser = o.Browser
+		}
+		if o.Timeout != nil {
+			opt.Timeout = o.Timeout
+		}
+		if o.CookieJar != nil {
+			opt.CookieJar = o.CookieJar
+		}
+	}
+
+	if err := playwright.Install(); err != nil {
+		return nil, err
+	}
+
+	pw, err := playwright.Run()
+	if err != nil {
+		return nil, err
+	}
+
+	// started holds a teardown step for everything brought up so far, so that a
+	// failure part-way through does not leak the driver or a browser process.
+	started := []func() error{pw.Stop}
+	fail := func(err error) (Browser, error) {
+		for i := len(started) - 1; i >= 0; i-- {
+			_ = started[i]() // best effort: err is the failure worth reporting
+		}
+		return nil, err
+	}
+
+	var bt playwright.BrowserType
+	switch opt.Browser {
+	case PlayWrightBrowserSelectionChromium:
+		bt = pw.Chromium
+	case PlayWrightBrowserSelectionFirefox:
+		bt = pw.Firefox
+	case PlayWrightBrowserSelectionWebKit:
+		bt = pw.WebKit
+	default:
+		return fail(ErrInvalidBrowserSelection)
+	}
+
+	browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
+		Headless: playwright.Bool(true),
+	})
+	if err != nil {
+		return fail(err)
+	}
+	started = append(started, func() error { return browser.Close() })
+
+	c, err := browser.NewContext(playwright.BrowserNewContextOptions{
+		UserAgent: playwright.String(opt.UserAgent),
+	})
+	if err != nil {
+		return fail(err)
+	}
+	started = append(started, func() error { return c.Close() })
+
+	if opt.CookieJar != nil {
+		cookies, err := opt.CookieJar.GetAll()
+		if err != nil {
+			return fail(fmt.Errorf("error getting cookies from cookie jar: %w", err))
+		}
+
+		pwCookies := make([]playwright.OptionalCookie, len(cookies))
+		for i, cookie := range cookies {
+			pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
+		}
+
+		if err := c.AddCookies(pwCookies); err != nil {
+			return fail(fmt.Errorf("error adding cookies to browser: %w", err))
+		}
+	}
+
+	return playWrightBrowser{
+		pw:        pw,
+		browser:   browser,
+		userAgent: opt.UserAgent,
+		timeout:   *opt.Timeout,
+		cookieJar: opt.CookieJar,
+		ctx:       c,
+	}, nil
+}
+
+// Open loads url in a fresh page and returns the response body as a Source.
+//
+// ctx is checked before navigating and its deadline, if any, caps the
+// navigation timeout; cancellation while a navigation is already in flight is
+// not observed. A status other than 200 yields an error wrapping
+// ErrInvalidStatusCode. When a cookie jar is configured, the cookies for the
+// page's final URL are stored in it before returning.
+func (b playWrightBrowser) Open(ctx context.Context, url string) (Source, error) {
+	if ctx == nil {
+		// The context used to be ignored entirely, so a nil one must keep working.
+		ctx = context.Background()
+	}
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	page, err := b.ctx.NewPage()
+	if err != nil {
+		return nil, err
+	}
+	// Every call creates its own page; close it so pages do not accumulate in
+	// the shared browser context. The close error is not actionable here.
+	defer func() { _ = page.Close() }()
+
+	// Always pass a timeout explicitly: Playwright reads 0 as "no timeout" but
+	// falls back to its own default when the option is omitted.
+	timeout := max(b.timeout, 0)
+	if deadline, ok := ctx.Deadline(); ok {
+		// Clamp to 1ms because a computed 0 would switch the timeout off.
+		if remaining := max(time.Until(deadline), time.Millisecond); timeout == 0 || remaining < timeout {
+			timeout = remaining
+		}
+	}
+	ms := float64(timeout.Milliseconds())
+
+	resp, err := page.Goto(url, playwright.PageGotoOptions{
+		WaitUntil: playwright.WaitUntilStateLoad,
+		Timeout:   &ms,
+	})
+	if err != nil {
+		return nil, err
+	}
+	if resp == nil {
+		// Playwright reports no response for some navigations (about:blank,
+		// same-document hash changes); there is no body to return then.
+		return nil, fmt.Errorf("no response received for %q", url)
+	}
+
+	slog.Info("response", "response", resp)
+
+	if status := resp.Status(); status != 200 {
+		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
+	}
+
+	text, err := resp.Text()
+	if err != nil {
+		return nil, err
+	}
+
+	if b.cookieJar != nil {
+		cookies, err := page.Context().Cookies(page.URL())
+		if err != nil {
+			return nil, fmt.Errorf("error getting cookies from browser: %w", err)
+		}
+
+		for _, cookie := range cookies {
+			// TODO: add support for deleting cookies from the jar which are deleted in the browser
+			if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
+				return nil, fmt.Errorf("error setting cookie in cookie jar: %w", err)
+			}
+		}
+	}
+
+	return source{
+		sourceUrl: url,
+		content:   text,
+	}, nil
+}
+
+// Close closes the browser context and the browser and stops the Playwright
+// driver, returning all errors joined together.
+func (b playWrightBrowser) Close() error {
+	errs := []error{
+		b.ctx.Close(),
+		b.browser.Close(),
+	}
+	if b.pw != nil {
+		errs = append(errs, b.pw.Stop())
+	}
+	return errors.Join(errs...)
+}
diff --git a/processor.go
b/processor.go
new file mode 100644
index 0000000..4f5e655
--- /dev/null
+++ b/processor.go
@@ -0,0 +1,12 @@
+package extractor
+
+import "context"
+
+// Processor turns one source into another.
+//
+// NOTE(review): Process is declared in terms of the unexported source struct,
+// so no type outside this package can implement Processor. Confirm whether the
+// exported Source interface was intended before changing the signature.
+type Processor interface {
+	Process(ctx context.Context, src source) (source, error)
+}
diff --git a/readability.go b/readability.go
new file mode 100644
index 0000000..68ff195
--- /dev/null
+++ b/readability.go
@@ -0,0 +1,56 @@
+package extractor
+
+import (
+	"context"
+	"net/url"
+
+	"github.com/go-shiori/go-readability"
+)
+
+// Readability is an Extractor backed by github.com/go-shiori/go-readability.
+//
+// NOTE(review): nothing in this file assigns the embedded Extractor, so it is
+// nil on a zero-value Readability; Extract below is what callers reach.
+type Readability struct {
+	Extractor
+}
+
+var _ Extractor = Readability{}
+
+// Extract runs go-readability over src and copies the result into an Article.
+//
+// src.URL() must parse as a URL; it is handed to readability.FromReader along
+// with src.Reader(). PublishedTime is the parsed publication time converted to
+// UTC and formatted as "2006-01-02T15:04:05Z", or empty when a.PublishedTime
+// is nil. Article.Dir is left empty. Errors from url.Parse and
+// readability.FromReader are returned unchanged.
+func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
+	u, err := url.Parse(src.URL())
+	if err != nil {
+		return Article{}, err
+	}
+
+	a, err := readability.FromReader(src.Reader(), u)
+	if err != nil {
+		return Article{}, err
+	}
+
+	var pubTime string
+	if a.PublishedTime != nil {
+		// The layout ends in a literal "Z", so the time has to be converted to
+		// UTC first; formatting a zoned time as-is would mislabel its offset.
+		pubTime = a.PublishedTime.UTC().Format("2006-01-02T15:04:05Z")
+	}
+
+	return Article{
+		Title:         a.Title,
+		Content:       a.Content,
+		TextContent:   a.TextContent,
+		Length:        a.Length,
+		Excerpt:       a.Excerpt,
+		Byline:        a.Byline,
+		SiteName:      a.SiteName,
+		Lang:          a.Language,
+		PublishedTime: pubTime,
+	}, nil
+}
diff --git a/source.go b/source.go
new file mode 100644
index 0000000..21fc0a9
--- /dev/null
+++ b/source.go
@@ -0,0 +1,35 @@
+package extractor
+
+import (
+	"io"
+	"strings"
+)
+
+// Source exposes a URL together with content that can be read either as a
+// string or through an io.Reader.
+type Source interface {
+	URL() string
+	String() string
+	Reader() io.Reader
+}
+
+// source is the in-memory Source implementation; content is held as a string.
+type source struct {
+	sourceUrl string
+	content   string
+}
+
+// URL returns the stored sourceUrl.
+func (s source) URL() string {
+	return s.sourceUrl
+}
+
+// String returns the stored content.
+func (s source) String() string {
+	return s.content
+}
+
+// Reader returns a new strings.Reader over the content on every call.
+func (s source) Reader() io.Reader {
+	return strings.NewReader(s.content)
+}