initial commit

This commit is contained in:
Steve Dudenhoeffer 2024-12-07 03:53:46 -05:00
commit cbd6682257
9 changed files with 370 additions and 0 deletions

14
article.go Normal file
View File

@ -0,0 +1,14 @@
package extractor
// Article holds the fields an Extractor produces for a single document.
type Article struct {
// Title is the document title reported by the extractor.
Title string
// Content is the extracted article body; presumably still containing markup — TODO confirm against the extractor in use.
Content string
// TextContent is the extracted body as text; presumably Content without markup — TODO confirm.
TextContent string
// Length is the content length as reported by the extractor.
Length int
// Excerpt is a short summary or description of the article.
Excerpt string
// Byline is the author credit, when the extractor finds one.
Byline string
// Dir is presumably the text direction of the content — TODO confirm.
// Readability.Extract in this package does not populate it.
Dir string
// SiteName is the name of the publishing site.
SiteName string
// Lang is the content language as reported by the extractor.
Lang string
// PublishedTime is the publish timestamp as a string. Readability.Extract
// formats it with the layout "2006-01-02T15:04:05Z" and leaves it empty
// when no publish time is available.
PublishedTime string
}

12
browser.go Normal file
View File

@ -0,0 +1,12 @@
package extractor
import (
"context"
"io"
)
// Browser fetches documents by URL and hands them back as a Source.
// It embeds io.Closer, so callers are expected to Close it when finished.
type Browser interface {
io.Closer
// Open loads url and returns its content as a Source, or an error if the
// document could not be retrieved.
Open(ctx context.Context, url string) (Source, error)
}

20
cookiejar.go Normal file
View File

@ -0,0 +1,20 @@
package extractor
import (
"time"
)
// Cookie is the package's browser-independent representation of an HTTP
// cookie, used to move cookies between a CookieJar and a Browser.
type Cookie struct {
// Name and Value are the cookie's key/value pair.
Name string
Value string
// Domain and Path scope the cookie.
Domain string
Path string
// Expires is the expiry instant of the cookie.
Expires time.Time
// Secure and HttpOnly mirror the cookie attributes of the same names.
Secure bool
HttpOnly bool
}
// CookieJar is a persistent store of cookies that a Browser can be seeded
// from and can write back to.
type CookieJar interface {
// GetAll returns every cookie currently held by the jar.
GetAll() ([]Cookie, error)
// Set stores cookie in the jar.
Set(cookie Cookie) error
// Delete removes cookie from the jar.
Delete(cookie Cookie) error
}

7
extractor.go Normal file
View File

@ -0,0 +1,7 @@
package extractor
import "context"
// Extractor turns a fetched Source into a structured Article.
type Extractor interface {
// Extract parses src and returns the resulting Article, or an error if
// the content could not be processed.
Extract(ctx context.Context, src Source) (Article, error)
}

18
go.mod Normal file
View File

@ -0,0 +1,18 @@
module gitea.stevedudenhoeffer.com/steve/go-extractor

go 1.23.2

// Modules imported directly by packages in this module. go-readability is
// imported by readability.go, so it belongs here rather than among the
// indirect requirements.
require (
	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
	github.com/playwright-community/playwright-go v0.4802.0
)

require (
	github.com/andybalholm/cascadia v1.3.2 // indirect
	github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
	github.com/deckarep/golang-set/v2 v2.6.0 // indirect
	github.com/go-jose/go-jose/v3 v3.0.3 // indirect
	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
	github.com/go-stack/stack v1.8.1 // indirect
	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
	golang.org/x/net v0.32.0 // indirect
	golang.org/x/text v0.21.0 // indirect
)

218
playwright.go Normal file
View File

@ -0,0 +1,218 @@
package extractor
import (
"context"
"errors"
"fmt"
"log/slog"
"time"
"github.com/playwright-community/playwright-go"
)
// playWrightBrowser is the Browser implementation backed by playwright-go.
// Instances are built by NewPlayWrightBrowser.
type playWrightBrowser struct {
// browser is the launched playwright browser; Close closes it.
browser playwright.Browser
// ctx is the playwright browser context (not a context.Context) in which
// Open creates its pages.
ctx playwright.BrowserContext
// userAgent is the User-Agent string the browser context was created with.
userAgent string
// timeout is the navigation timeout Open applies to page.Goto.
timeout time.Duration
// cookieJar, when non-nil, receives the page's cookies after a successful Open.
cookieJar CookieJar
}
// Compile-time check that playWrightBrowser satisfies Browser.
var _ Browser = playWrightBrowser{}
// PlayWrightBrowserSelection names the browser engine NewPlayWrightBrowser
// should launch. Use one of the PlayWrightBrowserSelection* constants.
type PlayWrightBrowserSelection string
// Sentinel errors returned by the playwright-backed Browser; match them with errors.Is.
var (
// ErrInvalidBrowserSelection is returned by NewPlayWrightBrowser when the
// requested browser is not one of the known selections.
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
// ErrInvalidStatusCode is wrapped and returned by Open when the response
// status is anything other than 200.
ErrInvalidStatusCode = errors.New("invalid status code")
)
// Browser engines accepted by NewPlayWrightBrowser.
const (
// PlayWrightBrowserSelectionChromium selects Chromium (the default).
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
// PlayWrightBrowserSelectionFirefox selects Firefox.
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
// PlayWrightBrowserSelectionWebKit selects WebKit.
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
)
// PlayWrightBrowserOptions configures NewPlayWrightBrowser. Every field is
// optional; fields left at their zero value fall back to the defaults noted
// on each field. When several option values are passed to
// NewPlayWrightBrowser, later non-zero fields override earlier ones.
type PlayWrightBrowserOptions struct {
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
Browser PlayWrightBrowserSelection // If unset defaults to Chromium
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
// browser into the cookie jar for each request.
CookieJar
}
// cookieToPlaywrightOptionalCookie converts a jar Cookie into the
// playwright.OptionalCookie form accepted by BrowserContext.AddCookies.
//
// Secure is forwarded along with HttpOnly. A zero Expires means "no expiry
// recorded" and is left unset so the cookie is added as a session cookie;
// sending the Unix timestamp of the zero time.Time would produce a large
// negative value instead of a usable expiry.
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
	out := playwright.OptionalCookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   playwright.String(cookie.Domain),
		Path:     playwright.String(cookie.Path),
		HttpOnly: playwright.Bool(cookie.HttpOnly),
		Secure:   playwright.Bool(cookie.Secure),
	}
	if !cookie.Expires.IsZero() {
		out.Expires = playwright.Float(float64(cookie.Expires.Unix()))
	}
	return out
}

// playwrightCookieToCookie converts a cookie reported by playwright into the
// package's Cookie type.
//
// Secure is carried over along with HttpOnly. A negative Expires (playwright
// reports -1 for session cookies) leaves Cookie.Expires at its zero value,
// which cookieToPlaywrightOptionalCookie maps back to "no expiry".
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
	out := Cookie{
		Name:     cookie.Name,
		Value:    cookie.Value,
		Domain:   cookie.Domain,
		Path:     cookie.Path,
		Secure:   cookie.Secure,
		HttpOnly: cookie.HttpOnly,
	}
	if cookie.Expires >= 0 {
		out.Expires = time.Unix(int64(cookie.Expires), 0)
	}
	return out
}
// NewPlayWrightBrowser installs and starts the playwright driver, launches a
// headless browser, and returns a Browser backed by a single browser context.
//
// Any number of option values may be passed; they are applied in order and
// each non-zero field overrides the value set so far. Defaults are a desktop
// Chrome User-Agent, Chromium, and a 30 second timeout.
//
// If a CookieJar is configured, all of its cookies are added to the browser
// context before the Browser is returned.
//
// It returns ErrInvalidBrowserSelection when the requested browser is not one
// of the known selections. This is checked before the driver is installed or
// started. On any later failure the browser and driver started here are shut
// down again before the error is returned.
//
// NOTE(review): the driver handle returned by playwright.Run is not stored on
// the returned Browser, so Close has no way to stop the driver process.
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
	thirtySeconds := 30 * time.Second
	opt := PlayWrightBrowserOptions{
		UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3",
		Browser:   PlayWrightBrowserSelectionChromium,
		Timeout:   &thirtySeconds,
	}

	for _, o := range opts {
		if o.UserAgent != "" {
			opt.UserAgent = o.UserAgent
		}
		if o.Browser != "" {
			opt.Browser = o.Browser
		}
		if o.Timeout != nil {
			opt.Timeout = o.Timeout
		}
		if o.CookieJar != nil {
			opt.CookieJar = o.CookieJar
		}
	}

	// Reject unknown selections up front, before paying for Install/Run and
	// before there is anything to clean up.
	switch opt.Browser {
	case PlayWrightBrowserSelectionChromium,
		PlayWrightBrowserSelectionFirefox,
		PlayWrightBrowserSelectionWebKit:
	default:
		return nil, ErrInvalidBrowserSelection
	}

	if err := playwright.Install(); err != nil {
		return nil, err
	}

	pw, err := playwright.Run()
	if err != nil {
		return nil, err
	}

	// From here on a driver process is running. Unless construction succeeds,
	// tear down whatever was started so a failed call leaves nothing behind.
	var (
		browser playwright.Browser
		done    bool
	)
	defer func() {
		if done {
			return
		}
		// Cleanup errors are dropped on purpose: the caller needs the error
		// that caused the failure, not a secondary shutdown error.
		if browser != nil {
			_ = browser.Close()
		}
		_ = pw.Stop()
	}()

	var bt playwright.BrowserType
	switch opt.Browser {
	case PlayWrightBrowserSelectionFirefox:
		bt = pw.Firefox
	case PlayWrightBrowserSelectionWebKit:
		bt = pw.WebKit
	default: // Chromium; the selection was validated above.
		bt = pw.Chromium
	}

	browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(true),
	})
	if err != nil {
		return nil, err
	}

	c, err := browser.NewContext(playwright.BrowserNewContextOptions{
		UserAgent: playwright.String(opt.UserAgent),
	})
	if err != nil {
		return nil, err
	}

	if opt.CookieJar != nil {
		cookies, err := opt.CookieJar.GetAll()
		if err != nil {
			return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
		}

		pwCookies := make([]playwright.OptionalCookie, len(cookies))
		for i, cookie := range cookies {
			pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
		}

		if err := c.AddCookies(pwCookies); err != nil {
			return nil, fmt.Errorf("error adding cookies to browser: %w", err)
		}
	}

	done = true
	return playWrightBrowser{
		browser:   browser,
		userAgent: opt.UserAgent,
		timeout:   *opt.Timeout,
		cookieJar: opt.CookieJar,
		ctx:       c,
	}, nil
}
// Open navigates a new page to url, waits for the load event, and returns the
// response body as a Source.
//
// The context argument is currently not consulted; the navigation is bounded
// by the browser's configured timeout instead. A timeout of 0 is passed to
// playwright explicitly, which disables the navigation timeout; leaving it
// unset would apply playwright's own default.
//
// Open returns an error wrapping ErrInvalidStatusCode when the response status
// is not 200, and an error when the navigation yields no response at all.
// When a cookie jar is configured, the cookies visible to the final page URL
// are written to it before returning.
//
// The page is closed before Open returns, so repeated calls do not accumulate
// open pages in the browser context.
func (b playWrightBrowser) Open(_ context.Context, url string) (Source, error) {
	page, err := b.ctx.NewPage()
	if err != nil {
		return nil, err
	}
	// The page is only needed until its body and cookies have been read. A
	// failure to close it does not invalidate the result, so it is ignored.
	defer func() { _ = page.Close() }()

	opts := playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateLoad,
	}
	if b.timeout >= 0 {
		opts.Timeout = playwright.Float(float64(b.timeout.Milliseconds()))
	}

	resp, err := page.Goto(url, opts)
	if err != nil {
		return nil, err
	}
	// Goto can succeed without a response (for example about:blank), in which
	// case there is neither a status nor a body to read.
	if resp == nil {
		return nil, fmt.Errorf("no response received navigating to %q", url)
	}

	slog.Info("response", "response", resp)

	if status := resp.Status(); status != 200 {
		return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, status)
	}

	text, err := resp.Text()
	if err != nil {
		return nil, err
	}

	if b.cookieJar != nil {
		cookies, err := page.Context().Cookies(page.URL())
		if err != nil {
			return nil, fmt.Errorf("error getting cookies from browser: %w", err)
		}

		for _, cookie := range cookies {
			// TODO: add support for deleting cookies from the jar which are deleted in the browser
			if err := b.cookieJar.Set(playwrightCookieToCookie(cookie)); err != nil {
				return nil, fmt.Errorf("error setting cookie in cookie jar: %w", err)
			}
		}
	}

	return source{
		sourceUrl: url,
		content:   text,
	}, nil
}
// Close shuts down the browser context first and the browser second. Both
// steps always run; whatever errors they produce are combined into the
// returned error, which is nil when both succeed.
func (b playWrightBrowser) Close() error {
	ctxErr := b.ctx.Close()
	browserErr := b.browser.Close()

	return errors.Join(ctxErr, browserErr)
}

7
processor.go Normal file
View File

@ -0,0 +1,7 @@
package extractor
import "context"
// Processor transforms one fetched document into another.
//
// NOTE(review): the signature uses the unexported source type rather than the
// exported Source interface, so code outside this package cannot name the
// type and therefore cannot implement Processor — confirm whether Source was
// intended.
type Processor interface {
// Process takes src and returns the processed result, or an error.
Process(ctx context.Context, src source) (source, error)
}

45
readability.go Normal file
View File

@ -0,0 +1,45 @@
package extractor
import (
"context"
"net/url"
"github.com/go-shiori/go-readability"
)
// Readability is an Extractor backed by github.com/go-shiori/go-readability.
//
// It embeds the Extractor interface, but its own Extract method is what gets
// called; the embedded value is not used by that method and may be left nil.
type Readability struct {
Extractor
}
// Compile-time check that Readability satisfies Extractor.
var _ Extractor = Readability{}
// Extract runs go-readability over src and maps the result onto an Article.
//
// The source URL is parsed first and handed to go-readability together with
// the source's reader; a URL that fails to parse or a readability failure is
// returned as the error with a zero Article. The context argument is not
// consulted.
//
// PublishedTime is rendered as "2006-01-02T15:04:05Z" and is empty when the
// extractor reports no publish time. The trailing "Z" in that layout is a
// literal, so the time is converted to UTC before formatting; otherwise a
// timestamp carrying another zone offset would be labelled as UTC while still
// showing its local wall-clock time. Article.Dir is not populated.
func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
	u, err := url.Parse(src.URL())
	if err != nil {
		return Article{}, err
	}

	a, err := readability.FromReader(src.Reader(), u)
	if err != nil {
		return Article{}, err
	}

	pubTime := ""
	if a.PublishedTime != nil {
		pubTime = a.PublishedTime.UTC().Format("2006-01-02T15:04:05Z")
	}

	return Article{
		Title:         a.Title,
		Content:       a.Content,
		TextContent:   a.TextContent,
		Length:        a.Length,
		Excerpt:       a.Excerpt,
		Byline:        a.Byline,
		SiteName:      a.SiteName,
		Lang:          a.Language,
		PublishedTime: pubTime,
	}, nil
}

29
source.go Normal file
View File

@ -0,0 +1,29 @@
package extractor
import (
"io"
"strings"
)
// Source is a fetched document: its content plus the URL it came from.
type Source interface {
// URL returns the address the content was loaded from.
URL() string
// String returns the whole content as a string.
String() string
// Reader returns an io.Reader over the content.
Reader() io.Reader
}
// source is the package's in-memory Source: a URL paired with the document
// body that was fetched from it.
type source struct {
	sourceUrl string // address the body was loaded from
	content   string // complete document body
}

// URL reports the address this source was loaded from.
func (src source) URL() string { return src.sourceUrl }

// String yields the complete body.
func (src source) String() string { return src.content }

// Reader hands back a new reader positioned at the start of the body; every
// call produces an independent reader.
func (src source) Reader() io.Reader {
	body := src.String()
	return strings.NewReader(body)
}