initial commit
This commit is contained in:
commit
cbd6682257
14
article.go
Normal file
14
article.go
Normal file
@ -0,0 +1,14 @@
|
||||
package extractor
|
||||
|
||||
// Article holds the result of an extraction: the main content of a page and
// the metadata found for it.
type Article struct {
	// Title is the title of the article.
	Title string

	// Content is the extracted article content.
	// NOTE(review): presumably HTML markup — confirm against the extractor used.
	Content string

	// TextContent is the extracted article content.
	// NOTE(review): presumably the plain-text form of Content — confirm.
	TextContent string

	// Length is the length of the article as reported by the extractor.
	Length int

	// Excerpt is a short summary of the article.
	Excerpt string

	// Byline is the author credit of the article.
	Byline string

	// Dir is presumably the text direction of the content (e.g. "ltr").
	Dir string

	// SiteName is the name of the site the article was published on.
	SiteName string

	// Lang is the language of the article.
	Lang string

	// PublishedTime is the publication time rendered as a string; it is empty
	// when no publication time is known.
	PublishedTime string
}
|
12
browser.go
Normal file
12
browser.go
Normal file
@ -0,0 +1,12 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
)
|
||||
|
||||
type Browser interface {
|
||||
io.Closer
|
||||
|
||||
Open(ctx context.Context, url string) (Source, error)
|
||||
}
|
20
cookiejar.go
Normal file
20
cookiejar.go
Normal file
@ -0,0 +1,20 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Cookie is a browser-independent representation of a single HTTP cookie as
// it is stored in a CookieJar.
type Cookie struct {
	// Name is the cookie name.
	Name string

	// Value is the cookie value.
	Value string

	// Domain is the domain the cookie belongs to.
	Domain string

	// Path is the URL path the cookie belongs to.
	Path string

	// Expires is the time at which the cookie expires.
	Expires time.Time

	// Secure is the cookie's Secure attribute.
	Secure bool

	// HttpOnly is the cookie's HttpOnly attribute.
	HttpOnly bool
}
|
||||
type CookieJar interface {
|
||||
GetAll() ([]Cookie, error)
|
||||
Set(cookie Cookie) error
|
||||
Delete(cookie Cookie) error
|
||||
}
|
7
extractor.go
Normal file
7
extractor.go
Normal file
@ -0,0 +1,7 @@
|
||||
package extractor
|
||||
|
||||
import "context"
|
||||
|
||||
type Extractor interface {
|
||||
Extract(ctx context.Context, src Source) (Article, error)
|
||||
}
|
18
go.mod
Normal file
18
go.mod
Normal file
@ -0,0 +1,18 @@
|
||||
module gitea.stevedudenhoeffer.com/steve/go-extractor
|
||||
|
||||
go 1.23.2
|
||||
|
||||
require github.com/playwright-community/playwright-go v0.4802.0
|
||||
|
||||
require (
|
||||
github.com/andybalholm/cascadia v1.3.2 // indirect
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
|
||||
github.com/deckarep/golang-set/v2 v2.6.0 // indirect
|
||||
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
|
||||
github.com/go-stack/stack v1.8.1 // indirect
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||
golang.org/x/net v0.32.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
)
|
218
playwright.go
Normal file
218
playwright.go
Normal file
@ -0,0 +1,218 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
type playWrightBrowser struct {
|
||||
browser playwright.Browser
|
||||
ctx playwright.BrowserContext
|
||||
userAgent string
|
||||
timeout time.Duration
|
||||
cookieJar CookieJar
|
||||
}
|
||||
|
||||
var _ Browser = playWrightBrowser{}
|
||||
|
||||
// PlayWrightBrowserSelection names the browser engine Playwright should
// launch.
type PlayWrightBrowserSelection string

// The browser engines that can be selected.
const (
	PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
	PlayWrightBrowserSelectionFirefox  PlayWrightBrowserSelection = "firefox"
	PlayWrightBrowserSelectionWebKit   PlayWrightBrowserSelection = "webkit"
)

// Sentinel errors returned by the Playwright browser; compare with errors.Is.
var (
	// ErrInvalidBrowserSelection is returned when the requested browser is
	// not one of the PlayWrightBrowserSelection constants.
	ErrInvalidBrowserSelection = errors.New("invalid browser selection")

	// ErrInvalidStatusCode is returned when a page responds with a status
	// code other than 200.
	ErrInvalidStatusCode = errors.New("invalid status code")
)
|
||||
|
||||
type PlayWrightBrowserOptions struct {
|
||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
|
||||
Browser PlayWrightBrowserSelection // If unset defaults to Chromium
|
||||
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
||||
|
||||
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
||||
// browser into the cookie jar for each request.
|
||||
CookieJar
|
||||
}
|
||||
|
||||
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
||||
return playwright.OptionalCookie{
|
||||
Name: cookie.Name,
|
||||
Value: cookie.Value,
|
||||
Domain: playwright.String(cookie.Domain),
|
||||
Path: playwright.String(cookie.Path),
|
||||
Expires: playwright.Float(float64(cookie.Expires.Unix())),
|
||||
HttpOnly: playwright.Bool(cookie.HttpOnly),
|
||||
}
|
||||
}
|
||||
|
||||
func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
||||
return Cookie{
|
||||
Name: cookie.Name,
|
||||
Value: cookie.Value,
|
||||
Domain: cookie.Domain,
|
||||
Path: cookie.Path,
|
||||
Expires: time.Unix(int64(cookie.Expires), 0),
|
||||
HttpOnly: cookie.HttpOnly,
|
||||
}
|
||||
}
|
||||
|
||||
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
var thirtySeconds = 30 * time.Second
|
||||
opt := PlayWrightBrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3",
|
||||
Browser: PlayWrightBrowserSelectionChromium,
|
||||
Timeout: &thirtySeconds,
|
||||
}
|
||||
|
||||
for _, o := range opts {
|
||||
if o.UserAgent != "" {
|
||||
opt.UserAgent = o.UserAgent
|
||||
}
|
||||
if o.Browser != "" {
|
||||
opt.Browser = o.Browser
|
||||
}
|
||||
if o.Timeout != nil {
|
||||
opt.Timeout = o.Timeout
|
||||
}
|
||||
if o.CookieJar != nil {
|
||||
opt.CookieJar = o.CookieJar
|
||||
}
|
||||
}
|
||||
|
||||
err := playwright.Install()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pw, err := playwright.Run()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
|
||||
switch opt.Browser {
|
||||
case PlayWrightBrowserSelectionChromium:
|
||||
bt = pw.Chromium
|
||||
|
||||
case PlayWrightBrowserSelectionFirefox:
|
||||
bt = pw.Firefox
|
||||
|
||||
case PlayWrightBrowserSelectionWebKit:
|
||||
bt = pw.WebKit
|
||||
|
||||
default:
|
||||
return nil, ErrInvalidBrowserSelection
|
||||
}
|
||||
|
||||
browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(true),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||
UserAgent: playwright.String(opt.UserAgent),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if opt.CookieJar != nil {
|
||||
cookies, err := opt.CookieJar.GetAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
||||
}
|
||||
|
||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||
|
||||
for i, cookie := range cookies {
|
||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
|
||||
}
|
||||
|
||||
err = c.AddCookies(pwCookies)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return playWrightBrowser{
|
||||
browser: browser,
|
||||
userAgent: opt.UserAgent,
|
||||
timeout: *opt.Timeout,
|
||||
cookieJar: opt.CookieJar,
|
||||
ctx: c,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) Open(_ context.Context, url string) (Source, error) {
|
||||
if b.userAgent == "" {
|
||||
b.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3"
|
||||
}
|
||||
|
||||
page, err := b.ctx.NewPage()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts := playwright.PageGotoOptions{
|
||||
WaitUntil: playwright.WaitUntilStateLoad,
|
||||
}
|
||||
|
||||
if b.timeout > 0 {
|
||||
var ms = float64(b.timeout.Milliseconds())
|
||||
opts.Timeout = &ms
|
||||
}
|
||||
resp, err := page.Goto(url, opts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Info("response", "response", resp)
|
||||
|
||||
if resp.Status() != 200 {
|
||||
return nil, fmt.Errorf("%w: %d", ErrInvalidStatusCode, resp.Status)
|
||||
}
|
||||
|
||||
text, err := resp.Text()
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if b.cookieJar != nil {
|
||||
cookies, err := page.Context().Cookies(page.URL())
|
||||
|
||||
for _, cookie := range cookies {
|
||||
// TODO: add support for deleting cookies from the jar which are deleted in the browser
|
||||
err = b.cookieJar.Set(playwrightCookieToCookie(cookie))
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error setting cookie in cookie jar: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return source{
|
||||
sourceUrl: url,
|
||||
content: text,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) Close() error {
|
||||
return errors.Join(
|
||||
b.ctx.Close(),
|
||||
b.browser.Close(),
|
||||
)
|
||||
}
|
7
processor.go
Normal file
7
processor.go
Normal file
@ -0,0 +1,7 @@
|
||||
package extractor
|
||||
|
||||
import "context"
|
||||
|
||||
type Processor interface {
|
||||
Process(ctx context.Context, src source) (source, error)
|
||||
}
|
45
readability.go
Normal file
45
readability.go
Normal file
@ -0,0 +1,45 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
|
||||
"github.com/go-shiori/go-readability"
|
||||
)
|
||||
|
||||
type Readability struct {
|
||||
Extractor
|
||||
}
|
||||
|
||||
var _ Extractor = Readability{}
|
||||
|
||||
func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
|
||||
u, err := url.Parse(src.URL())
|
||||
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
a, err := readability.FromReader(src.Reader(), u)
|
||||
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
pubTime := ""
|
||||
|
||||
if a.PublishedTime != nil {
|
||||
pubTime = a.PublishedTime.Format("2006-01-02T15:04:05Z")
|
||||
}
|
||||
return Article{
|
||||
Title: a.Title,
|
||||
Content: a.Content,
|
||||
TextContent: a.TextContent,
|
||||
Length: a.Length,
|
||||
Excerpt: a.Excerpt,
|
||||
Byline: a.Byline,
|
||||
SiteName: a.SiteName,
|
||||
Lang: a.Language,
|
||||
PublishedTime: pubTime,
|
||||
}, nil
|
||||
|
||||
}
|
29
source.go
Normal file
29
source.go
Normal file
@ -0,0 +1,29 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Source is a fetched document together with the URL it was requested from.
type Source interface {
	// URL returns the URL the document was requested from.
	URL() string

	// String returns the document's content as a string.
	String() string

	// Reader returns an io.Reader over the document's content.
	Reader() io.Reader
}
|
||||
|
||||
// source is the in-memory Source implementation: a document's content kept
// as a string next to the URL it was requested with.
type source struct {
	// sourceUrl is the URL the document was requested with.
	sourceUrl string

	// content is the full document body.
	content string
}

// URL returns the URL the document was requested with.
func (src source) URL() string { return src.sourceUrl }

// String returns the full document body.
func (src source) String() string { return src.content }

// Reader returns a new reader over the document body. Every call starts at
// the beginning of the content.
func (src source) Reader() io.Reader {
	body := strings.NewReader(src.content)
	return body
}
|
Loading…
x
Reference in New Issue
Block a user