refactor: restructure API, deduplicate code, expand test coverage
- Extract shared DeferClose helper, removing 14 duplicate copies
- Rename PlayWright-prefixed types to cleaner names (BrowserOptions, BrowserSelection, NewBrowser, etc.)
- Rename fields: ServerAddress (was PlayWrightServerAddress), RequireServer (was DontLaunchOnConnectFailure)
- Extract shared initBrowser/mergeOptions into browser_init.go, deduplicating ~120 lines between NewBrowser and NewInteractiveBrowser
- Remove unused locator field from document struct
- Add tests for all previously untested packages (archive, aislegopher, wegmans, useragents, powerball) and expand existing test suites
- Add MIGRATION.md documenting all breaking API changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
37
MIGRATION.md
Normal file
37
MIGRATION.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Migration Guide
|
||||
|
||||
This guide documents all breaking API changes from the restructuring of go-extractor.
|
||||
|
||||
All core interfaces (`Browser`, `Document`, `Node`, `CookieJar`, `InteractiveBrowser`) are **unchanged**.
|
||||
|
||||
## Type and Function Renames
|
||||
|
||||
```
|
||||
extractor.NewPlayWrightBrowser -> extractor.NewBrowser
|
||||
extractor.PlayWrightBrowserOptions -> extractor.BrowserOptions
|
||||
extractor.PlayWrightBrowserSelection -> extractor.BrowserSelection
|
||||
|
||||
extractor.PlayWrightBrowserSelectionChromium -> extractor.BrowserChromium
|
||||
extractor.PlayWrightBrowserSelectionFirefox -> extractor.BrowserFirefox
|
||||
extractor.PlayWrightBrowserSelectionWebKit -> extractor.BrowserWebKit
|
||||
```
|
||||
|
||||
## Field Renames (inside BrowserOptions)
|
||||
|
||||
```
|
||||
.PlayWrightServerAddress -> .ServerAddress
|
||||
.DontLaunchOnConnectFailure -> .RequireServer
|
||||
```
|
||||
|
||||
The `RequireServer` field is semantically identical to `DontLaunchOnConnectFailure`:
|
||||
|
||||
- Old: `DontLaunchOnConnectFailure: true` meant "fail if can't connect to server"
|
||||
- New: `RequireServer: true` means the same thing
|
||||
|
||||
## New Helper
|
||||
|
||||
```go
|
||||
extractor.DeferClose(closer)
|
||||
```
|
||||
|
||||
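`DeferClose` is a nil-safe helper intended for use in `defer` statements: it calls `Close` on a non-nil `io.Closer` and discards the returned error. It replaces the `deferClose` functions that were previously copy-pasted across packages.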
|
||||
29
article_test.go
Normal file
29
article_test.go
Normal file
@@ -0,0 +1,29 @@
|
||||
package extractor
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestArticle_ZeroValue(t *testing.T) {
|
||||
var a Article
|
||||
if a.Title != "" || a.Content != "" || a.Length != 0 {
|
||||
t.Error("zero-value Article should have empty fields")
|
||||
}
|
||||
}
|
||||
|
||||
func TestArticle_FieldAssignment(t *testing.T) {
|
||||
a := Article{
|
||||
Title: "Test Title",
|
||||
Content: "<p>hello</p>",
|
||||
TextContent: "hello",
|
||||
Length: 5,
|
||||
Excerpt: "hello",
|
||||
Byline: "Author",
|
||||
SiteName: "Example",
|
||||
Lang: "en",
|
||||
}
|
||||
if a.Title != "Test Title" {
|
||||
t.Errorf("Title = %q, want %q", a.Title, "Test Title")
|
||||
}
|
||||
if a.Length != 5 {
|
||||
t.Errorf("Length = %d, want 5", a.Length)
|
||||
}
|
||||
}
|
||||
160
browser_init.go
Normal file
160
browser_init.go
Normal file
@@ -0,0 +1,160 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
// browserInitResult holds the result of shared browser initialization.
|
||||
type browserInitResult struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
bctx playwright.BrowserContext
|
||||
opt BrowserOptions
|
||||
}
|
||||
|
||||
// initBrowser performs the shared browser initialization steps:
|
||||
// start Playwright, select browser type, connect or launch, create context, load cookies.
|
||||
func initBrowser(opt BrowserOptions) (*browserInitResult, error) {
|
||||
pw, err := playwright.Run()
|
||||
if err != nil {
|
||||
err = playwright.Install()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to install playwright: %w", err)
|
||||
}
|
||||
pw, err = playwright.Run()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to start playwright: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
switch opt.Browser {
|
||||
case BrowserChromium:
|
||||
bt = pw.Chromium
|
||||
if opt.ServerAddress == "" {
|
||||
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM")
|
||||
}
|
||||
case BrowserFirefox:
|
||||
bt = pw.Firefox
|
||||
if opt.ServerAddress == "" {
|
||||
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_FIREFOX")
|
||||
}
|
||||
case BrowserWebKit:
|
||||
bt = pw.WebKit
|
||||
if opt.ServerAddress == "" {
|
||||
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_WEBKIT")
|
||||
}
|
||||
default:
|
||||
return nil, ErrInvalidBrowserSelection
|
||||
}
|
||||
|
||||
var browser playwright.Browser
|
||||
launch := true
|
||||
|
||||
if opt.ServerAddress != "" && !opt.UseLocalOnly {
|
||||
launch = false
|
||||
slog.Info("connecting to playwright server", "address", opt.ServerAddress)
|
||||
var timeout float64 = 30000
|
||||
browser, err = bt.Connect(opt.ServerAddress, playwright.BrowserTypeConnectOptions{Timeout: &timeout})
|
||||
if err != nil {
|
||||
if opt.RequireServer {
|
||||
return nil, err
|
||||
}
|
||||
slog.Warn("failed to connect to playwright server, launching local browser", "err", err)
|
||||
launch = true
|
||||
}
|
||||
}
|
||||
|
||||
if launch {
|
||||
browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to launch browser: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
var viewport *playwright.Size
|
||||
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
||||
viewport = &playwright.Size{
|
||||
Width: opt.Dimensions.Width,
|
||||
Height: opt.Dimensions.Height,
|
||||
}
|
||||
}
|
||||
|
||||
var scheme *playwright.ColorScheme
|
||||
if opt.DarkMode {
|
||||
scheme = playwright.ColorSchemeDark
|
||||
} else {
|
||||
scheme = playwright.ColorSchemeNoPreference
|
||||
}
|
||||
|
||||
bctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||
UserAgent: playwright.String(opt.UserAgent),
|
||||
Viewport: viewport,
|
||||
ColorScheme: scheme,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create browser context: %w", err)
|
||||
}
|
||||
|
||||
if opt.CookieJar != nil {
|
||||
cookies, err := opt.CookieJar.GetAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
||||
}
|
||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||
for i, c := range cookies {
|
||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(c)
|
||||
}
|
||||
if err := bctx.AddCookies(pwCookies); err != nil {
|
||||
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return &browserInitResult{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
bctx: bctx,
|
||||
opt: opt,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// mergeOptions merges variadic BrowserOptions into a base set of defaults.
|
||||
func mergeOptions(base BrowserOptions, opts []BrowserOptions) BrowserOptions {
|
||||
for _, o := range opts {
|
||||
if o.UserAgent != "" {
|
||||
base.UserAgent = o.UserAgent
|
||||
}
|
||||
if o.Browser != "" {
|
||||
base.Browser = o.Browser
|
||||
}
|
||||
if o.Timeout != nil {
|
||||
base.Timeout = o.Timeout
|
||||
}
|
||||
if o.CookieJar != nil {
|
||||
base.CookieJar = o.CookieJar
|
||||
}
|
||||
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||
base.Dimensions = o.Dimensions
|
||||
}
|
||||
if o.DarkMode {
|
||||
base.DarkMode = true
|
||||
}
|
||||
if o.ServerAddress != "" {
|
||||
base.ServerAddress = o.ServerAddress
|
||||
}
|
||||
if o.RequireServer {
|
||||
base.RequireServer = true
|
||||
}
|
||||
if o.UseLocalOnly {
|
||||
base.UseLocalOnly = true
|
||||
}
|
||||
base.ShowBrowser = o.ShowBrowser
|
||||
}
|
||||
return base
|
||||
}
|
||||
11
close.go
Normal file
11
close.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package extractor
|
||||
|
||||
import "io"
|
||||
|
||||
// DeferClose closes cl and discards whatever error Close returns. A nil
// io.Closer is accepted and ignored, which makes the function convenient to
// use directly in a defer statement.
func DeferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
38
close_test.go
Normal file
38
close_test.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mockCloser is a test double with a Close method that records the call and
// returns a preconfigured error.
type mockCloser struct {
	closed bool  // set to true by Close
	err    error // returned by Close
}

// Close marks the receiver as closed and returns the configured error.
func (mc *mockCloser) Close() error {
	result := mc.err
	mc.closed = true
	return result
}
|
||||
|
||||
func TestDeferClose_Nil(t *testing.T) {
|
||||
// Should not panic on nil.
|
||||
DeferClose(nil)
|
||||
}
|
||||
|
||||
func TestDeferClose_Valid(t *testing.T) {
|
||||
m := &mockCloser{}
|
||||
DeferClose(m)
|
||||
if !m.closed {
|
||||
t.Error("DeferClose did not call Close()")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeferClose_ErrorIgnored(t *testing.T) {
|
||||
m := &mockCloser{err: errors.New("close error")}
|
||||
// Should not panic even when Close returns an error.
|
||||
DeferClose(m)
|
||||
if !m.closed {
|
||||
t.Error("DeferClose did not call Close()")
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,6 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
@@ -12,9 +11,6 @@ import (
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
)
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
_ = cl.Close()
|
||||
}
|
||||
func main() {
|
||||
cmd := &cli.Command{
|
||||
Name: "browser",
|
||||
@@ -31,7 +27,7 @@ func main() {
|
||||
return err
|
||||
}
|
||||
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
// now open the user specified url
|
||||
doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
|
||||
@@ -39,7 +35,7 @@ func main() {
|
||||
return err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
article, err := extractor.Readability(ctx, doc)
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ var Flags = BrowserFlags{
|
||||
}
|
||||
|
||||
func FromCommand(ctx context.Context, cmd *cli.Command) (extractor.Browser, error) {
|
||||
var opts extractor.PlayWrightBrowserOptions
|
||||
var opts extractor.BrowserOptions
|
||||
|
||||
if ua := cmd.String("user-agent"); ua != "" {
|
||||
opts.UserAgent = ua
|
||||
@@ -59,7 +59,7 @@ func FromCommand(ctx context.Context, cmd *cli.Command) (extractor.Browser, erro
|
||||
}
|
||||
|
||||
if b := cmd.String("browser"); b != "" {
|
||||
opts.Browser = extractor.PlayWrightBrowserSelection(b)
|
||||
opts.Browser = extractor.BrowserSelection(b)
|
||||
}
|
||||
|
||||
if cf := cmd.String("cookies-file"); cf != "" {
|
||||
@@ -72,5 +72,5 @@ func FromCommand(ctx context.Context, cmd *cli.Command) (extractor.Browser, erro
|
||||
|
||||
opts.ShowBrowser = cmd.Bool("visible")
|
||||
|
||||
return extractor.NewPlayWrightBrowser(ctx, opts)
|
||||
return extractor.NewBrowser(ctx, opts)
|
||||
}
|
||||
|
||||
@@ -25,7 +25,6 @@ type document struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
page playwright.Page
|
||||
locator playwright.Locator
|
||||
}
|
||||
|
||||
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
||||
|
||||
136
interactive.go
136
interactive.go
@@ -56,48 +56,17 @@ type interactiveBrowser struct {
|
||||
|
||||
// NewInteractiveBrowser creates a headless browser with a page ready for interactive control.
|
||||
// The context is only used for cancellation during setup.
|
||||
func NewInteractiveBrowser(ctx context.Context, opts ...PlayWrightBrowserOptions) (InteractiveBrowser, error) {
|
||||
func NewInteractiveBrowser(ctx context.Context, opts ...BrowserOptions) (InteractiveBrowser, error) {
|
||||
var thirtySeconds = 30 * time.Second
|
||||
opt := PlayWrightBrowserOptions{
|
||||
opt := mergeOptions(BrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
Browser: PlayWrightBrowserSelectionChromium,
|
||||
Browser: BrowserChromium,
|
||||
Timeout: &thirtySeconds,
|
||||
Dimensions: Size{
|
||||
Width: 1280,
|
||||
Height: 720,
|
||||
},
|
||||
}
|
||||
|
||||
for _, o := range opts {
|
||||
if o.UserAgent != "" {
|
||||
opt.UserAgent = o.UserAgent
|
||||
}
|
||||
if o.Browser != "" {
|
||||
opt.Browser = o.Browser
|
||||
}
|
||||
if o.Timeout != nil {
|
||||
opt.Timeout = o.Timeout
|
||||
}
|
||||
if o.CookieJar != nil {
|
||||
opt.CookieJar = o.CookieJar
|
||||
}
|
||||
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||
opt.Dimensions = o.Dimensions
|
||||
}
|
||||
if o.DarkMode {
|
||||
opt.DarkMode = true
|
||||
}
|
||||
if o.PlayWrightServerAddress != "" {
|
||||
opt.PlayWrightServerAddress = o.PlayWrightServerAddress
|
||||
}
|
||||
if o.DontLaunchOnConnectFailure {
|
||||
opt.DontLaunchOnConnectFailure = true
|
||||
}
|
||||
if o.UseLocalOnly {
|
||||
opt.UseLocalOnly = true
|
||||
}
|
||||
opt.ShowBrowser = o.ShowBrowser
|
||||
}
|
||||
}, opts)
|
||||
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
@@ -111,98 +80,13 @@ func NewInteractiveBrowser(ctx context.Context, opts ...PlayWrightBrowserOptions
|
||||
ch := make(chan result, 1)
|
||||
|
||||
go func() {
|
||||
pw, err := playwright.Run()
|
||||
res, err := initBrowser(opt)
|
||||
if err != nil {
|
||||
err = playwright.Install()
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("failed to install playwright: %w", err)}
|
||||
return
|
||||
}
|
||||
pw, err = playwright.Run()
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("failed to start playwright: %w", err)}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
switch opt.Browser {
|
||||
case PlayWrightBrowserSelectionChromium:
|
||||
bt = pw.Chromium
|
||||
case PlayWrightBrowserSelectionFirefox:
|
||||
bt = pw.Firefox
|
||||
case PlayWrightBrowserSelectionWebKit:
|
||||
bt = pw.WebKit
|
||||
default:
|
||||
ch <- result{nil, ErrInvalidBrowserSelection}
|
||||
ch <- result{nil, err}
|
||||
return
|
||||
}
|
||||
|
||||
var browser playwright.Browser
|
||||
var launch = true
|
||||
|
||||
if opt.PlayWrightServerAddress != "" && !opt.UseLocalOnly {
|
||||
launch = false
|
||||
var timeout float64 = 30000
|
||||
browser, err = bt.Connect(opt.PlayWrightServerAddress, playwright.BrowserTypeConnectOptions{Timeout: &timeout})
|
||||
if err != nil {
|
||||
if opt.DontLaunchOnConnectFailure {
|
||||
ch <- result{nil, err}
|
||||
return
|
||||
}
|
||||
launch = true
|
||||
}
|
||||
}
|
||||
|
||||
if launch {
|
||||
browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||
})
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("failed to launch browser: %w", err)}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
viewport := &playwright.Size{
|
||||
Width: opt.Dimensions.Width,
|
||||
Height: opt.Dimensions.Height,
|
||||
}
|
||||
|
||||
var scheme *playwright.ColorScheme
|
||||
if opt.DarkMode {
|
||||
scheme = playwright.ColorSchemeDark
|
||||
} else {
|
||||
scheme = playwright.ColorSchemeNoPreference
|
||||
}
|
||||
|
||||
bctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||
UserAgent: playwright.String(opt.UserAgent),
|
||||
Viewport: viewport,
|
||||
ColorScheme: scheme,
|
||||
})
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("failed to create browser context: %w", err)}
|
||||
return
|
||||
}
|
||||
|
||||
if opt.CookieJar != nil {
|
||||
cookies, err := opt.CookieJar.GetAll()
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)}
|
||||
return
|
||||
}
|
||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||
for i, c := range cookies {
|
||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(c)
|
||||
}
|
||||
if err := bctx.AddCookies(pwCookies); err != nil {
|
||||
ch <- result{nil, fmt.Errorf("error adding cookies: %w", err)}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
page, err := bctx.NewPage()
|
||||
page, err := res.bctx.NewPage()
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("failed to create page: %w", err)}
|
||||
return
|
||||
@@ -210,9 +94,9 @@ func NewInteractiveBrowser(ctx context.Context, opts ...PlayWrightBrowserOptions
|
||||
|
||||
ch <- result{
|
||||
ib: &interactiveBrowser{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
ctx: bctx,
|
||||
pw: res.pw,
|
||||
browser: res.browser,
|
||||
ctx: res.bctx,
|
||||
page: page,
|
||||
},
|
||||
}
|
||||
|
||||
16
mock_test.go
Normal file
16
mock_test.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package extractor
|
||||
|
||||
import "time"
|
||||
|
||||
// mockDocument implements the Document interface for testing without Playwright.
|
||||
type mockDocument struct {
|
||||
mockNode
|
||||
url string
|
||||
content string
|
||||
}
|
||||
|
||||
func (m mockDocument) URL() string { return m.url }
|
||||
func (m mockDocument) Refresh() error { return nil }
|
||||
func (m mockDocument) Content() (string, error) { return m.content, nil }
|
||||
func (m mockDocument) Close() error { return nil }
|
||||
func (m mockDocument) WaitForNetworkIdle(_ *time.Duration) error { return nil }
|
||||
23
node_test.go
Normal file
23
node_test.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package extractor
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestEscapeJavaScript(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"hello", "hello"},
|
||||
{"it's", "it\\'s"},
|
||||
{`back\slash`, `back\\slash`},
|
||||
{`both\'`, `both\\\'`},
|
||||
{"", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := escapeJavaScript(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("escapeJavaScript(%q) = %q, want %q", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
214
playwright.go
214
playwright.go
@@ -4,9 +4,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
@@ -24,7 +22,7 @@ type playWrightBrowser struct {
|
||||
|
||||
var _ Browser = playWrightBrowser{}
|
||||
|
||||
type PlayWrightBrowserSelection string
|
||||
type BrowserSelection string
|
||||
|
||||
var (
|
||||
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
|
||||
@@ -33,18 +31,18 @@ var (
|
||||
)
|
||||
|
||||
const (
|
||||
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
|
||||
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
|
||||
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
||||
BrowserChromium BrowserSelection = "chromium"
|
||||
BrowserFirefox BrowserSelection = "firefox"
|
||||
BrowserWebKit BrowserSelection = "webkit"
|
||||
)
|
||||
|
||||
type Size struct {
|
||||
Width int
|
||||
Height int
|
||||
}
|
||||
type PlayWrightBrowserOptions struct {
|
||||
type BrowserOptions struct {
|
||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
||||
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
||||
Browser BrowserSelection // If unset defaults to Firefox.
|
||||
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
||||
|
||||
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
||||
@@ -56,15 +54,15 @@ type PlayWrightBrowserOptions struct {
|
||||
Dimensions Size
|
||||
DarkMode bool
|
||||
|
||||
// PlayWrightServerAddress is the address of a PlayWright server to connect to.
|
||||
// ServerAddress is the address of a Playwright server to connect to.
|
||||
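// If empty, falls back to the browser-specific environment variable
// PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM, PLAYWRIGHT_SERVER_ADDRESS_FIREFOX or
// PLAYWRIGHT_SERVER_ADDRESS_WEBKIT, depending on the selected browser.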
|
||||
PlayWrightServerAddress string
|
||||
ServerAddress string
|
||||
|
||||
// DontLaunchOnConnectFailure will, if set, not launch the browser if the connection to the PlayWright server,
|
||||
// and return an error if the connection fails.
|
||||
DontLaunchOnConnectFailure bool
|
||||
// RequireServer will, if set, return an error if the connection to the
|
||||
// Playwright server fails instead of falling back to a local browser launch.
|
||||
RequireServer bool
|
||||
|
||||
// UseLocalOnly will, if set, not connect to the PlayWright server, and instead use the local PlayWright server.
|
||||
// UseLocalOnly will, if set, not connect to the Playwright server, and instead launch a local browser.
|
||||
UseLocalOnly bool
|
||||
}
|
||||
|
||||
@@ -90,48 +88,14 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
||||
}
|
||||
}
|
||||
|
||||
func NewPlayWrightBrowser(ctx context.Context, opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
func NewBrowser(ctx context.Context, opts ...BrowserOptions) (Browser, error) {
|
||||
var thirtySeconds = 30 * time.Second
|
||||
opt := PlayWrightBrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
Browser: PlayWrightBrowserSelectionFirefox,
|
||||
Timeout: &thirtySeconds,
|
||||
DarkMode: false,
|
||||
PlayWrightServerAddress: "",
|
||||
}
|
||||
opt := mergeOptions(BrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
Browser: BrowserFirefox,
|
||||
Timeout: &thirtySeconds,
|
||||
}, opts)
|
||||
|
||||
for _, o := range opts {
|
||||
if o.UserAgent != "" {
|
||||
opt.UserAgent = o.UserAgent
|
||||
}
|
||||
if o.Browser != "" {
|
||||
opt.Browser = o.Browser
|
||||
}
|
||||
if o.Timeout != nil {
|
||||
opt.Timeout = o.Timeout
|
||||
}
|
||||
if o.CookieJar != nil {
|
||||
opt.CookieJar = o.CookieJar
|
||||
}
|
||||
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||
opt.Dimensions = o.Dimensions
|
||||
}
|
||||
if o.DarkMode {
|
||||
opt.DarkMode = true
|
||||
}
|
||||
if o.PlayWrightServerAddress != "" {
|
||||
opt.PlayWrightServerAddress = o.PlayWrightServerAddress
|
||||
}
|
||||
if o.DontLaunchOnConnectFailure {
|
||||
opt.DontLaunchOnConnectFailure = true
|
||||
}
|
||||
if o.UseLocalOnly {
|
||||
opt.UseLocalOnly = true
|
||||
}
|
||||
opt.ShowBrowser = o.ShowBrowser
|
||||
}
|
||||
|
||||
// Check if context is already done
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -141,145 +105,28 @@ func NewPlayWrightBrowser(ctx context.Context, opts ...PlayWrightBrowserOptions)
|
||||
err error
|
||||
}
|
||||
|
||||
// Create a channel for the result
|
||||
resultCh := make(chan browserResult, 1)
|
||||
|
||||
// Launch browser initialization in a separate goroutine
|
||||
go func() {
|
||||
pw, err := playwright.Run()
|
||||
|
||||
if err != nil {
|
||||
err = playwright.Install()
|
||||
|
||||
if err != nil {
|
||||
resultCh <- browserResult{nil, err}
|
||||
return
|
||||
}
|
||||
|
||||
pw, err = playwright.Run()
|
||||
|
||||
if err != nil {
|
||||
resultCh <- browserResult{nil, err}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
|
||||
switch opt.Browser {
|
||||
case PlayWrightBrowserSelectionChromium:
|
||||
bt = pw.Chromium
|
||||
if opt.PlayWrightServerAddress == "" {
|
||||
opt.PlayWrightServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM")
|
||||
}
|
||||
|
||||
case PlayWrightBrowserSelectionFirefox:
|
||||
bt = pw.Firefox
|
||||
if opt.PlayWrightServerAddress == "" {
|
||||
opt.PlayWrightServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_FIREFOX")
|
||||
}
|
||||
|
||||
case PlayWrightBrowserSelectionWebKit:
|
||||
bt = pw.WebKit
|
||||
if opt.PlayWrightServerAddress == "" {
|
||||
opt.PlayWrightServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_WEBKIT")
|
||||
}
|
||||
|
||||
default:
|
||||
resultCh <- browserResult{nil, ErrInvalidBrowserSelection}
|
||||
return
|
||||
}
|
||||
var browser playwright.Browser
|
||||
|
||||
var launch = true
|
||||
if opt.PlayWrightServerAddress != "" && !opt.UseLocalOnly {
|
||||
launch = false
|
||||
slog.Info("connecting to playwright server", "address", opt.PlayWrightServerAddress)
|
||||
var timeout float64 = 30000
|
||||
browser, err = bt.Connect(opt.PlayWrightServerAddress, playwright.BrowserTypeConnectOptions{Timeout: &timeout})
|
||||
|
||||
if err != nil {
|
||||
if opt.DontLaunchOnConnectFailure {
|
||||
resultCh <- browserResult{nil, err}
|
||||
return
|
||||
}
|
||||
slog.Warn("failed to connect to playwright server, launching local browser", "err", err)
|
||||
launch = true
|
||||
}
|
||||
}
|
||||
|
||||
if launch {
|
||||
browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||
})
|
||||
if err != nil {
|
||||
resultCh <- browserResult{nil, err}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var viewport *playwright.Size
|
||||
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
||||
viewport = &playwright.Size{
|
||||
Width: opt.Dimensions.Width,
|
||||
Height: opt.Dimensions.Height,
|
||||
}
|
||||
}
|
||||
|
||||
var scheme *playwright.ColorScheme
|
||||
|
||||
if opt.DarkMode {
|
||||
scheme = playwright.ColorSchemeDark
|
||||
} else {
|
||||
scheme = playwright.ColorSchemeNoPreference
|
||||
}
|
||||
|
||||
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||
UserAgent: playwright.String(opt.UserAgent),
|
||||
Viewport: viewport,
|
||||
ColorScheme: scheme,
|
||||
})
|
||||
res, err := initBrowser(opt)
|
||||
if err != nil {
|
||||
resultCh <- browserResult{nil, err}
|
||||
return
|
||||
}
|
||||
|
||||
if opt.CookieJar != nil {
|
||||
cookies, err := opt.CookieJar.GetAll()
|
||||
if err != nil {
|
||||
resultCh <- browserResult{nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)}
|
||||
return
|
||||
}
|
||||
|
||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||
|
||||
for i, cookie := range cookies {
|
||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
|
||||
}
|
||||
|
||||
err = c.AddCookies(pwCookies)
|
||||
|
||||
if err != nil {
|
||||
resultCh <- browserResult{nil, fmt.Errorf("error adding cookies to browser: %w", err)}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
resultCh <- browserResult{
|
||||
browser: playWrightBrowser{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
userAgent: opt.UserAgent,
|
||||
timeout: *opt.Timeout,
|
||||
cookieJar: opt.CookieJar,
|
||||
ctx: c,
|
||||
serverAddr: opt.PlayWrightServerAddress,
|
||||
pw: res.pw,
|
||||
browser: res.browser,
|
||||
userAgent: res.opt.UserAgent,
|
||||
timeout: *res.opt.Timeout,
|
||||
cookieJar: res.opt.CookieJar,
|
||||
ctx: res.bctx,
|
||||
serverAddr: res.opt.ServerAddress,
|
||||
},
|
||||
err: nil,
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for either context cancellation or browser initialization completion
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
@@ -367,12 +214,9 @@ func (b playWrightBrowser) Close() error {
|
||||
)
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
_ = cl.Close()
|
||||
}
|
||||
|
||||
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
||||
browser, err := NewPlayWrightBrowser(ctx, PlayWrightBrowserOptions{
|
||||
browser, err := NewBrowser(ctx, BrowserOptions{
|
||||
Timeout: &timeout,
|
||||
})
|
||||
|
||||
@@ -380,14 +224,14 @@ func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]by
|
||||
return nil, fmt.Errorf("error creating browser: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(browser)
|
||||
defer DeferClose(browser)
|
||||
|
||||
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error opening page: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer DeferClose(doc)
|
||||
|
||||
return doc.Screenshot()
|
||||
}
|
||||
|
||||
72
readability_test.go
Normal file
72
readability_test.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReadability_ValidHTML(t *testing.T) {
|
||||
html := `<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test Article</title></head>
|
||||
<body>
|
||||
<article>
|
||||
<h1>Test Article</h1>
|
||||
<p>This is a test article with enough content to be parsed by readability.
|
||||
It needs to have a reasonable amount of text so the algorithm considers it
|
||||
a valid article. Let us add several sentences to make sure this works
|
||||
correctly. The readability library requires a minimum amount of content
|
||||
to successfully extract an article from a page.</p>
|
||||
<p>Here is another paragraph to add more content. We want to make sure
|
||||
that the content is substantial enough for the readability algorithm to
|
||||
consider this a valid article and extract the text properly.</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
doc := mockDocument{
|
||||
url: "https://example.com/article",
|
||||
content: html,
|
||||
}
|
||||
|
||||
article, err := Readability(context.Background(), doc)
|
||||
if err != nil {
|
||||
t.Fatalf("Readability() error = %v", err)
|
||||
}
|
||||
|
||||
if article.Title != "Test Article" {
|
||||
t.Errorf("Title = %q, want %q", article.Title, "Test Article")
|
||||
}
|
||||
|
||||
if article.TextContent == "" {
|
||||
t.Error("TextContent should not be empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadability_EmptyContent(t *testing.T) {
|
||||
doc := mockDocument{
|
||||
url: "https://example.com/empty",
|
||||
content: "",
|
||||
}
|
||||
|
||||
article, err := Readability(context.Background(), doc)
|
||||
if err != nil {
|
||||
t.Fatalf("Readability() unexpected error = %v", err)
|
||||
}
|
||||
// Empty content should produce an empty article.
|
||||
if article.Title != "" && article.TextContent != "" {
|
||||
t.Error("expected empty article from empty content")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadability_InvalidURL(t *testing.T) {
|
||||
doc := mockDocument{
|
||||
url: "://invalid",
|
||||
content: "<html><body><p>text</p></body></html>",
|
||||
}
|
||||
|
||||
_, err := Readability(context.Background(), doc)
|
||||
if err == nil {
|
||||
t.Error("Readability() expected error for invalid URL, got nil")
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -27,11 +26,6 @@ type Item struct {
|
||||
Price float64
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||
return DefaultConfig.GetItemFromURL(ctx, b, u)
|
||||
}
|
||||
@@ -57,7 +51,7 @@ func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.
|
||||
res.ID, _ = strconv.Atoi(a[3])
|
||||
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("failed to open page: %w", err)
|
||||
}
|
||||
|
||||
39
sites/aislegopher/aislegopher_test.go
Normal file
39
sites/aislegopher/aislegopher_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package aislegopher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetItemFromURL_InvalidHost(t *testing.T) {
|
||||
u, _ := url.Parse("https://example.com/p/slug/123")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemFromURL_InvalidPath_NoP(t *testing.T) {
|
||||
u, _ := url.Parse("https://aislegopher.com/x/slug/123")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemFromURL_InvalidPath_TooShort(t *testing.T) {
|
||||
u, _ := url.Parse("https://aislegopher.com/p/slug")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemFromURL_InvalidPath_TooLong(t *testing.T) {
|
||||
u, _ := url.Parse("https://aislegopher.com/p/slug/123/extra")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
@@ -3,10 +3,10 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"os"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
|
||||
"github.com/urfave/cli/v3"
|
||||
@@ -22,11 +22,6 @@ func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
|
||||
return res
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
flags = append(flags, browser.Flags...)
|
||||
@@ -44,7 +39,7 @@ func main() {
|
||||
return fmt.Errorf("failed to create browser: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
arg := c.Args().First()
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"strings"
|
||||
@@ -39,12 +38,6 @@ func (c Config) validate() Config {
|
||||
|
||||
var DefaultConfig = Config{}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
|
||||
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
c = c.validate()
|
||||
|
||||
37
sites/archive/archive_test.go
Normal file
37
sites/archive/archive_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package archive
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestConfig_Validate_Defaults(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
|
||||
if c.Endpoint != "https://archive.ph" {
|
||||
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
|
||||
}
|
||||
if c.Timeout == nil {
|
||||
t.Fatal("Timeout should not be nil after validate")
|
||||
}
|
||||
if *c.Timeout != 1*time.Hour {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate_Preserves(t *testing.T) {
|
||||
timeout := 5 * time.Minute
|
||||
c := Config{
|
||||
Endpoint: "https://archive.org",
|
||||
Timeout: &timeout,
|
||||
}
|
||||
c = c.validate()
|
||||
|
||||
if c.Endpoint != "https://archive.org" {
|
||||
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
|
||||
}
|
||||
if *c.Timeout != 5*time.Minute {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute)
|
||||
}
|
||||
}
|
||||
@@ -3,12 +3,13 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/urfave/cli/v3"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
||||
)
|
||||
@@ -49,12 +50,6 @@ func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) (duckduckgo.Config, error) {
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
|
||||
@@ -78,7 +73,7 @@ func main() {
|
||||
}
|
||||
|
||||
b, err := browser.FromCommand(ctx, command)
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create browser: %w", err)
|
||||
@@ -89,7 +84,7 @@ func main() {
|
||||
return fmt.Errorf("failed to open search: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(search)
|
||||
defer extractor.DeferClose(search)
|
||||
|
||||
res := search.GetResults()
|
||||
fmt.Println("Results:", res)
|
||||
|
||||
@@ -3,7 +3,6 @@ package duckduckgo
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
|
||||
@@ -71,12 +70,6 @@ type Result struct {
|
||||
Description string
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
|
||||
u := c.ToSearchURL(query)
|
||||
|
||||
@@ -97,7 +90,7 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
||||
|
||||
slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
|
||||
@@ -83,3 +83,34 @@ func TestConfig_ToSearchURL_NoRegion(t *testing.T) {
|
||||
t.Errorf("kl should be empty when no region, got %q", u.Query().Get("kl"))
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_Scheme(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Scheme != "https" {
|
||||
t.Errorf("Scheme = %q, want %q", u.Scheme, "https")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_SpecialChars(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("go lang & testing")
|
||||
|
||||
if u.Query().Get("q") != "go lang & testing" {
|
||||
t.Errorf("q = %q, want %q", u.Query().Get("q"), "go lang & testing")
|
||||
}
|
||||
}
|
||||
|
||||
func TestResult_ZeroValue(t *testing.T) {
|
||||
var r Result
|
||||
if r.URL != "" || r.Title != "" || r.Description != "" {
|
||||
t.Error("zero-value Result should have empty fields")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultConfig_SafeSearch(t *testing.T) {
|
||||
if DefaultConfig.SafeSearch != SafeSearchOff {
|
||||
t.Errorf("DefaultConfig.SafeSearch = %d, want %d", DefaultConfig.SafeSearch, SafeSearchOff)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,12 +3,12 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/google"
|
||||
)
|
||||
@@ -42,12 +42,6 @@ func (f GoogleFlags) ToConfig(_ context.Context, cmd *cli.Command) google.Config
|
||||
return c
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
|
||||
@@ -67,7 +61,7 @@ func main() {
|
||||
|
||||
b, err := browser.FromCommand(ctx, cli)
|
||||
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -3,7 +3,6 @@ package google
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
@@ -48,12 +47,6 @@ type Result struct {
|
||||
Description string
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
||||
c = c.validate()
|
||||
|
||||
@@ -99,7 +92,7 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
var res []Result
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ package megamillions
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -33,12 +32,6 @@ type NextDrawing struct {
|
||||
Jackpot currency.Amount
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
// netTicksToTime converts a .NET-style tick count (100-nanosecond intervals
// counted from 0001-01-01T00:00:00) into a time.Time.
//
// The Unix-epoch offset is subtracted while the value is still expressed in
// ticks, and the remainder is split into whole seconds and leftover
// nanoseconds. Multiplying the raw tick count by 100 first would overflow
// int64 for any present-day date, and the epoch offset is a tick count, not a
// nanosecond count.
func netTicksToTime(t int64) time.Time {
	const (
		ticksPerSecond = 10000000           // 1 tick = 100ns
		unixEpochTicks = 621355968000000000 // tick count at 1970-01-01T00:00:00Z
	)

	sinceEpoch := t - unixEpochTicks
	return time.Unix(sinceEpoch/ticksPerSecond, (sinceEpoch%ticksPerSecond)*100)
}
|
||||
@@ -218,7 +211,7 @@ func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing,
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
d, err := getDrawing(ctx, doc)
|
||||
|
||||
|
||||
@@ -41,3 +41,33 @@ func TestNetTicksToTime_DifferenceIsCorrect(t *testing.T) {
|
||||
t.Errorf("expected 1 second difference, got %v", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetTicksToTime_NotZero(t *testing.T) {
|
||||
// Verify the function produces a non-zero time for typical ticks values.
|
||||
ticks := int64(638396256000000000)
|
||||
result := netTicksToTime(ticks)
|
||||
|
||||
if result.IsZero() {
|
||||
t.Error("netTicksToTime should not return zero time for valid ticks")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
_ = c // validate is a no-op, just verify no panic
|
||||
}
|
||||
|
||||
func TestDrawing_ZeroValue(t *testing.T) {
|
||||
var d Drawing
|
||||
if d.MegaBall != 0 || d.Megaplier != 0 {
|
||||
t.Error("zero-value Drawing should have zero fields")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNextDrawing_ZeroValue(t *testing.T) {
|
||||
var nd NextDrawing
|
||||
if nd.Date != "" {
|
||||
t.Error("zero-value NextDrawing should have empty date")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package powerball
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -32,12 +31,6 @@ type NextDrawing struct {
|
||||
JackpotDollars int
|
||||
}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
||||
var drawing Drawing
|
||||
|
||||
@@ -196,7 +189,7 @@ func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing,
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
d, err := getDrawing(ctx, doc)
|
||||
|
||||
|
||||
34
sites/powerball/powerball_test.go
Normal file
34
sites/powerball/powerball_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package powerball
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestConfig_Validate(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
// validate is a no-op for powerball Config, just verify it doesn't panic.
|
||||
_ = c
|
||||
}
|
||||
|
||||
func TestDefaultConfig(t *testing.T) {
|
||||
c := DefaultConfig
|
||||
_ = c
|
||||
}
|
||||
|
||||
func TestDrawing_ZeroValue(t *testing.T) {
|
||||
var d Drawing
|
||||
if d.PowerBall != 0 || d.PowerPlay != 0 {
|
||||
t.Error("zero-value Drawing should have zero fields")
|
||||
}
|
||||
for i, n := range d.Numbers {
|
||||
if n != 0 {
|
||||
t.Errorf("Numbers[%d] = %d, want 0", i, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNextDrawing_ZeroValue(t *testing.T) {
|
||||
var nd NextDrawing
|
||||
if nd.Date != "" || nd.JackpotDollars != 0 {
|
||||
t.Error("zero-value NextDrawing should have empty/zero fields")
|
||||
}
|
||||
}
|
||||
@@ -4,8 +4,6 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
@@ -13,12 +11,6 @@ type Config struct{}
|
||||
|
||||
var DefaultConfig = Config{}
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
func GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.Browser) (string, error) {
|
||||
return DefaultConfig.GetMostCommonDesktopUserAgent(ctx, b)
|
||||
}
|
||||
@@ -30,7 +22,7 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
||||
return "", fmt.Errorf("failed to open useragents.me: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
s := doc.Select("#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea:nth-child(4)")
|
||||
|
||||
text := ""
|
||||
|
||||
9
sites/useragents/useragents_test.go
Normal file
9
sites/useragents/useragents_test.go
Normal file
@@ -0,0 +1,9 @@
|
||||
package useragents
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDefaultConfig(t *testing.T) {
|
||||
// DefaultConfig should be a zero-value Config.
|
||||
c := DefaultConfig
|
||||
_ = c // Just verify it exists and is usable.
|
||||
}
|
||||
@@ -3,10 +3,10 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"os"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
@@ -14,12 +14,6 @@ import (
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/wegmans"
|
||||
)
|
||||
|
||||
// deferClose closes cl when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
type WegmansFlags []cli.Flag
|
||||
|
||||
var Flags = WegmansFlags{}
|
||||
@@ -44,7 +38,7 @@ func main() {
|
||||
cfg := Flags.ToConfig(cmd)
|
||||
|
||||
b, err := browser.FromCommand(ctx, cmd)
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating browser: %w", err)
|
||||
|
||||
@@ -3,7 +3,6 @@ package wegmans
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"strconv"
|
||||
@@ -30,12 +29,6 @@ type Item struct {
|
||||
Unit string
|
||||
}
|
||||
|
||||
// deferClose closes c when it is non-nil and discards the error returned by
// Close. A nil closer is ignored.
func deferClose(c io.Closer) {
	if c == nil {
		return
	}
	_ = c.Close()
}
|
||||
|
||||
func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||
|
||||
if b == nil {
|
||||
@@ -68,7 +61,7 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
||||
}
|
||||
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
if err != nil {
|
||||
return Item{}, err
|
||||
|
||||
39
sites/wegmans/wegmans_test.go
Normal file
39
sites/wegmans/wegmans_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package wegmans
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetItemPrice_NilBrowser(t *testing.T) {
|
||||
u, _ := url.Parse("https://shop.wegmans.com/product/24921")
|
||||
_, err := DefaultConfig.GetItemPrice(context.Background(), nil, u)
|
||||
if err != ErrNilBrowser {
|
||||
t.Errorf("expected ErrNilBrowser, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemPrice_NilURL(t *testing.T) {
|
||||
// NilBrowser check comes before NilURL, so we can't test NilURL
|
||||
// independently without a real browser. Verify the error sentinel exists.
|
||||
if ErrNilURL.Error() != "url is nil" {
|
||||
t.Errorf("ErrNilURL = %q, want %q", ErrNilURL.Error(), "url is nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemPrice_ErrorSentinels(t *testing.T) {
|
||||
if ErrInvalidURL.Error() != "invalid url" {
|
||||
t.Errorf("ErrInvalidURL = %q, want %q", ErrInvalidURL.Error(), "invalid url")
|
||||
}
|
||||
if ErrNilBrowser.Error() != "browser is nil" {
|
||||
t.Errorf("ErrNilBrowser = %q, want %q", ErrNilBrowser.Error(), "browser is nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestItem_ZeroValue(t *testing.T) {
|
||||
var item Item
|
||||
if item.ID != 0 || item.Name != "" || item.Price != 0 || item.UnitPrice != 0 || item.Unit != "" {
|
||||
t.Error("zero-value Item should have empty/zero fields")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user