Compare commits
23 Commits
203b97d957
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 05ca15b165 | |||
| 294097c3b6 | |||
| 022e002f98 | |||
| 51ce639994 | |||
| cb2ed10cfd | |||
| e7b7e78796 | |||
| e807dbb2ff | |||
| 52a9cb585d | |||
| 868acfae40 | |||
| 82fce5a200 | |||
| 5fe7313fa4 | |||
| 39c2c7d37a | |||
| e32a6fa791 | |||
| afa0238758 | |||
| 9ae8619f93 | |||
| f4caef22b0 | |||
| 9947cae947 | |||
| dc43d1626a | |||
| 2d60940001 | |||
| d0fffb0411 | |||
| 8b4e43c40f | |||
| 6f4ca22b6a | |||
| 8aee8f0502 |
35
.gitea/workflows/ci.yml
Normal file
35
.gitea/workflows/ci.yml
Normal file
@@ -0,0 +1,35 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: https://gitea.com/actions/checkout@v4
|
||||
- uses: https://gitea.com/actions/setup-go@v3
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
- run: go build ./...
|
||||
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: https://gitea.com/actions/checkout@v4
|
||||
- uses: https://gitea.com/actions/setup-go@v3
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
- run: go test ./...
|
||||
|
||||
vet:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: https://gitea.com/actions/checkout@v4
|
||||
- uses: https://gitea.com/actions/setup-go@v3
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
- run: go vet ./...
|
||||
37
MIGRATION.md
Normal file
37
MIGRATION.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Migration Guide
|
||||
|
||||
This guide documents all breaking API changes from the restructuring of go-extractor.
|
||||
|
||||
All core interfaces (`Browser`, `Document`, `Node`, `CookieJar`, `InteractiveBrowser`) are **unchanged**.
|
||||
|
||||
## Type and Function Renames
|
||||
|
||||
```
|
||||
extractor.NewPlayWrightBrowser(opts) -> extractor.NewBrowser(ctx, opts)   (now takes a context.Context first)
|
||||
extractor.PlayWrightBrowserOptions -> extractor.BrowserOptions
|
||||
extractor.PlayWrightBrowserSelection -> extractor.BrowserSelection
|
||||
|
||||
extractor.PlayWrightBrowserSelectionChromium -> extractor.BrowserChromium
|
||||
extractor.PlayWrightBrowserSelectionFirefox -> extractor.BrowserFirefox
|
||||
extractor.PlayWrightBrowserSelectionWebKit -> extractor.BrowserWebKit
|
||||
```
|
||||
|
||||
## Field Renames (inside BrowserOptions)
|
||||
|
||||
```
|
||||
.PlayWrightServerAddress -> .ServerAddress
|
||||
.DontLaunchOnConnectFailure -> .RequireServer
|
||||
```
|
||||
|
||||
The `RequireServer` field is semantically identical to `DontLaunchOnConnectFailure`:
|
||||
|
||||
- Old: `DontLaunchOnConnectFailure: true` meant "fail if can't connect to server"
|
||||
- New: `RequireServer: true` means the same thing
|
||||
|
||||
## New Helper
|
||||
|
||||
```go
|
||||
extractor.DeferClose(closer)
|
||||
```
|
||||
|
||||
Nil-safe defer close helper. Replaces the `deferClose` functions that were previously copy-pasted across packages.
|
||||
29
article_test.go
Normal file
29
article_test.go
Normal file
@@ -0,0 +1,29 @@
|
||||
package extractor
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestArticle_ZeroValue(t *testing.T) {
|
||||
var a Article
|
||||
if a.Title != "" || a.Content != "" || a.Length != 0 {
|
||||
t.Error("zero-value Article should have empty fields")
|
||||
}
|
||||
}
|
||||
|
||||
func TestArticle_FieldAssignment(t *testing.T) {
|
||||
a := Article{
|
||||
Title: "Test Title",
|
||||
Content: "<p>hello</p>",
|
||||
TextContent: "hello",
|
||||
Length: 5,
|
||||
Excerpt: "hello",
|
||||
Byline: "Author",
|
||||
SiteName: "Example",
|
||||
Lang: "en",
|
||||
}
|
||||
if a.Title != "Test Title" {
|
||||
t.Errorf("Title = %q, want %q", a.Title, "Test Title")
|
||||
}
|
||||
if a.Length != 5 {
|
||||
t.Errorf("Length = %d, want 5", a.Length)
|
||||
}
|
||||
}
|
||||
160
browser_init.go
Normal file
160
browser_init.go
Normal file
@@ -0,0 +1,160 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
// browserInitResult holds the result of shared browser initialization.
|
||||
type browserInitResult struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
bctx playwright.BrowserContext
|
||||
opt BrowserOptions
|
||||
}
|
||||
|
||||
// initBrowser performs the shared browser initialization steps:
|
||||
// start Playwright, select browser type, connect or launch, create context, load cookies.
|
||||
func initBrowser(opt BrowserOptions) (*browserInitResult, error) {
|
||||
pw, err := playwright.Run()
|
||||
if err != nil {
|
||||
err = playwright.Install()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to install playwright: %w", err)
|
||||
}
|
||||
pw, err = playwright.Run()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to start playwright: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
switch opt.Browser {
|
||||
case BrowserChromium:
|
||||
bt = pw.Chromium
|
||||
if opt.ServerAddress == "" {
|
||||
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM")
|
||||
}
|
||||
case BrowserFirefox:
|
||||
bt = pw.Firefox
|
||||
if opt.ServerAddress == "" {
|
||||
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_FIREFOX")
|
||||
}
|
||||
case BrowserWebKit:
|
||||
bt = pw.WebKit
|
||||
if opt.ServerAddress == "" {
|
||||
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_WEBKIT")
|
||||
}
|
||||
default:
|
||||
return nil, ErrInvalidBrowserSelection
|
||||
}
|
||||
|
||||
var browser playwright.Browser
|
||||
launch := true
|
||||
|
||||
if opt.ServerAddress != "" && !opt.UseLocalOnly {
|
||||
launch = false
|
||||
slog.Info("connecting to playwright server", "address", opt.ServerAddress)
|
||||
var timeout float64 = 30000
|
||||
browser, err = bt.Connect(opt.ServerAddress, playwright.BrowserTypeConnectOptions{Timeout: &timeout})
|
||||
if err != nil {
|
||||
if opt.RequireServer {
|
||||
return nil, err
|
||||
}
|
||||
slog.Warn("failed to connect to playwright server, launching local browser", "err", err)
|
||||
launch = true
|
||||
}
|
||||
}
|
||||
|
||||
if launch {
|
||||
browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to launch browser: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
var viewport *playwright.Size
|
||||
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
||||
viewport = &playwright.Size{
|
||||
Width: opt.Dimensions.Width,
|
||||
Height: opt.Dimensions.Height,
|
||||
}
|
||||
}
|
||||
|
||||
var scheme *playwright.ColorScheme
|
||||
if opt.DarkMode {
|
||||
scheme = playwright.ColorSchemeDark
|
||||
} else {
|
||||
scheme = playwright.ColorSchemeNoPreference
|
||||
}
|
||||
|
||||
bctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||
UserAgent: playwright.String(opt.UserAgent),
|
||||
Viewport: viewport,
|
||||
ColorScheme: scheme,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create browser context: %w", err)
|
||||
}
|
||||
|
||||
if opt.CookieJar != nil {
|
||||
cookies, err := opt.CookieJar.GetAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
||||
}
|
||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||
for i, c := range cookies {
|
||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(c)
|
||||
}
|
||||
if err := bctx.AddCookies(pwCookies); err != nil {
|
||||
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return &browserInitResult{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
bctx: bctx,
|
||||
opt: opt,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// mergeOptions merges variadic BrowserOptions into a base set of defaults.
|
||||
func mergeOptions(base BrowserOptions, opts []BrowserOptions) BrowserOptions {
|
||||
for _, o := range opts {
|
||||
if o.UserAgent != "" {
|
||||
base.UserAgent = o.UserAgent
|
||||
}
|
||||
if o.Browser != "" {
|
||||
base.Browser = o.Browser
|
||||
}
|
||||
if o.Timeout != nil {
|
||||
base.Timeout = o.Timeout
|
||||
}
|
||||
if o.CookieJar != nil {
|
||||
base.CookieJar = o.CookieJar
|
||||
}
|
||||
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||
base.Dimensions = o.Dimensions
|
||||
}
|
||||
if o.DarkMode {
|
||||
base.DarkMode = true
|
||||
}
|
||||
if o.ServerAddress != "" {
|
||||
base.ServerAddress = o.ServerAddress
|
||||
}
|
||||
if o.RequireServer {
|
||||
base.RequireServer = true
|
||||
}
|
||||
if o.UseLocalOnly {
|
||||
base.UseLocalOnly = true
|
||||
}
|
||||
base.ShowBrowser = o.ShowBrowser
|
||||
}
|
||||
return base
|
||||
}
|
||||
11
close.go
Normal file
11
close.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package extractor
|
||||
|
||||
import "io"
|
||||
|
||||
// DeferClose closes cl and discards whatever error Close returns. A nil
// io.Closer is accepted and results in no call at all, which makes the function
// convenient to use directly in a defer statement.
func DeferClose(cl io.Closer) {
	if cl == nil {
		return
	}

	// The error is dropped on purpose: a deferred close has nowhere to report it.
	_ = cl.Close()
}
|
||||
38
close_test.go
Normal file
38
close_test.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mockCloser is a test double for io.Closer. It records whether Close was
// invoked and returns a configurable error.
type mockCloser struct {
	// closed is set to true by Close.
	closed bool
	// err is the value Close returns.
	err error
}

// Close marks the mock as closed and reports the configured error.
func (mc *mockCloser) Close() error {
	mc.closed = true
	return mc.err
}
|
||||
|
||||
func TestDeferClose_Nil(t *testing.T) {
|
||||
// Should not panic on nil.
|
||||
DeferClose(nil)
|
||||
}
|
||||
|
||||
func TestDeferClose_Valid(t *testing.T) {
|
||||
m := &mockCloser{}
|
||||
DeferClose(m)
|
||||
if !m.closed {
|
||||
t.Error("DeferClose did not call Close()")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeferClose_ErrorIgnored(t *testing.T) {
|
||||
m := &mockCloser{err: errors.New("close error")}
|
||||
// Should not panic even when Close returns an error.
|
||||
DeferClose(m)
|
||||
if !m.closed {
|
||||
t.Error("DeferClose did not call Close()")
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,6 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
@@ -12,9 +11,6 @@ import (
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
)
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
_ = cl.Close()
|
||||
}
|
||||
func main() {
|
||||
cmd := &cli.Command{
|
||||
Name: "browser",
|
||||
@@ -31,7 +27,7 @@ func main() {
|
||||
return err
|
||||
}
|
||||
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
// now open the user specified url
|
||||
doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
|
||||
@@ -39,7 +35,7 @@ func main() {
|
||||
return err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
article, err := extractor.Readability(ctx, doc)
|
||||
|
||||
@@ -74,6 +70,7 @@ func main() {
|
||||
err := cmd.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,8 +43,8 @@ var Flags = BrowserFlags{
|
||||
},
|
||||
}
|
||||
|
||||
func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error) {
|
||||
var opts extractor.PlayWrightBrowserOptions
|
||||
func FromCommand(ctx context.Context, cmd *cli.Command) (extractor.Browser, error) {
|
||||
var opts extractor.BrowserOptions
|
||||
|
||||
if ua := cmd.String("user-agent"); ua != "" {
|
||||
opts.UserAgent = ua
|
||||
@@ -59,7 +59,7 @@ func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error)
|
||||
}
|
||||
|
||||
if b := cmd.String("browser"); b != "" {
|
||||
opts.Browser = extractor.PlayWrightBrowserSelection(b)
|
||||
opts.Browser = extractor.BrowserSelection(b)
|
||||
}
|
||||
|
||||
if cf := cmd.String("cookies-file"); cf != "" {
|
||||
@@ -72,5 +72,5 @@ func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error)
|
||||
|
||||
opts.ShowBrowser = cmd.Bool("visible")
|
||||
|
||||
return extractor.NewPlayWrightBrowser(opts)
|
||||
return extractor.NewBrowser(ctx, opts)
|
||||
}
|
||||
|
||||
38
cookiejar.go
38
cookiejar.go
@@ -25,26 +25,28 @@ func (c Cookie) IsTargetMatch(target string) (bool, error) {
|
||||
// the host of the cookie is the same as the host of the target
|
||||
// if the cookie host starts with a dot, that means it matches any subdomain
|
||||
if c.Host == u.Host || strings.HasPrefix(c.Host, ".") && strings.HasSuffix(u.Host, c.Host) {
|
||||
if c.Path != "" {
|
||||
if !strings.HasPrefix(u.Path, c.Path) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// if the cookie path is a prefix of the target path, then it's a match
|
||||
// so now these would both match:
|
||||
// cookie path: /foo
|
||||
// target path: /foo/bar
|
||||
// cookie path: /foo
|
||||
// target path: /foosball
|
||||
// because foseball is not an actual match, we need to check to see that either the path is an exact match
|
||||
// or that the next character in the target path is a slash
|
||||
|
||||
if len(u.Path) > len(c.Path) && u.Path[len(c.Path)] != '/' {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
if c.Path == "" {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(u.Path, c.Path) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// if the cookie path is a prefix of the target path, then it's a match
|
||||
// so now these would both match:
|
||||
// cookie path: /foo
|
||||
// target path: /foo/bar
|
||||
// cookie path: /foo
|
||||
// target path: /foosball
|
||||
// because foosball is not an actual match, we need to check to see that either the path is an exact match
|
||||
// or that the next character in the target path is a slash
|
||||
|
||||
if len(u.Path) > len(c.Path) && !strings.HasSuffix(c.Path, "/") && u.Path[len(c.Path)] != '/' {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
return false, nil
|
||||
|
||||
266
cookiejar_test.go
Normal file
266
cookiejar_test.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCookie_IsTargetMatch_ExactHost(t *testing.T) {
|
||||
c := Cookie{Host: "example.com", Path: "/"}
|
||||
match, err := c.IsTargetMatch("https://example.com/page")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !match {
|
||||
t.Error("expected match for exact host")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_DotPrefix(t *testing.T) {
|
||||
c := Cookie{Host: ".example.com", Path: "/"}
|
||||
match, err := c.IsTargetMatch("https://sub.example.com/page")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !match {
|
||||
t.Error("expected match for .example.com against sub.example.com")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_DotPrefix_NoFalsePositive(t *testing.T) {
|
||||
c := Cookie{Host: ".example.com", Path: "/"}
|
||||
match, err := c.IsTargetMatch("https://notexample.com/page")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if match {
|
||||
t.Error("did not expect .example.com to match notexample.com")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_PathExact(t *testing.T) {
|
||||
c := Cookie{Host: "example.com", Path: "/foo"}
|
||||
match, err := c.IsTargetMatch("https://example.com/foo")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !match {
|
||||
t.Error("expected match for exact path /foo")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_PathPrefix(t *testing.T) {
|
||||
c := Cookie{Host: "example.com", Path: "/foo"}
|
||||
match, err := c.IsTargetMatch("https://example.com/foo/bar")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !match {
|
||||
t.Error("expected match for /foo prefix with /foo/bar")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_PathBoundary(t *testing.T) {
|
||||
c := Cookie{Host: "example.com", Path: "/foo"}
|
||||
match, err := c.IsTargetMatch("https://example.com/foosball")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if match {
|
||||
t.Error("did not expect /foo to match /foosball")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_EmptyPath(t *testing.T) {
|
||||
c := Cookie{Host: "example.com", Path: ""}
|
||||
match, err := c.IsTargetMatch("https://example.com/anything")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !match {
|
||||
t.Error("expected empty path cookie to match any path")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_NoMatch(t *testing.T) {
|
||||
c := Cookie{Host: "other.com", Path: "/"}
|
||||
match, err := c.IsTargetMatch("https://example.com/page")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if match {
|
||||
t.Error("did not expect other.com to match example.com")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCookie_IsTargetMatch_InvalidURL(t *testing.T) {
|
||||
c := Cookie{Host: "example.com", Path: "/"}
|
||||
_, err := c.IsTargetMatch("://invalid")
|
||||
if err == nil {
|
||||
t.Error("expected error for invalid URL")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaticCookieJar_GetAll(t *testing.T) {
|
||||
jar := &staticCookieJar{
|
||||
Cookie{Host: "a.com", Name: "a", Value: "1"},
|
||||
Cookie{Host: "b.com", Name: "b", Value: "2"},
|
||||
}
|
||||
|
||||
cookies, err := jar.GetAll()
|
||||
if err != nil {
|
||||
t.Fatalf("GetAll() error: %v", err)
|
||||
}
|
||||
if len(cookies) != 2 {
|
||||
t.Errorf("GetAll() returned %d cookies, want 2", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaticCookieJar_Get(t *testing.T) {
|
||||
jar := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
Cookie{Host: "other.com", Path: "/", Name: "b", Value: "2"},
|
||||
}
|
||||
|
||||
cookies, err := jar.Get("https://example.com/page")
|
||||
if err != nil {
|
||||
t.Fatalf("Get() error: %v", err)
|
||||
}
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("Get() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
if cookies[0].Name != "a" {
|
||||
t.Errorf("Get() cookie name = %q, want %q", cookies[0].Name, "a")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaticCookieJar_Set_New(t *testing.T) {
|
||||
jar := &staticCookieJar{}
|
||||
err := jar.Set(Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"})
|
||||
if err != nil {
|
||||
t.Fatalf("Set() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("after Set, GetAll() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
if cookies[0].Value != "1" {
|
||||
t.Errorf("cookie value = %q, want %q", cookies[0].Value, "1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaticCookieJar_Set_Update(t *testing.T) {
|
||||
jar := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
}
|
||||
err := jar.Set(Cookie{Host: "example.com", Path: "/", Name: "a", Value: "2"})
|
||||
if err != nil {
|
||||
t.Fatalf("Set() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("after update Set, GetAll() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
if cookies[0].Value != "2" {
|
||||
t.Errorf("cookie value = %q, want %q", cookies[0].Value, "2")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaticCookieJar_Delete(t *testing.T) {
|
||||
jar := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
Cookie{Host: "other.com", Path: "/", Name: "b", Value: "2"},
|
||||
}
|
||||
err := jar.Delete(Cookie{Host: "example.com", Path: "/", Name: "a"})
|
||||
if err != nil {
|
||||
t.Fatalf("Delete() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("after Delete, GetAll() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
if cookies[0].Name != "b" {
|
||||
t.Errorf("remaining cookie name = %q, want %q", cookies[0].Name, "b")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaticCookieJar_Delete_NotFound(t *testing.T) {
|
||||
jar := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
}
|
||||
err := jar.Delete(Cookie{Host: "nonexistent.com", Path: "/", Name: "x"})
|
||||
if err != nil {
|
||||
t.Fatalf("Delete() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("after no-op Delete, GetAll() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadOnlyCookieJar_SetIsNoop(t *testing.T) {
|
||||
inner := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
}
|
||||
ro := ReadOnlyCookieJar{Jar: inner}
|
||||
|
||||
err := ro.Set(Cookie{Host: "example.com", Path: "/", Name: "new", Value: "val"})
|
||||
if err != nil {
|
||||
t.Fatalf("Set() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := inner.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("ReadOnlyCookieJar.Set should be noop, but inner jar has %d cookies", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadOnlyCookieJar_DeleteIsNoop(t *testing.T) {
|
||||
inner := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
}
|
||||
ro := ReadOnlyCookieJar{Jar: inner}
|
||||
|
||||
err := ro.Delete(Cookie{Host: "example.com", Path: "/", Name: "a"})
|
||||
if err != nil {
|
||||
t.Fatalf("Delete() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := inner.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("ReadOnlyCookieJar.Delete should be noop, but inner jar has %d cookies", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadOnlyCookieJar_GetAll(t *testing.T) {
|
||||
inner := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
}
|
||||
ro := ReadOnlyCookieJar{Jar: inner}
|
||||
|
||||
cookies, err := ro.GetAll()
|
||||
if err != nil {
|
||||
t.Fatalf("GetAll() error: %v", err)
|
||||
}
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("ReadOnlyCookieJar.GetAll() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadOnlyCookieJar_Get(t *testing.T) {
|
||||
inner := &staticCookieJar{
|
||||
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||
}
|
||||
ro := ReadOnlyCookieJar{Jar: inner}
|
||||
|
||||
cookies, err := ro.Get("https://example.com/page")
|
||||
if err != nil {
|
||||
t.Fatalf("Get() error: %v", err)
|
||||
}
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("ReadOnlyCookieJar.Get() returned %d cookies, want 1", len(cookies))
|
||||
}
|
||||
}
|
||||
189
cookies_txt_test.go
Normal file
189
cookies_txt_test.go
Normal file
@@ -0,0 +1,189 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// writeTempCookieFile writes content to a file named "cookies.txt" inside a
// fresh per-test temporary directory and returns the file's path. The test is
// failed immediately if the file cannot be written.
func writeTempCookieFile(t *testing.T, content string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), "cookies.txt")

	err := os.WriteFile(target, []byte(content), 0644)
	if err != nil {
		t.Fatalf("failed to write temp cookie file: %v", err)
	}

	return target
}
|
||||
|
||||
func TestLoadCookiesFile_Valid(t *testing.T) {
|
||||
content := ".example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("expected 1 cookie, got %d", len(cookies))
|
||||
}
|
||||
|
||||
c := cookies[0]
|
||||
if c.Host != ".example.com" {
|
||||
t.Errorf("Host = %q, want %q", c.Host, ".example.com")
|
||||
}
|
||||
if !c.HttpOnly {
|
||||
t.Error("HttpOnly = false, want true")
|
||||
}
|
||||
if c.Path != "/" {
|
||||
t.Errorf("Path = %q, want %q", c.Path, "/")
|
||||
}
|
||||
if c.Secure {
|
||||
t.Error("Secure = true, want false")
|
||||
}
|
||||
if c.Name != "session" {
|
||||
t.Errorf("Name = %q, want %q", c.Name, "session")
|
||||
}
|
||||
if c.Value != "abc123" {
|
||||
t.Errorf("Value = %q, want %q", c.Value, "abc123")
|
||||
}
|
||||
if c.Expires.Unix() != 1700000000 {
|
||||
t.Errorf("Expires = %d, want 1700000000", c.Expires.Unix())
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_Comments(t *testing.T) {
|
||||
content := "# This is a comment\n.example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("expected 1 cookie (comment skipped), got %d", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_EmptyLines(t *testing.T) {
|
||||
content := "\n\n.example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("expected 1 cookie (empty lines skipped), got %d", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_ShortLines(t *testing.T) {
|
||||
content := "too\tfew\tfields\n.example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Errorf("expected 1 cookie (short line skipped), got %d", len(cookies))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_InvalidExpiry(t *testing.T) {
|
||||
content := ".example.com\tTRUE\t/\tFALSE\tnotanumber\tsession\tabc123\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 1 {
|
||||
t.Fatalf("expected 1 cookie, got %d", len(cookies))
|
||||
}
|
||||
|
||||
// Should have a default expiry ~180 days from now
|
||||
now := time.Now()
|
||||
expected := now.Add(180 * 24 * time.Hour)
|
||||
diff := cookies[0].Expires.Sub(expected)
|
||||
if diff < -time.Minute || diff > time.Minute {
|
||||
t.Errorf("invalid expiry default: got %v, expected ~%v", cookies[0].Expires, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_HttpOnly(t *testing.T) {
|
||||
content := ".example.com\tTRUE\t/\tFALSE\t1700000000\ta\t1\n.other.com\tFALSE\t/\tFALSE\t1700000000\tb\t2\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 2 {
|
||||
t.Fatalf("expected 2 cookies, got %d", len(cookies))
|
||||
}
|
||||
|
||||
if !cookies[0].HttpOnly {
|
||||
t.Error("first cookie HttpOnly = false, want true")
|
||||
}
|
||||
if cookies[1].HttpOnly {
|
||||
t.Error("second cookie HttpOnly = true, want false")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_Secure(t *testing.T) {
|
||||
content := ".example.com\tFALSE\t/\tTRUE\t1700000000\ta\t1\n.other.com\tFALSE\t/\tFALSE\t1700000000\tb\t2\n"
|
||||
path := writeTempCookieFile(t, content)
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 2 {
|
||||
t.Fatalf("expected 2 cookies, got %d", len(cookies))
|
||||
}
|
||||
|
||||
if !cookies[0].Secure {
|
||||
t.Error("first cookie Secure = false, want true")
|
||||
}
|
||||
if cookies[1].Secure {
|
||||
t.Error("second cookie Secure = true, want false")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_NonexistentFile(t *testing.T) {
|
||||
_, err := LoadCookiesFile("/nonexistent/path/cookies.txt")
|
||||
if err == nil {
|
||||
t.Error("expected error for nonexistent file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadCookiesFile_Empty(t *testing.T) {
|
||||
path := writeTempCookieFile(t, "")
|
||||
|
||||
jar, err := LoadCookiesFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||
}
|
||||
|
||||
cookies, _ := jar.GetAll()
|
||||
if len(cookies) != 0 {
|
||||
t.Errorf("expected 0 cookies from empty file, got %d", len(cookies))
|
||||
}
|
||||
}
|
||||
28
document.go
28
document.go
@@ -25,30 +25,21 @@ type document struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
page playwright.Page
|
||||
root playwright.ElementHandle
|
||||
locator playwright.Locator
|
||||
}
|
||||
|
||||
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
||||
root, err := page.QuerySelector("html")
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
root2 := page.Locator("html")
|
||||
locator := page.Locator("html")
|
||||
|
||||
res := &document{
|
||||
node: node{
|
||||
locator: root2,
|
||||
locator: locator,
|
||||
},
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
page: page,
|
||||
root: root,
|
||||
}
|
||||
|
||||
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
|
||||
slog.Info("new document", "url", page.URL(), "locator", locator)
|
||||
|
||||
return res, nil
|
||||
}
|
||||
@@ -78,21 +69,14 @@ func (d *document) Refresh() error {
|
||||
}
|
||||
|
||||
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
||||
|
||||
var f *float64 = nil
|
||||
if timeout == nil {
|
||||
t := 30 * time.Second
|
||||
timeout = &t
|
||||
}
|
||||
|
||||
if timeout != nil {
|
||||
ms := float64(timeout.Milliseconds())
|
||||
f = &ms
|
||||
}
|
||||
|
||||
err := d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||
ms := float64(timeout.Milliseconds())
|
||||
return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||
State: playwright.LoadStateNetworkidle,
|
||||
Timeout: f,
|
||||
Timeout: &ms,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
10
go.mod
10
go.mod
@@ -1,12 +1,14 @@
|
||||
module gitea.stevedudenhoeffer.com/steve/go-extractor
|
||||
|
||||
go 1.23.2
|
||||
go 1.24.0
|
||||
|
||||
toolchain go1.24.1
|
||||
|
||||
require (
|
||||
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
|
||||
github.com/playwright-community/playwright-go v0.5001.0
|
||||
github.com/playwright-community/playwright-go v0.5200.0
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1
|
||||
golang.org/x/text v0.23.0
|
||||
golang.org/x/text v0.29.0
|
||||
)
|
||||
|
||||
require (
|
||||
@@ -17,5 +19,5 @@ require (
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||
github.com/go-stack/stack v1.8.1 // indirect
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||
golang.org/x/net v0.37.0 // indirect
|
||||
golang.org/x/net v0.44.0 // indirect
|
||||
)
|
||||
|
||||
110
go.sum
Normal file
110
go.sum
Normal file
@@ -0,0 +1,110 @@
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/deckarep/golang-set/v2 v2.8.0 h1:swm0rlPCmdWn9mESxKOjWk8hXSqoxOp+ZlfuyaAdFlQ=
|
||||
github.com/deckarep/golang-set/v2 v2.8.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
|
||||
github.com/go-jose/go-jose/v3 v3.0.4 h1:Wp5HA7bLQcKnf6YYao/4kpRpVMp/yf6+pJKV8WFSaNY=
|
||||
github.com/go-jose/go-jose/v3 v3.0.4/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ=
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
|
||||
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 h1:BYLNYdZaepitbZreRIa9xeCQZocWmy/wj4cGIH0qyw0=
|
||||
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612/go.mod h1:wgqthQa8SAYs0yyljVeCOQlZ027VW5CmLsbi9jWC08c=
|
||||
github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw=
|
||||
github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4=
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
|
||||
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
|
||||
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
|
||||
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
|
||||
github.com/playwright-community/playwright-go v0.5200.0 h1:z/5LGuX2tBrg3ug1HupMXLjIG93f1d2MWdDsNhkMQ9c=
|
||||
github.com/playwright-community/playwright-go v0.5200.0/go.mod h1:UnnyQZaqUOO5ywAZu60+N4EiWReUqX1MQBBA3Oofvf8=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
|
||||
github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0=
|
||||
github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1 h1:6DTaaUarcM0wX7qj5Hcvs+5Dm3dyUTBbEwIWAjcw9Zg=
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1/go.mod h1:FnIeEMYu+ko8zP1F9Ypr3xkZMIDqW3DR92yUtY39q1Y=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
|
||||
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I=
|
||||
golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
|
||||
golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
|
||||
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
222
interactive.go
Normal file
222
interactive.go
Normal file
@@ -0,0 +1,222 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/playwright-community/playwright-go"
|
||||
)
|
||||
|
||||
// InteractiveBrowser provides low-level page control for interactive browser sessions.
|
||||
// Unlike Browser which is designed for scraping, InteractiveBrowser exposes mouse, keyboard,
|
||||
// screenshot, and navigation APIs suitable for remote browser control.
|
||||
type InteractiveBrowser interface {
|
||||
// Navigate goes to the given URL and returns the final URL after any redirects.
|
||||
Navigate(url string) (string, error)
|
||||
// GoBack navigates back in history. Returns the final URL.
|
||||
GoBack() (string, error)
|
||||
// GoForward navigates forward in history. Returns the final URL.
|
||||
GoForward() (string, error)
|
||||
// URL returns the current page URL.
|
||||
URL() string
|
||||
|
||||
// MouseClick clicks at the given coordinates with the specified button ("left", "middle", "right").
|
||||
MouseClick(x, y float64, button string) error
|
||||
// MouseMove moves the mouse to the given coordinates.
|
||||
MouseMove(x, y float64) error
|
||||
// MouseWheel scrolls by the given delta.
|
||||
MouseWheel(deltaX, deltaY float64) error
|
||||
|
||||
// KeyboardType types the given text as if it were entered character by character.
|
||||
KeyboardType(text string) error
|
||||
// KeyboardPress presses a special key (e.g. "Enter", "Tab", "Backspace").
|
||||
KeyboardPress(key string) error
|
||||
// KeyboardInsertText inserts text directly into the focused element by dispatching
|
||||
// only an input event (no keydown, keypress, or keyup). This is more reliable than
|
||||
// KeyboardType for pasting into password fields and custom input components.
|
||||
KeyboardInsertText(text string) error
|
||||
|
||||
// Screenshot takes a full-page screenshot as JPEG with the given quality (0-100).
|
||||
Screenshot(quality int) ([]byte, error)
|
||||
|
||||
// Cookies returns all cookies from the browser context.
|
||||
Cookies() ([]Cookie, error)
|
||||
|
||||
// Close tears down the browser.
|
||||
Close() error
|
||||
}
|
||||
|
||||
type interactiveBrowser struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
ctx playwright.BrowserContext
|
||||
page playwright.Page
|
||||
}
|
||||
|
||||
// NewInteractiveBrowser creates a headless browser with a page ready for interactive control.
|
||||
// The context is only used for cancellation during setup.
|
||||
func NewInteractiveBrowser(ctx context.Context, opts ...BrowserOptions) (InteractiveBrowser, error) {
|
||||
var thirtySeconds = 30 * time.Second
|
||||
opt := mergeOptions(BrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
Browser: BrowserChromium,
|
||||
Timeout: &thirtySeconds,
|
||||
Dimensions: Size{
|
||||
Width: 1280,
|
||||
Height: 720,
|
||||
},
|
||||
}, opts)
|
||||
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
type result struct {
|
||||
ib InteractiveBrowser
|
||||
err error
|
||||
}
|
||||
|
||||
ch := make(chan result, 1)
|
||||
|
||||
go func() {
|
||||
res, err := initBrowser(opt)
|
||||
if err != nil {
|
||||
ch <- result{nil, err}
|
||||
return
|
||||
}
|
||||
|
||||
page, err := res.bctx.NewPage()
|
||||
if err != nil {
|
||||
ch <- result{nil, fmt.Errorf("failed to create page: %w", err)}
|
||||
return
|
||||
}
|
||||
|
||||
ch <- result{
|
||||
ib: &interactiveBrowser{
|
||||
pw: res.pw,
|
||||
browser: res.browser,
|
||||
ctx: res.bctx,
|
||||
page: page,
|
||||
},
|
||||
}
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
case r := <-ch:
|
||||
return r.ib, r.err
|
||||
}
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) Navigate(url string) (string, error) {
|
||||
_, err := ib.page.Goto(url, playwright.PageGotoOptions{
|
||||
WaitUntil: playwright.WaitUntilStateLoad,
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("navigation failed: %w", err)
|
||||
}
|
||||
return ib.page.URL(), nil
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) GoBack() (string, error) {
|
||||
_, err := ib.page.GoBack()
|
||||
if err != nil {
|
||||
return ib.page.URL(), fmt.Errorf("go back failed: %w", err)
|
||||
}
|
||||
return ib.page.URL(), nil
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) GoForward() (string, error) {
|
||||
_, err := ib.page.GoForward()
|
||||
if err != nil {
|
||||
return ib.page.URL(), fmt.Errorf("go forward failed: %w", err)
|
||||
}
|
||||
return ib.page.URL(), nil
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) URL() string {
|
||||
return ib.page.URL()
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) MouseClick(x, y float64, button string) error {
|
||||
var btn *playwright.MouseButton
|
||||
switch button {
|
||||
case "right":
|
||||
btn = playwright.MouseButtonRight
|
||||
case "middle":
|
||||
btn = playwright.MouseButtonMiddle
|
||||
default:
|
||||
btn = playwright.MouseButtonLeft
|
||||
}
|
||||
return ib.page.Mouse().Click(x, y, playwright.MouseClickOptions{Button: btn})
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) MouseMove(x, y float64) error {
|
||||
return ib.page.Mouse().Move(x, y)
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) MouseWheel(deltaX, deltaY float64) error {
|
||||
return ib.page.Mouse().Wheel(deltaX, deltaY)
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) KeyboardType(text string) error {
|
||||
return ib.page.Keyboard().Type(text)
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) KeyboardPress(key string) error {
|
||||
return ib.page.Keyboard().Press(key)
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) KeyboardInsertText(text string) error {
|
||||
return ib.page.Keyboard().InsertText(text)
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) Screenshot(quality int) ([]byte, error) {
|
||||
return ib.page.Screenshot(playwright.PageScreenshotOptions{
|
||||
Type: playwright.ScreenshotTypeJpeg,
|
||||
Quality: playwright.Int(quality),
|
||||
})
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) Cookies() ([]Cookie, error) {
|
||||
pwCookies, err := ib.ctx.Cookies()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get cookies: %w", err)
|
||||
}
|
||||
|
||||
cookies := make([]Cookie, len(pwCookies))
|
||||
for i, c := range pwCookies {
|
||||
cookies[i] = playwrightCookieToCookie(c)
|
||||
}
|
||||
return cookies, nil
|
||||
}
|
||||
|
||||
func (ib *interactiveBrowser) Close() error {
|
||||
var errs []error
|
||||
if ib.page != nil {
|
||||
if err := ib.page.Close(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
if ib.ctx != nil {
|
||||
if err := ib.ctx.Close(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
if ib.browser != nil {
|
||||
if err := ib.browser.Close(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
if ib.pw != nil {
|
||||
if err := ib.pw.Stop(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
if len(errs) > 0 {
|
||||
return fmt.Errorf("errors during close: %v", errs)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
16
mock_test.go
Normal file
16
mock_test.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package extractor
|
||||
|
||||
import "time"
|
||||
|
||||
// mockDocument implements the Document interface for testing without Playwright.
|
||||
type mockDocument struct {
|
||||
mockNode
|
||||
url string
|
||||
content string
|
||||
}
|
||||
|
||||
func (m mockDocument) URL() string { return m.url }
|
||||
func (m mockDocument) Refresh() error { return nil }
|
||||
func (m mockDocument) Content() (string, error) { return m.content, nil }
|
||||
func (m mockDocument) Close() error { return nil }
|
||||
func (m mockDocument) WaitForNetworkIdle(_ *time.Duration) error { return nil }
|
||||
23
node_test.go
Normal file
23
node_test.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package extractor
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestEscapeJavaScript(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"hello", "hello"},
|
||||
{"it's", "it\\'s"},
|
||||
{`back\slash`, `back\\slash`},
|
||||
{`both\'`, `both\\\'`},
|
||||
{"", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := escapeJavaScript(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("escapeJavaScript(%q) = %q, want %q", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
3
nodes.go
3
nodes.go
@@ -13,6 +13,9 @@ func (n Nodes) Select(selector string) Nodes {
|
||||
}
|
||||
|
||||
func (d Nodes) First() Node {
|
||||
if len(d) == 0 {
|
||||
return nil
|
||||
}
|
||||
return d[0]
|
||||
}
|
||||
|
||||
|
||||
111
nodes_test.go
Normal file
111
nodes_test.go
Normal file
@@ -0,0 +1,111 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mockNode implements the Node interface for testing.
|
||||
type mockNode struct {
|
||||
text string
|
||||
textErr error
|
||||
content string
|
||||
children Nodes
|
||||
}
|
||||
|
||||
func (m mockNode) Content() (string, error) { return m.content, nil }
|
||||
func (m mockNode) Text() (string, error) { return m.text, m.textErr }
|
||||
func (m mockNode) Attr(_ string) (string, error) { return "", nil }
|
||||
func (m mockNode) Screenshot() ([]byte, error) { return nil, nil }
|
||||
func (m mockNode) Type(_ string) error { return nil }
|
||||
func (m mockNode) Click() error { return nil }
|
||||
func (m mockNode) Select(_ string) Nodes { return m.children }
|
||||
func (m mockNode) SelectFirst(_ string) Node { return m.children.First() }
|
||||
func (m mockNode) ForEach(_ string, _ func(Node) error) error { return nil }
|
||||
func (m mockNode) SetHidden(_ bool) error { return nil }
|
||||
func (m mockNode) SetAttribute(_, _ string) error { return nil }
|
||||
|
||||
func TestNodes_First_Empty(t *testing.T) {
|
||||
var nodes Nodes
|
||||
got := nodes.First()
|
||||
if got != nil {
|
||||
t.Errorf("First() on empty Nodes = %v, want nil", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodes_First_NonEmpty(t *testing.T) {
|
||||
n1 := mockNode{text: "first"}
|
||||
n2 := mockNode{text: "second"}
|
||||
nodes := Nodes{n1, n2}
|
||||
|
||||
got := nodes.First()
|
||||
if got == nil {
|
||||
t.Fatal("First() on non-empty Nodes returned nil")
|
||||
}
|
||||
|
||||
text, _ := got.Text()
|
||||
if text != "first" {
|
||||
t.Errorf("First().Text() = %q, want %q", text, "first")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodes_Select(t *testing.T) {
|
||||
child1 := mockNode{text: "child1"}
|
||||
child2 := mockNode{text: "child2"}
|
||||
child3 := mockNode{text: "child3"}
|
||||
|
||||
n1 := mockNode{children: Nodes{child1, child2}}
|
||||
n2 := mockNode{children: Nodes{child3}}
|
||||
|
||||
nodes := Nodes{n1, n2}
|
||||
result := nodes.Select("anything")
|
||||
|
||||
if len(result) != 3 {
|
||||
t.Errorf("Select() returned %d nodes, want 3", len(result))
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodes_Select_Empty(t *testing.T) {
|
||||
var nodes Nodes
|
||||
result := nodes.Select("anything")
|
||||
if len(result) != 0 {
|
||||
t.Errorf("Select() on empty Nodes returned %d nodes, want 0", len(result))
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodes_ExtractText(t *testing.T) {
|
||||
n1 := mockNode{text: "hello"}
|
||||
n2 := mockNode{text: "world"}
|
||||
nodes := Nodes{n1, n2}
|
||||
|
||||
texts, err := nodes.ExtractText()
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractText() error = %v", err)
|
||||
}
|
||||
|
||||
if len(texts) != 2 || texts[0] != "hello" || texts[1] != "world" {
|
||||
t.Errorf("ExtractText() = %v, want [hello world]", texts)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodes_ExtractText_Error(t *testing.T) {
|
||||
n1 := mockNode{text: "hello"}
|
||||
n2 := mockNode{textErr: fmt.Errorf("text error")}
|
||||
nodes := Nodes{n1, n2}
|
||||
|
||||
_, err := nodes.ExtractText()
|
||||
if err == nil {
|
||||
t.Fatal("ExtractText() expected error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodes_ExtractText_Empty(t *testing.T) {
|
||||
var nodes Nodes
|
||||
texts, err := nodes.ExtractText()
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractText() error = %v", err)
|
||||
}
|
||||
if len(texts) != 0 {
|
||||
t.Errorf("ExtractText() on empty = %v, want empty", texts)
|
||||
}
|
||||
}
|
||||
189
playwright.go
189
playwright.go
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
@@ -12,17 +11,18 @@ import (
|
||||
)
|
||||
|
||||
type playWrightBrowser struct {
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
ctx playwright.BrowserContext
|
||||
userAgent string
|
||||
timeout time.Duration
|
||||
cookieJar CookieJar
|
||||
pw *playwright.Playwright
|
||||
browser playwright.Browser
|
||||
ctx playwright.BrowserContext
|
||||
userAgent string
|
||||
timeout time.Duration
|
||||
cookieJar CookieJar
|
||||
serverAddr string
|
||||
}
|
||||
|
||||
var _ Browser = playWrightBrowser{}
|
||||
|
||||
type PlayWrightBrowserSelection string
|
||||
type BrowserSelection string
|
||||
|
||||
var (
|
||||
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
|
||||
@@ -31,18 +31,18 @@ var (
|
||||
)
|
||||
|
||||
const (
|
||||
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
|
||||
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
|
||||
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
||||
BrowserChromium BrowserSelection = "chromium"
|
||||
BrowserFirefox BrowserSelection = "firefox"
|
||||
BrowserWebKit BrowserSelection = "webkit"
|
||||
)
|
||||
|
||||
type Size struct {
|
||||
Width int
|
||||
Height int
|
||||
}
|
||||
type PlayWrightBrowserOptions struct {
|
||||
type BrowserOptions struct {
|
||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
||||
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
||||
Browser BrowserSelection // If unset defaults to Firefox.
|
||||
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
||||
|
||||
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
||||
@@ -53,6 +53,17 @@ type PlayWrightBrowserOptions struct {
|
||||
|
||||
Dimensions Size
|
||||
DarkMode bool
|
||||
|
||||
// ServerAddress is the address of a Playwright server to connect to.
|
||||
// Defaults to the value of the environment variable PLAYWRIGHT_SERVER_ADDRESS.
|
||||
ServerAddress string
|
||||
|
||||
// RequireServer will, if set, return an error if the connection to the
|
||||
// Playwright server fails instead of falling back to a local browser launch.
|
||||
RequireServer bool
|
||||
|
||||
// UseLocalOnly will, if set, not connect to the Playwright server, and instead launch a local browser.
|
||||
UseLocalOnly bool
|
||||
}
|
||||
|
||||
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
||||
@@ -77,129 +88,51 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
||||
}
|
||||
}
|
||||
|
||||
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
||||
func NewBrowser(ctx context.Context, opts ...BrowserOptions) (Browser, error) {
|
||||
var thirtySeconds = 30 * time.Second
|
||||
opt := PlayWrightBrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:137.0) Gecko/20100101 Firefox/137.0",
|
||||
Browser: PlayWrightBrowserSelectionFirefox,
|
||||
opt := mergeOptions(BrowserOptions{
|
||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
Browser: BrowserFirefox,
|
||||
Timeout: &thirtySeconds,
|
||||
DarkMode: false,
|
||||
}
|
||||
}, opts)
|
||||
|
||||
for _, o := range opts {
|
||||
if o.UserAgent != "" {
|
||||
opt.UserAgent = o.UserAgent
|
||||
}
|
||||
if o.Browser != "" {
|
||||
opt.Browser = o.Browser
|
||||
}
|
||||
if o.Timeout != nil {
|
||||
opt.Timeout = o.Timeout
|
||||
}
|
||||
if o.CookieJar != nil {
|
||||
opt.CookieJar = o.CookieJar
|
||||
}
|
||||
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||
opt.Dimensions = o.Dimensions
|
||||
}
|
||||
if o.DarkMode {
|
||||
opt.DarkMode = true
|
||||
}
|
||||
|
||||
opt.ShowBrowser = o.ShowBrowser
|
||||
}
|
||||
|
||||
pw, err := playwright.Run()
|
||||
|
||||
if err != nil {
|
||||
err = playwright.Install()
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pw, err = playwright.Run()
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var bt playwright.BrowserType
|
||||
|
||||
switch opt.Browser {
|
||||
case PlayWrightBrowserSelectionChromium:
|
||||
bt = pw.Chromium
|
||||
|
||||
case PlayWrightBrowserSelectionFirefox:
|
||||
bt = pw.Firefox
|
||||
|
||||
case PlayWrightBrowserSelectionWebKit:
|
||||
bt = pw.WebKit
|
||||
|
||||
default:
|
||||
return nil, ErrInvalidBrowserSelection
|
||||
}
|
||||
|
||||
browser, err := bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||
})
|
||||
if err != nil {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var viewport *playwright.Size
|
||||
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
||||
viewport = &playwright.Size{
|
||||
Width: opt.Dimensions.Width,
|
||||
Height: opt.Dimensions.Height,
|
||||
}
|
||||
type browserResult struct {
|
||||
browser Browser
|
||||
err error
|
||||
}
|
||||
|
||||
var scheme *playwright.ColorScheme
|
||||
resultCh := make(chan browserResult, 1)
|
||||
|
||||
if opt.DarkMode {
|
||||
scheme = playwright.ColorSchemeDark
|
||||
} else {
|
||||
scheme = playwright.ColorSchemeNoPreference
|
||||
}
|
||||
|
||||
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||
UserAgent: playwright.String(opt.UserAgent),
|
||||
Viewport: viewport,
|
||||
ColorScheme: scheme,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if opt.CookieJar != nil {
|
||||
cookies, err := opt.CookieJar.GetAll()
|
||||
go func() {
|
||||
res, err := initBrowser(opt)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
||||
resultCh <- browserResult{nil, err}
|
||||
return
|
||||
}
|
||||
|
||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||
|
||||
for i, cookie := range cookies {
|
||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
|
||||
resultCh <- browserResult{
|
||||
browser: playWrightBrowser{
|
||||
pw: res.pw,
|
||||
browser: res.browser,
|
||||
userAgent: res.opt.UserAgent,
|
||||
timeout: *res.opt.Timeout,
|
||||
cookieJar: res.opt.CookieJar,
|
||||
ctx: res.bctx,
|
||||
serverAddr: res.opt.ServerAddress,
|
||||
},
|
||||
}
|
||||
}()
|
||||
|
||||
err = c.AddCookies(pwCookies)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
case result := <-resultCh:
|
||||
return result.browser, result.err
|
||||
}
|
||||
|
||||
return playWrightBrowser{
|
||||
pw: pw,
|
||||
browser: browser,
|
||||
userAgent: opt.UserAgent,
|
||||
timeout: *opt.Timeout,
|
||||
cookieJar: opt.CookieJar,
|
||||
ctx: c,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
|
||||
@@ -245,10 +178,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP
|
||||
|
||||
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||
|
||||
if resp.Status() != 200 {
|
||||
time.Sleep(999 * time.Hour * 24)
|
||||
time.Sleep(25 * time.Second)
|
||||
|
||||
if resp.Status() < 200 || resp.Status() >= 300 {
|
||||
_ = page.Close()
|
||||
|
||||
if resp.Status() == 404 {
|
||||
@@ -284,12 +214,9 @@ func (b playWrightBrowser) Close() error {
|
||||
)
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
_ = cl.Close()
|
||||
}
|
||||
|
||||
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
||||
browser, err := NewPlayWrightBrowser(PlayWrightBrowserOptions{
|
||||
browser, err := NewBrowser(ctx, BrowserOptions{
|
||||
Timeout: &timeout,
|
||||
})
|
||||
|
||||
@@ -297,14 +224,14 @@ func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]by
|
||||
return nil, fmt.Errorf("error creating browser: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(browser)
|
||||
defer DeferClose(browser)
|
||||
|
||||
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error opening page: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer DeferClose(doc)
|
||||
|
||||
return doc.Screenshot()
|
||||
}
|
||||
|
||||
72
readability_test.go
Normal file
72
readability_test.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReadability_ValidHTML(t *testing.T) {
|
||||
html := `<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test Article</title></head>
|
||||
<body>
|
||||
<article>
|
||||
<h1>Test Article</h1>
|
||||
<p>This is a test article with enough content to be parsed by readability.
|
||||
It needs to have a reasonable amount of text so the algorithm considers it
|
||||
a valid article. Let us add several sentences to make sure this works
|
||||
correctly. The readability library requires a minimum amount of content
|
||||
to successfully extract an article from a page.</p>
|
||||
<p>Here is another paragraph to add more content. We want to make sure
|
||||
that the content is substantial enough for the readability algorithm to
|
||||
consider this a valid article and extract the text properly.</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
doc := mockDocument{
|
||||
url: "https://example.com/article",
|
||||
content: html,
|
||||
}
|
||||
|
||||
article, err := Readability(context.Background(), doc)
|
||||
if err != nil {
|
||||
t.Fatalf("Readability() error = %v", err)
|
||||
}
|
||||
|
||||
if article.Title != "Test Article" {
|
||||
t.Errorf("Title = %q, want %q", article.Title, "Test Article")
|
||||
}
|
||||
|
||||
if article.TextContent == "" {
|
||||
t.Error("TextContent should not be empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadability_EmptyContent(t *testing.T) {
|
||||
doc := mockDocument{
|
||||
url: "https://example.com/empty",
|
||||
content: "",
|
||||
}
|
||||
|
||||
article, err := Readability(context.Background(), doc)
|
||||
if err != nil {
|
||||
t.Fatalf("Readability() unexpected error = %v", err)
|
||||
}
|
||||
// Empty content should produce an empty article.
|
||||
if article.Title != "" && article.TextContent != "" {
|
||||
t.Error("expected empty article from empty content")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadability_InvalidURL(t *testing.T) {
|
||||
doc := mockDocument{
|
||||
url: "://invalid",
|
||||
content: "<html><body><p>text</p></body></html>",
|
||||
}
|
||||
|
||||
_, err := Readability(context.Background(), doc)
|
||||
if err == nil {
|
||||
t.Error("Readability() expected error for invalid URL, got nil")
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -27,11 +26,6 @@ type Item struct {
|
||||
Price float64
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||
return DefaultConfig.GetItemFromURL(ctx, b, u)
|
||||
}
|
||||
@@ -57,18 +51,18 @@ func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.
|
||||
res.ID, _ = strconv.Atoi(a[3])
|
||||
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("failed to open page: %w", err)
|
||||
}
|
||||
|
||||
names := doc.Select("h2.h4")
|
||||
names := doc.Select(".h4")
|
||||
|
||||
if len(names) > 0 {
|
||||
res.Name, _ = names[0].Text()
|
||||
}
|
||||
|
||||
prices := doc.Select("h4.h2")
|
||||
prices := doc.Select(".h2")
|
||||
|
||||
if len(prices) > 0 {
|
||||
priceStr, _ := prices[0].Text()
|
||||
|
||||
39
sites/aislegopher/aislegopher_test.go
Normal file
39
sites/aislegopher/aislegopher_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package aislegopher
|
||||
|
||||
import (
	"context"
	"errors"
	"net/url"
	"testing"
)
|
||||
|
||||
func TestGetItemFromURL_InvalidHost(t *testing.T) {
|
||||
u, _ := url.Parse("https://example.com/p/slug/123")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemFromURL_InvalidPath_NoP(t *testing.T) {
|
||||
u, _ := url.Parse("https://aislegopher.com/x/slug/123")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemFromURL_InvalidPath_TooShort(t *testing.T) {
|
||||
u, _ := url.Parse("https://aislegopher.com/p/slug")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemFromURL_InvalidPath_TooLong(t *testing.T) {
|
||||
u, _ := url.Parse("https://aislegopher.com/p/slug/123/extra")
|
||||
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||
if err != ErrInvalidURL {
|
||||
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||
}
|
||||
}
|
||||
@@ -3,10 +3,10 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"os"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
|
||||
"github.com/urfave/cli/v3"
|
||||
@@ -22,11 +22,6 @@ func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
|
||||
return res
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
flags = append(flags, browser.Flags...)
|
||||
@@ -44,7 +39,7 @@ func main() {
|
||||
return fmt.Errorf("failed to create browser: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
arg := c.Args().First()
|
||||
|
||||
@@ -72,6 +67,7 @@ func main() {
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"strings"
|
||||
@@ -39,12 +38,6 @@ func (c Config) validate() Config {
|
||||
|
||||
var DefaultConfig = Config{}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
|
||||
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||
c = c.validate()
|
||||
@@ -130,10 +123,9 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
fmt.Println("context already done before entering the loop:", ctx.Err())
|
||||
slog.Debug("context already done before entering the loop", "err", ctx.Err())
|
||||
return nil, ctx.Err()
|
||||
default:
|
||||
fmt.Println("context not done yet")
|
||||
// Proceed with the loop
|
||||
}
|
||||
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
|
||||
@@ -141,6 +133,9 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
||||
// if the page path starts with /wip/ then we are still waiting
|
||||
// also periodically refresh the page just in case
|
||||
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
keepGoing := true
|
||||
for keepGoing {
|
||||
select {
|
||||
@@ -148,14 +143,14 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
||||
slog.Info("context done")
|
||||
keepGoing = false
|
||||
|
||||
case <-time.NewTicker(5 * time.Second).C:
|
||||
case <-ticker.C:
|
||||
archivedUrl, err := url.Parse(doc.URL())
|
||||
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
fmt.Println("checking url:", archivedUrl.String())
|
||||
slog.Debug("checking url", "url", archivedUrl.String())
|
||||
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
|
||||
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
|
||||
keepGoing = false
|
||||
|
||||
37
sites/archive/archive_test.go
Normal file
37
sites/archive/archive_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package archive
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestConfig_Validate_Defaults(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
|
||||
if c.Endpoint != "https://archive.ph" {
|
||||
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
|
||||
}
|
||||
if c.Timeout == nil {
|
||||
t.Fatal("Timeout should not be nil after validate")
|
||||
}
|
||||
if *c.Timeout != 1*time.Hour {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate_Preserves(t *testing.T) {
|
||||
timeout := 5 * time.Minute
|
||||
c := Config{
|
||||
Endpoint: "https://archive.org",
|
||||
Timeout: &timeout,
|
||||
}
|
||||
c = c.validate()
|
||||
|
||||
if c.Endpoint != "https://archive.org" {
|
||||
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
|
||||
}
|
||||
if *c.Timeout != 5*time.Minute {
|
||||
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute)
|
||||
}
|
||||
}
|
||||
@@ -28,7 +28,7 @@ var Flags = ArchiveFlags{
|
||||
},
|
||||
}
|
||||
|
||||
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
|
||||
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) (archive.Config, error) {
|
||||
c := archive.DefaultConfig
|
||||
|
||||
if e := cmd.String("endpoint"); e != "" {
|
||||
@@ -38,12 +38,12 @@ func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Conf
|
||||
if t := cmd.String("timeout"); t != "" {
|
||||
d, err := time.ParseDuration(t)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
return c, fmt.Errorf("invalid timeout duration: %w", err)
|
||||
}
|
||||
c.Timeout = &d
|
||||
}
|
||||
|
||||
return c
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
@@ -122,7 +122,8 @@ func main() {
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,12 +3,13 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/urfave/cli/v3"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
||||
)
|
||||
@@ -26,7 +27,7 @@ var Flags = DuckDuckGoFlags{
|
||||
},
|
||||
}
|
||||
|
||||
func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) duckduckgo.Config {
|
||||
func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) (duckduckgo.Config, error) {
|
||||
var res = duckduckgo.DefaultConfig
|
||||
|
||||
if r := cmd.String("region"); r != "" {
|
||||
@@ -42,17 +43,11 @@ func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) duckduckgo.Config {
|
||||
case "off":
|
||||
res.SafeSearch = duckduckgo.SafeSearchOff
|
||||
default:
|
||||
panic("invalid safe search value")
|
||||
return res, fmt.Errorf("invalid safe search value: %s", s)
|
||||
}
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
@@ -66,8 +61,10 @@ func main() {
|
||||
Usage: "Search DuckDuckGo",
|
||||
Flags: flags,
|
||||
Action: func(ctx context.Context, command *cli.Command) error {
|
||||
c := Flags.ToConfig(command)
|
||||
defer deferClose(nil)
|
||||
c, err := Flags.ToConfig(command)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
query := strings.TrimSpace(strings.Join(command.Args().Slice(), " "))
|
||||
|
||||
@@ -76,7 +73,7 @@ func main() {
|
||||
}
|
||||
|
||||
b, err := browser.FromCommand(ctx, command)
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create browser: %w", err)
|
||||
@@ -87,7 +84,7 @@ func main() {
|
||||
return fmt.Errorf("failed to open search: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(search)
|
||||
defer extractor.DeferClose(search)
|
||||
|
||||
res := search.GetResults()
|
||||
fmt.Println("Results:", res)
|
||||
@@ -105,9 +102,8 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package duckduckgo
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
|
||||
@@ -71,12 +70,6 @@ type Result struct {
|
||||
Description string
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
|
||||
u := c.ToSearchURL(query)
|
||||
|
||||
@@ -97,7 +90,7 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
||||
|
||||
slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
|
||||
116
sites/duckduckgo/duckduckgo_test.go
Normal file
116
sites/duckduckgo/duckduckgo_test.go
Normal file
@@ -0,0 +1,116 @@
|
||||
package duckduckgo
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConfig_ToSearchURL_Default(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("test query")
|
||||
|
||||
if u.Host != "duckduckgo.com" {
|
||||
t.Errorf("Host = %q, want %q", u.Host, "duckduckgo.com")
|
||||
}
|
||||
|
||||
if u.Query().Get("q") != "test query" {
|
||||
t.Errorf("q = %q, want %q", u.Query().Get("q"), "test query")
|
||||
}
|
||||
|
||||
if u.Query().Get("kp") != "-2" {
|
||||
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "-2")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_SafeSearchOn(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOn}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Query().Get("kp") != "1" {
|
||||
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_SafeSearchModerate(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchModerate}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Query().Get("kp") != "-1" {
|
||||
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "-1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_SafeSearchOff(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Query().Get("kp") != "-2" {
|
||||
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "-2")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_WithRegion(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff, Region: "us-en"}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Query().Get("kl") != "us-en" {
|
||||
t.Errorf("kl = %q, want %q", u.Query().Get("kl"), "us-en")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_WithQuery(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("golang testing")
|
||||
|
||||
if u.Query().Get("q") != "golang testing" {
|
||||
t.Errorf("q = %q, want %q", u.Query().Get("q"), "golang testing")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate_DefaultsSafeSearch(t *testing.T) {
|
||||
c := Config{SafeSearch: 0}
|
||||
c = c.validate()
|
||||
|
||||
if c.SafeSearch != SafeSearchOff {
|
||||
t.Errorf("validate() SafeSearch = %d, want %d (SafeSearchOff)", c.SafeSearch, SafeSearchOff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_NoRegion(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Query().Get("kl") != "" {
|
||||
t.Errorf("kl should be empty when no region, got %q", u.Query().Get("kl"))
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_Scheme(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("test")
|
||||
|
||||
if u.Scheme != "https" {
|
||||
t.Errorf("Scheme = %q, want %q", u.Scheme, "https")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_ToSearchURL_SpecialChars(t *testing.T) {
|
||||
c := Config{SafeSearch: SafeSearchOff}
|
||||
u := c.ToSearchURL("go lang & testing")
|
||||
|
||||
if u.Query().Get("q") != "go lang & testing" {
|
||||
t.Errorf("q = %q, want %q", u.Query().Get("q"), "go lang & testing")
|
||||
}
|
||||
}
|
||||
|
||||
func TestResult_ZeroValue(t *testing.T) {
|
||||
var r Result
|
||||
if r.URL != "" || r.Title != "" || r.Description != "" {
|
||||
t.Error("zero-value Result should have empty fields")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultConfig_SafeSearch(t *testing.T) {
|
||||
if DefaultConfig.SafeSearch != SafeSearchOff {
|
||||
t.Errorf("DefaultConfig.SafeSearch = %d, want %d", DefaultConfig.SafeSearch, SafeSearchOff)
|
||||
}
|
||||
}
|
||||
@@ -3,12 +3,12 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/google"
|
||||
)
|
||||
@@ -42,12 +42,6 @@ func (f GoogleFlags) ToConfig(_ context.Context, cmd *cli.Command) google.Config
|
||||
return c
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
var flags []cli.Flag
|
||||
|
||||
@@ -67,7 +61,7 @@ func main() {
|
||||
|
||||
b, err := browser.FromCommand(ctx, cli)
|
||||
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -87,9 +81,8 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package google
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
@@ -48,23 +47,20 @@ type Result struct {
|
||||
Description string
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
||||
c = c.validate()
|
||||
|
||||
u, err := url.Parse(fmt.Sprintf("https://%s/search?q=%s", c.BaseURL, query))
|
||||
u, err := url.Parse(fmt.Sprintf("https://%s/search", c.BaseURL))
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid url: %w", err)
|
||||
}
|
||||
|
||||
vals := u.Query()
|
||||
vals.Set("q", query)
|
||||
|
||||
if c.Language != "" {
|
||||
u.Query().Set("hl", c.Language)
|
||||
vals.Set("hl", c.Language)
|
||||
}
|
||||
|
||||
if c.Country != "" {
|
||||
@@ -84,17 +80,19 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
||||
}
|
||||
|
||||
if country != "" {
|
||||
u.Query().Set("cr", country)
|
||||
vals.Set("cr", country)
|
||||
}
|
||||
}
|
||||
|
||||
u.RawQuery = vals.Encode()
|
||||
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
var res []Result
|
||||
|
||||
|
||||
39
sites/google/google_test.go
Normal file
39
sites/google/google_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package google
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConfig_Validate_Defaults(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
|
||||
if c.BaseURL != "google.com" {
|
||||
t.Errorf("BaseURL = %q, want %q", c.BaseURL, "google.com")
|
||||
}
|
||||
if c.Language != "en" {
|
||||
t.Errorf("Language = %q, want %q", c.Language, "en")
|
||||
}
|
||||
if c.Country != "us" {
|
||||
t.Errorf("Country = %q, want %q", c.Country, "us")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate_Preserves(t *testing.T) {
|
||||
c := Config{
|
||||
BaseURL: "google.co.uk",
|
||||
Language: "fr",
|
||||
Country: "uk",
|
||||
}
|
||||
c = c.validate()
|
||||
|
||||
if c.BaseURL != "google.co.uk" {
|
||||
t.Errorf("BaseURL = %q, want %q", c.BaseURL, "google.co.uk")
|
||||
}
|
||||
if c.Language != "fr" {
|
||||
t.Errorf("Language = %q, want %q", c.Language, "fr")
|
||||
}
|
||||
if c.Country != "uk" {
|
||||
t.Errorf("Country = %q, want %q", c.Country, "uk")
|
||||
}
|
||||
}
|
||||
@@ -51,10 +51,8 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package megamillions
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -33,12 +32,6 @@ type NextDrawing struct {
|
||||
Jackpot currency.Amount
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// netTicksToTime converts a .NET tick count into a time.Time.
//
// .NET ticks are 100-nanosecond intervals counted from 0001-01-01T00:00:00,
// which places the Unix epoch at tick 621355968000000000. The epoch offset is
// removed while the value is still in ticks, and the whole seconds and the
// sub-second remainder are handed to time.Unix separately: multiplying a
// present-day tick count by 100 to obtain nanoseconds overflows int64.
func netTicksToTime(t int64) time.Time {
	const (
		ticksAtUnixEpoch = 621355968000000000
		ticksPerSecond   = 10000000
		nanosPerTick     = 100
	)

	sinceEpoch := t - ticksAtUnixEpoch
	return time.Unix(sinceEpoch/ticksPerSecond, (sinceEpoch%ticksPerSecond)*nanosPerTick)
}
|
||||
@@ -64,7 +57,6 @@ func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
||||
return nil, fmt.Errorf("failed to parse date: %w", err)
|
||||
}
|
||||
|
||||
fmt.Println("ticks", ticks)
|
||||
drawing.Date = netTicksToTime(ticks)
|
||||
|
||||
err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error {
|
||||
@@ -199,23 +191,12 @@ func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, er
|
||||
|
||||
numeric := numericOnly(txt)
|
||||
|
||||
set := false
|
||||
if strings.Contains(txt, "Billion") {
|
||||
amt := currency.USD.Amount(numeric * 1000000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
nextDrawing.Jackpot = currency.USD.Amount(numeric * 1000000000)
|
||||
} else if strings.Contains(txt, "Million") {
|
||||
amt := currency.USD.Amount(numeric * 1000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
nextDrawing.Jackpot = currency.USD.Amount(numeric * 1000000)
|
||||
} else {
|
||||
amt := currency.USD.Amount(numeric)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
}
|
||||
|
||||
if !set {
|
||||
return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
|
||||
nextDrawing.Jackpot = currency.USD.Amount(numeric)
|
||||
}
|
||||
|
||||
return &nextDrawing, nil
|
||||
@@ -230,7 +211,7 @@ func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing,
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
d, err := getDrawing(ctx, doc)
|
||||
|
||||
|
||||
73
sites/megamillions/megamillions_test.go
Normal file
73
sites/megamillions/megamillions_test.go
Normal file
@@ -0,0 +1,73 @@
|
||||
package megamillions
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestNetTicksToTime_Consistency(t *testing.T) {
|
||||
// netTicksToTime converts .NET ticks to Go time.
|
||||
// Verify it produces consistent results for the same input.
|
||||
ticks := int64(638396256000000000)
|
||||
t1 := netTicksToTime(ticks)
|
||||
t2 := netTicksToTime(ticks)
|
||||
|
||||
if !t1.Equal(t2) {
|
||||
t.Errorf("netTicksToTime is not consistent: %v != %v", t1, t2)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetTicksToTime_Ordering(t *testing.T) {
|
||||
// A larger ticks value should produce a later time.
|
||||
earlier := netTicksToTime(638396256000000000)
|
||||
later := netTicksToTime(638396256100000000) // 10 seconds later in ticks
|
||||
|
||||
if !later.After(earlier) {
|
||||
t.Errorf("expected later ticks to produce later time: %v vs %v", earlier, later)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetTicksToTime_DifferenceIsCorrect(t *testing.T) {
|
||||
// .NET ticks are 100-nanosecond intervals.
|
||||
// 10,000,000 ticks = 1 second.
|
||||
ticks1 := int64(638396256000000000)
|
||||
ticks2 := ticks1 + 10000000 // 1 second later
|
||||
|
||||
t1 := netTicksToTime(ticks1)
|
||||
t2 := netTicksToTime(ticks2)
|
||||
|
||||
diff := t2.Sub(t1)
|
||||
if diff != time.Second {
|
||||
t.Errorf("expected 1 second difference, got %v", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetTicksToTime_NotZero(t *testing.T) {
|
||||
// Verify the function produces a non-zero time for typical ticks values.
|
||||
ticks := int64(638396256000000000)
|
||||
result := netTicksToTime(ticks)
|
||||
|
||||
if result.IsZero() {
|
||||
t.Error("netTicksToTime should not return zero time for valid ticks")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
_ = c // validate is a no-op, just verify no panic
|
||||
}
|
||||
|
||||
func TestDrawing_ZeroValue(t *testing.T) {
|
||||
var d Drawing
|
||||
if d.MegaBall != 0 || d.Megaplier != 0 {
|
||||
t.Error("zero-value Drawing should have zero fields")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNextDrawing_ZeroValue(t *testing.T) {
|
||||
var nd NextDrawing
|
||||
if nd.Date != "" {
|
||||
t.Error("zero-value NextDrawing should have empty date")
|
||||
}
|
||||
}
|
||||
@@ -51,10 +51,8 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,14 +3,11 @@ package powerball
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
|
||||
"golang.org/x/text/currency"
|
||||
)
|
||||
|
||||
type Config struct {
|
||||
@@ -30,19 +27,28 @@ type Drawing struct {
|
||||
}
|
||||
|
||||
type NextDrawing struct {
|
||||
Date string
|
||||
Jackpot currency.Amount
|
||||
}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
Date string
|
||||
JackpotDollars int
|
||||
}
|
||||
|
||||
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
||||
var drawing Drawing
|
||||
|
||||
dateNode := doc.SelectFirst("#numbers .title-date")
|
||||
if dateNode == nil {
|
||||
return nil, fmt.Errorf("failed to find date element")
|
||||
}
|
||||
|
||||
dateStr, err := dateNode.Text()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get date text: %w", err)
|
||||
}
|
||||
|
||||
drawing.Date, err = time.Parse("Mon, Jan 2, 2006", dateStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse date %q: %w", dateStr, err)
|
||||
}
|
||||
|
||||
nums := doc.Select("div.game-ball-group div.white-balls")
|
||||
|
||||
if len(nums) != 5 {
|
||||
@@ -163,23 +169,12 @@ func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, er
|
||||
|
||||
numeric := numericOnly(txt)
|
||||
|
||||
set := false
|
||||
if strings.Contains(txt, "Billion") {
|
||||
amt := currency.USD.Amount(numeric * 1000000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
nextDrawing.JackpotDollars = int(numeric * 1000000000)
|
||||
} else if strings.Contains(txt, "Million") {
|
||||
amt := currency.USD.Amount(numeric * 1000000)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
nextDrawing.JackpotDollars = int(numeric * 1000000)
|
||||
} else {
|
||||
amt := currency.USD.Amount(numeric)
|
||||
nextDrawing.Jackpot = amt
|
||||
set = true
|
||||
}
|
||||
|
||||
if !set {
|
||||
return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
|
||||
nextDrawing.JackpotDollars = int(numeric)
|
||||
}
|
||||
|
||||
return &nextDrawing, nil
|
||||
@@ -194,7 +189,7 @@ func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing,
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
d, err := getDrawing(ctx, doc)
|
||||
|
||||
|
||||
34
sites/powerball/powerball_test.go
Normal file
34
sites/powerball/powerball_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package powerball
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestConfig_Validate(t *testing.T) {
|
||||
c := Config{}
|
||||
c = c.validate()
|
||||
// validate is a no-op for powerball Config, just verify it doesn't panic.
|
||||
_ = c
|
||||
}
|
||||
|
||||
func TestDefaultConfig(t *testing.T) {
|
||||
c := DefaultConfig
|
||||
_ = c
|
||||
}
|
||||
|
||||
func TestDrawing_ZeroValue(t *testing.T) {
|
||||
var d Drawing
|
||||
if d.PowerBall != 0 || d.PowerPlay != 0 {
|
||||
t.Error("zero-value Drawing should have zero fields")
|
||||
}
|
||||
for i, n := range d.Numbers {
|
||||
if n != 0 {
|
||||
t.Errorf("Numbers[%d] = %d, want 0", i, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNextDrawing_ZeroValue(t *testing.T) {
|
||||
var nd NextDrawing
|
||||
if nd.Date != "" || nd.JackpotDollars != 0 {
|
||||
t.Error("zero-value NextDrawing should have empty/zero fields")
|
||||
}
|
||||
}
|
||||
@@ -49,10 +49,8 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
err := cli.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -4,8 +4,6 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
@@ -13,12 +11,6 @@ type Config struct{}
|
||||
|
||||
var DefaultConfig = Config{}
|
||||
|
||||
func deferClose(cl io.Closer) {
|
||||
if cl != nil {
|
||||
_ = cl.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.Browser) (string, error) {
|
||||
return DefaultConfig.GetMostCommonDesktopUserAgent(ctx, b)
|
||||
}
|
||||
@@ -30,7 +22,7 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
||||
return "", fmt.Errorf("failed to open useragents.me: %w", err)
|
||||
}
|
||||
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
s := doc.Select("#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea:nth-child(4)")
|
||||
|
||||
text := ""
|
||||
@@ -44,8 +36,6 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
||||
}
|
||||
data := []map[string]any{}
|
||||
|
||||
fmt.Println("text", text)
|
||||
|
||||
err = json.Unmarshal([]byte(text), &data)
|
||||
|
||||
if err != nil {
|
||||
@@ -63,8 +53,12 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
||||
}
|
||||
|
||||
if pct > highestPct {
|
||||
ua, ok := agent["ua"].(string)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
highestPct = pct
|
||||
highestAgent = agent["ua"].(string)
|
||||
highestAgent = ua
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
9
sites/useragents/useragents_test.go
Normal file
9
sites/useragents/useragents_test.go
Normal file
@@ -0,0 +1,9 @@
|
||||
package useragents
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDefaultConfig(t *testing.T) {
|
||||
// DefaultConfig should be a zero-value Config.
|
||||
c := DefaultConfig
|
||||
_ = c // Just verify it exists and is usable.
|
||||
}
|
||||
@@ -3,10 +3,10 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/url"
|
||||
"os"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||
|
||||
"github.com/urfave/cli/v3"
|
||||
@@ -14,12 +14,6 @@ import (
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/wegmans"
|
||||
)
|
||||
|
||||
// deferClose closes cl if it is non-nil and discards the error returned by
// Close. A nil cl is ignored.
func deferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	_ = cl.Close()
}
|
||||
|
||||
type WegmansFlags []cli.Flag
|
||||
|
||||
var Flags = WegmansFlags{}
|
||||
@@ -44,7 +38,7 @@ func main() {
|
||||
cfg := Flags.ToConfig(cmd)
|
||||
|
||||
b, err := browser.FromCommand(ctx, cmd)
|
||||
defer deferClose(b)
|
||||
defer extractor.DeferClose(b)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating browser: %w", err)
|
||||
@@ -73,9 +67,8 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
err := app.Run(context.Background(), os.Args)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := app.Run(context.Background(), os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ package wegmans
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -29,12 +29,6 @@ type Item struct {
|
||||
Unit string
|
||||
}
|
||||
|
||||
// deferClose closes c if it is non-nil and discards the error returned by
// Close. A nil c is ignored.
func deferClose(c io.Closer) {
	if c == nil {
		return
	}
	_ = c.Close()
}
|
||||
|
||||
func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||
|
||||
if b == nil {
|
||||
@@ -67,7 +61,7 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
||||
}
|
||||
|
||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||
defer deferClose(doc)
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
if err != nil {
|
||||
return Item{}, err
|
||||
@@ -80,23 +74,28 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
||||
ID: id,
|
||||
}
|
||||
|
||||
titles := doc.Select("h1[data-test]")
|
||||
titles := doc.Select("h1[data-testid]")
|
||||
|
||||
if len(titles) != 0 {
|
||||
res.Name, _ = titles[0].Text()
|
||||
}
|
||||
|
||||
prices := doc.Select("span[data-test=\"amount\"] span:nth-child(1)")
|
||||
prices := doc.Select("div.component--product-price:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(2)")
|
||||
|
||||
slog.Info("prices", "len", len(prices))
|
||||
if len(prices) != 0 {
|
||||
priceStr, _ := prices[0].Text()
|
||||
slog.Info("price", "0", prices[0], "text", priceStr)
|
||||
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
||||
priceStr = strings.ReplaceAll(priceStr, ",", "")
|
||||
// If the price contains a "/" (for example "1.99/ea"), keep only the part before it.
|
||||
priceStr = strings.Split(priceStr, "/")[0]
|
||||
price, _ := strconv.ParseFloat(priceStr, 64)
|
||||
slog.Info("price", "0", prices[0], "text", priceStr, "price", price)
|
||||
res.Price = price
|
||||
}
|
||||
|
||||
unitPrices := doc.Select(`span[data-test="per-unit-price"]`)
|
||||
unitPrices := doc.Select(`div.component--product-price:nth-child(1) span.price-per-unit`)
|
||||
|
||||
if len(unitPrices) != 0 {
|
||||
unitPriceStr, _ := unitPrices[0].Text()
|
||||
@@ -111,8 +110,15 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
||||
if len(units) > 1 {
|
||||
res.Unit = strings.TrimSpace(units[1])
|
||||
res.UnitPrice, _ = strconv.ParseFloat(units[0], 64)
|
||||
|
||||
// The unit may carry a trailing period (for example "lb."); strip it.
|
||||
if strings.HasSuffix(res.Unit, ".") {
|
||||
res.Unit = strings.TrimSuffix(res.Unit, ".")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("res", "res", res)
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
39
sites/wegmans/wegmans_test.go
Normal file
39
sites/wegmans/wegmans_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package wegmans
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetItemPrice_NilBrowser(t *testing.T) {
|
||||
u, _ := url.Parse("https://shop.wegmans.com/product/24921")
|
||||
_, err := DefaultConfig.GetItemPrice(context.Background(), nil, u)
|
||||
if err != ErrNilBrowser {
|
||||
t.Errorf("expected ErrNilBrowser, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemPrice_NilURL(t *testing.T) {
|
||||
// NilBrowser check comes before NilURL, so we can't test NilURL
|
||||
// independently without a real browser. Verify the error sentinel exists.
|
||||
if ErrNilURL.Error() != "url is nil" {
|
||||
t.Errorf("ErrNilURL = %q, want %q", ErrNilURL.Error(), "url is nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItemPrice_ErrorSentinels(t *testing.T) {
|
||||
if ErrInvalidURL.Error() != "invalid url" {
|
||||
t.Errorf("ErrInvalidURL = %q, want %q", ErrInvalidURL.Error(), "invalid url")
|
||||
}
|
||||
if ErrNilBrowser.Error() != "browser is nil" {
|
||||
t.Errorf("ErrNilBrowser = %q, want %q", ErrNilBrowser.Error(), "browser is nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestItem_ZeroValue(t *testing.T) {
|
||||
var item Item
|
||||
if item.ID != 0 || item.Name != "" || item.Price != 0 || item.UnitPrice != 0 || item.Unit != "" {
|
||||
t.Error("zero-value Item should have empty/zero fields")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user