Compare commits
21 Commits
6f4ca22b6a
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 05ca15b165 | |||
| 294097c3b6 | |||
| 022e002f98 | |||
| 51ce639994 | |||
| cb2ed10cfd | |||
| e7b7e78796 | |||
| e807dbb2ff | |||
| 52a9cb585d | |||
| 868acfae40 | |||
| 82fce5a200 | |||
| 5fe7313fa4 | |||
| 39c2c7d37a | |||
| e32a6fa791 | |||
| afa0238758 | |||
| 9ae8619f93 | |||
| f4caef22b0 | |||
| 9947cae947 | |||
| dc43d1626a | |||
| 2d60940001 | |||
| d0fffb0411 | |||
| 8b4e43c40f |
35
.gitea/workflows/ci.yml
Normal file
35
.gitea/workflows/ci.yml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Continuous integration: build, test and vet the Go module on every push to
# main and on every pull request that targets main.
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  # Compile every package in the module.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: https://gitea.com/actions/checkout@v4
      - uses: https://gitea.com/actions/setup-go@v3
        with:
          # Take the Go toolchain version from go.mod.
          go-version-file: go.mod
      - run: go build ./...

  # Run the unit tests of every package.
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: https://gitea.com/actions/checkout@v4
      - uses: https://gitea.com/actions/setup-go@v3
        with:
          go-version-file: go.mod
      - run: go test ./...

  # Run go vet over every package.
  vet:
    runs-on: ubuntu-latest
    steps:
      - uses: https://gitea.com/actions/checkout@v4
      - uses: https://gitea.com/actions/setup-go@v3
        with:
          go-version-file: go.mod
      - run: go vet ./...
|
||||||
37
MIGRATION.md
Normal file
37
MIGRATION.md
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# Migration Guide
|
||||||
|
|
||||||
|
This guide documents all breaking API changes from the restructuring of go-extractor.
|
||||||
|
|
||||||
|
All core interfaces (`Browser`, `Document`, `Node`, `CookieJar`, `InteractiveBrowser`) are **unchanged**.
|
||||||
|
|
||||||
|
## Type and Function Renames
|
||||||
|
|
||||||
|
```
|
||||||
|
extractor.NewPlayWrightBrowser(opts) -> extractor.NewBrowser(ctx, opts)
|
||||||
|
extractor.PlayWrightBrowserOptions -> extractor.BrowserOptions
|
||||||
|
extractor.PlayWrightBrowserSelection -> extractor.BrowserSelection
|
||||||
|
|
||||||
|
extractor.PlayWrightBrowserSelectionChromium -> extractor.BrowserChromium
|
||||||
|
extractor.PlayWrightBrowserSelectionFirefox -> extractor.BrowserFirefox
|
||||||
|
extractor.PlayWrightBrowserSelectionWebKit -> extractor.BrowserWebKit
|
||||||
|
```
|
||||||
|
|
||||||
|
## Field Renames (inside BrowserOptions)
|
||||||
|
|
||||||
|
```
|
||||||
|
.PlayWrightServerAddress -> .ServerAddress
|
||||||
|
.DontLaunchOnConnectFailure -> .RequireServer
|
||||||
|
```
|
||||||
|
|
||||||
|
The `RequireServer` field is semantically identical to `DontLaunchOnConnectFailure`:
|
||||||
|
|
||||||
|
- Old: `DontLaunchOnConnectFailure: true` meant "return an error if the connection to the server fails, instead of launching a local browser"
|
||||||
|
- New: `RequireServer: true` means the same thing
|
||||||
|
|
||||||
|
## New Helper
|
||||||
|
|
||||||
|
```go
|
||||||
|
extractor.DeferClose(closer)
|
||||||
|
```
|
||||||
|
|
||||||
|
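`DeferClose` is a nil-safe helper intended for `defer` statements: it closes the given `io.Closer` and ignores the returned error. It replaces the `deferClose` functions that were previously copy-pasted across packages.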
|
||||||
29
article_test.go
Normal file
29
article_test.go
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestArticle_ZeroValue(t *testing.T) {
|
||||||
|
var a Article
|
||||||
|
if a.Title != "" || a.Content != "" || a.Length != 0 {
|
||||||
|
t.Error("zero-value Article should have empty fields")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestArticle_FieldAssignment(t *testing.T) {
|
||||||
|
a := Article{
|
||||||
|
Title: "Test Title",
|
||||||
|
Content: "<p>hello</p>",
|
||||||
|
TextContent: "hello",
|
||||||
|
Length: 5,
|
||||||
|
Excerpt: "hello",
|
||||||
|
Byline: "Author",
|
||||||
|
SiteName: "Example",
|
||||||
|
Lang: "en",
|
||||||
|
}
|
||||||
|
if a.Title != "Test Title" {
|
||||||
|
t.Errorf("Title = %q, want %q", a.Title, "Test Title")
|
||||||
|
}
|
||||||
|
if a.Length != 5 {
|
||||||
|
t.Errorf("Length = %d, want 5", a.Length)
|
||||||
|
}
|
||||||
|
}
|
||||||
160
browser_init.go
Normal file
160
browser_init.go
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/playwright-community/playwright-go"
|
||||||
|
)
|
||||||
|
|
||||||
|
// browserInitResult holds the result of shared browser initialization.
|
||||||
|
type browserInitResult struct {
|
||||||
|
pw *playwright.Playwright
|
||||||
|
browser playwright.Browser
|
||||||
|
bctx playwright.BrowserContext
|
||||||
|
opt BrowserOptions
|
||||||
|
}
|
||||||
|
|
||||||
|
// initBrowser performs the shared browser initialization steps:
|
||||||
|
// start Playwright, select browser type, connect or launch, create context, load cookies.
|
||||||
|
func initBrowser(opt BrowserOptions) (*browserInitResult, error) {
|
||||||
|
pw, err := playwright.Run()
|
||||||
|
if err != nil {
|
||||||
|
err = playwright.Install()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to install playwright: %w", err)
|
||||||
|
}
|
||||||
|
pw, err = playwright.Run()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to start playwright: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var bt playwright.BrowserType
|
||||||
|
switch opt.Browser {
|
||||||
|
case BrowserChromium:
|
||||||
|
bt = pw.Chromium
|
||||||
|
if opt.ServerAddress == "" {
|
||||||
|
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM")
|
||||||
|
}
|
||||||
|
case BrowserFirefox:
|
||||||
|
bt = pw.Firefox
|
||||||
|
if opt.ServerAddress == "" {
|
||||||
|
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_FIREFOX")
|
||||||
|
}
|
||||||
|
case BrowserWebKit:
|
||||||
|
bt = pw.WebKit
|
||||||
|
if opt.ServerAddress == "" {
|
||||||
|
opt.ServerAddress = os.Getenv("PLAYWRIGHT_SERVER_ADDRESS_WEBKIT")
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return nil, ErrInvalidBrowserSelection
|
||||||
|
}
|
||||||
|
|
||||||
|
var browser playwright.Browser
|
||||||
|
launch := true
|
||||||
|
|
||||||
|
if opt.ServerAddress != "" && !opt.UseLocalOnly {
|
||||||
|
launch = false
|
||||||
|
slog.Info("connecting to playwright server", "address", opt.ServerAddress)
|
||||||
|
var timeout float64 = 30000
|
||||||
|
browser, err = bt.Connect(opt.ServerAddress, playwright.BrowserTypeConnectOptions{Timeout: &timeout})
|
||||||
|
if err != nil {
|
||||||
|
if opt.RequireServer {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
slog.Warn("failed to connect to playwright server, launching local browser", "err", err)
|
||||||
|
launch = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if launch {
|
||||||
|
browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
|
||||||
|
Headless: playwright.Bool(!opt.ShowBrowser),
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to launch browser: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var viewport *playwright.Size
|
||||||
|
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
||||||
|
viewport = &playwright.Size{
|
||||||
|
Width: opt.Dimensions.Width,
|
||||||
|
Height: opt.Dimensions.Height,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var scheme *playwright.ColorScheme
|
||||||
|
if opt.DarkMode {
|
||||||
|
scheme = playwright.ColorSchemeDark
|
||||||
|
} else {
|
||||||
|
scheme = playwright.ColorSchemeNoPreference
|
||||||
|
}
|
||||||
|
|
||||||
|
bctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
||||||
|
UserAgent: playwright.String(opt.UserAgent),
|
||||||
|
Viewport: viewport,
|
||||||
|
ColorScheme: scheme,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create browser context: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if opt.CookieJar != nil {
|
||||||
|
cookies, err := opt.CookieJar.GetAll()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
||||||
|
}
|
||||||
|
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
||||||
|
for i, c := range cookies {
|
||||||
|
pwCookies[i] = cookieToPlaywrightOptionalCookie(c)
|
||||||
|
}
|
||||||
|
if err := bctx.AddCookies(pwCookies); err != nil {
|
||||||
|
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &browserInitResult{
|
||||||
|
pw: pw,
|
||||||
|
browser: browser,
|
||||||
|
bctx: bctx,
|
||||||
|
opt: opt,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeOptions merges variadic BrowserOptions into a base set of defaults.
|
||||||
|
func mergeOptions(base BrowserOptions, opts []BrowserOptions) BrowserOptions {
|
||||||
|
for _, o := range opts {
|
||||||
|
if o.UserAgent != "" {
|
||||||
|
base.UserAgent = o.UserAgent
|
||||||
|
}
|
||||||
|
if o.Browser != "" {
|
||||||
|
base.Browser = o.Browser
|
||||||
|
}
|
||||||
|
if o.Timeout != nil {
|
||||||
|
base.Timeout = o.Timeout
|
||||||
|
}
|
||||||
|
if o.CookieJar != nil {
|
||||||
|
base.CookieJar = o.CookieJar
|
||||||
|
}
|
||||||
|
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
||||||
|
base.Dimensions = o.Dimensions
|
||||||
|
}
|
||||||
|
if o.DarkMode {
|
||||||
|
base.DarkMode = true
|
||||||
|
}
|
||||||
|
if o.ServerAddress != "" {
|
||||||
|
base.ServerAddress = o.ServerAddress
|
||||||
|
}
|
||||||
|
if o.RequireServer {
|
||||||
|
base.RequireServer = true
|
||||||
|
}
|
||||||
|
if o.UseLocalOnly {
|
||||||
|
base.UseLocalOnly = true
|
||||||
|
}
|
||||||
|
base.ShowBrowser = o.ShowBrowser
|
||||||
|
}
|
||||||
|
return base
|
||||||
|
}
|
||||||
11
close.go
Normal file
11
close.go
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import "io"
|
||||||
|
|
||||||
|
// DeferClose closes cl and discards whatever error Close returns. A nil
// interface value is ignored, which makes the function convenient to use
// directly in a defer statement.
func DeferClose(cl io.Closer) {
	if cl == nil {
		return
	}
	// Deliberately best-effort: the error is dropped.
	_ = cl.Close()
}
|
||||||
38
close_test.go
Normal file
38
close_test.go
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockCloser is a test double that records whether Close was called and
// returns a preconfigured error from it.
type mockCloser struct {
	closed bool  // set once Close has been called
	err    error // value returned by Close
}

// Close marks the mock as closed and returns the configured error.
func (mc *mockCloser) Close() error {
	mc.closed = true
	return mc.err
}
|
||||||
|
|
||||||
|
func TestDeferClose_Nil(t *testing.T) {
|
||||||
|
// Should not panic on nil.
|
||||||
|
DeferClose(nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeferClose_Valid(t *testing.T) {
|
||||||
|
m := &mockCloser{}
|
||||||
|
DeferClose(m)
|
||||||
|
if !m.closed {
|
||||||
|
t.Error("DeferClose did not call Close()")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeferClose_ErrorIgnored(t *testing.T) {
|
||||||
|
m := &mockCloser{err: errors.New("close error")}
|
||||||
|
// Should not panic even when Close returns an error.
|
||||||
|
DeferClose(m)
|
||||||
|
if !m.closed {
|
||||||
|
t.Error("DeferClose did not call Close()")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,7 +3,6 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/urfave/cli/v3"
|
"github.com/urfave/cli/v3"
|
||||||
@@ -12,9 +11,6 @@ import (
|
|||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||||
)
|
)
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
func main() {
|
func main() {
|
||||||
cmd := &cli.Command{
|
cmd := &cli.Command{
|
||||||
Name: "browser",
|
Name: "browser",
|
||||||
@@ -31,7 +27,7 @@ func main() {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(b)
|
defer extractor.DeferClose(b)
|
||||||
|
|
||||||
// now open the user specified url
|
// now open the user specified url
|
||||||
doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, target, extractor.OpenPageOptions{})
|
||||||
@@ -39,7 +35,7 @@ func main() {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
article, err := extractor.Readability(ctx, doc)
|
article, err := extractor.Readability(ctx, doc)
|
||||||
|
|
||||||
@@ -74,6 +70,7 @@ func main() {
|
|||||||
err := cmd.Run(context.Background(), os.Args)
|
err := cmd.Run(context.Background(), os.Args)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,8 +43,8 @@ var Flags = BrowserFlags{
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error) {
|
func FromCommand(ctx context.Context, cmd *cli.Command) (extractor.Browser, error) {
|
||||||
var opts extractor.PlayWrightBrowserOptions
|
var opts extractor.BrowserOptions
|
||||||
|
|
||||||
if ua := cmd.String("user-agent"); ua != "" {
|
if ua := cmd.String("user-agent"); ua != "" {
|
||||||
opts.UserAgent = ua
|
opts.UserAgent = ua
|
||||||
@@ -59,7 +59,7 @@ func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if b := cmd.String("browser"); b != "" {
|
if b := cmd.String("browser"); b != "" {
|
||||||
opts.Browser = extractor.PlayWrightBrowserSelection(b)
|
opts.Browser = extractor.BrowserSelection(b)
|
||||||
}
|
}
|
||||||
|
|
||||||
if cf := cmd.String("cookies-file"); cf != "" {
|
if cf := cmd.String("cookies-file"); cf != "" {
|
||||||
@@ -72,5 +72,5 @@ func FromCommand(_ context.Context, cmd *cli.Command) (extractor.Browser, error)
|
|||||||
|
|
||||||
opts.ShowBrowser = cmd.Bool("visible")
|
opts.ShowBrowser = cmd.Bool("visible")
|
||||||
|
|
||||||
return extractor.NewPlayWrightBrowser(opts)
|
return extractor.NewBrowser(ctx, opts)
|
||||||
}
|
}
|
||||||
|
|||||||
38
cookiejar.go
38
cookiejar.go
@@ -25,26 +25,28 @@ func (c Cookie) IsTargetMatch(target string) (bool, error) {
|
|||||||
// the host of the cookie is the same as the host of the target
|
// the host of the cookie is the same as the host of the target
|
||||||
// if the cookie host starts with a dot, that means it matches any subdomain
|
// if the cookie host starts with a dot, that means it matches any subdomain
|
||||||
if c.Host == u.Host || strings.HasPrefix(c.Host, ".") && strings.HasSuffix(u.Host, c.Host) {
|
if c.Host == u.Host || strings.HasPrefix(c.Host, ".") && strings.HasSuffix(u.Host, c.Host) {
|
||||||
if c.Path != "" {
|
if c.Path == "" {
|
||||||
if !strings.HasPrefix(u.Path, c.Path) {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// if the cookie path is a prefix of the target path, then it's a match
|
|
||||||
// so now these would both match:
|
|
||||||
// cookie path: /foo
|
|
||||||
// target path: /foo/bar
|
|
||||||
// cookie path: /foo
|
|
||||||
// target path: /foosball
|
|
||||||
// because foseball is not an actual match, we need to check to see that either the path is an exact match
|
|
||||||
// or that the next character in the target path is a slash
|
|
||||||
|
|
||||||
if len(u.Path) > len(c.Path) && u.Path[len(c.Path)] != '/' {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !strings.HasPrefix(u.Path, c.Path) {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// if the cookie path is a prefix of the target path, then it's a match
|
||||||
|
// so now these would both match:
|
||||||
|
// cookie path: /foo
|
||||||
|
// target path: /foo/bar
|
||||||
|
// cookie path: /foo
|
||||||
|
// target path: /foosball
|
||||||
|
// because /foosball is not an actual match, we need to check to see that either the path is an exact match
|
||||||
|
// or that the next character in the target path is a slash
|
||||||
|
|
||||||
|
if len(u.Path) > len(c.Path) && !strings.HasSuffix(c.Path, "/") && u.Path[len(c.Path)] != '/' {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return false, nil
|
return false, nil
|
||||||
|
|||||||
266
cookiejar_test.go
Normal file
266
cookiejar_test.go
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_ExactHost(t *testing.T) {
|
||||||
|
c := Cookie{Host: "example.com", Path: "/"}
|
||||||
|
match, err := c.IsTargetMatch("https://example.com/page")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
t.Error("expected match for exact host")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_DotPrefix(t *testing.T) {
|
||||||
|
c := Cookie{Host: ".example.com", Path: "/"}
|
||||||
|
match, err := c.IsTargetMatch("https://sub.example.com/page")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
t.Error("expected match for .example.com against sub.example.com")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_DotPrefix_NoFalsePositive(t *testing.T) {
|
||||||
|
c := Cookie{Host: ".example.com", Path: "/"}
|
||||||
|
match, err := c.IsTargetMatch("https://notexample.com/page")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if match {
|
||||||
|
t.Error("did not expect .example.com to match notexample.com")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_PathExact(t *testing.T) {
|
||||||
|
c := Cookie{Host: "example.com", Path: "/foo"}
|
||||||
|
match, err := c.IsTargetMatch("https://example.com/foo")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
t.Error("expected match for exact path /foo")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_PathPrefix(t *testing.T) {
|
||||||
|
c := Cookie{Host: "example.com", Path: "/foo"}
|
||||||
|
match, err := c.IsTargetMatch("https://example.com/foo/bar")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
t.Error("expected match for /foo prefix with /foo/bar")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_PathBoundary(t *testing.T) {
|
||||||
|
c := Cookie{Host: "example.com", Path: "/foo"}
|
||||||
|
match, err := c.IsTargetMatch("https://example.com/foosball")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if match {
|
||||||
|
t.Error("did not expect /foo to match /foosball")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_EmptyPath(t *testing.T) {
|
||||||
|
c := Cookie{Host: "example.com", Path: ""}
|
||||||
|
match, err := c.IsTargetMatch("https://example.com/anything")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
t.Error("expected empty path cookie to match any path")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_NoMatch(t *testing.T) {
|
||||||
|
c := Cookie{Host: "other.com", Path: "/"}
|
||||||
|
match, err := c.IsTargetMatch("https://example.com/page")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if match {
|
||||||
|
t.Error("did not expect other.com to match example.com")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCookie_IsTargetMatch_InvalidURL(t *testing.T) {
|
||||||
|
c := Cookie{Host: "example.com", Path: "/"}
|
||||||
|
_, err := c.IsTargetMatch("://invalid")
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for invalid URL")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStaticCookieJar_GetAll(t *testing.T) {
|
||||||
|
jar := &staticCookieJar{
|
||||||
|
Cookie{Host: "a.com", Name: "a", Value: "1"},
|
||||||
|
Cookie{Host: "b.com", Name: "b", Value: "2"},
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, err := jar.GetAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetAll() error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cookies) != 2 {
|
||||||
|
t.Errorf("GetAll() returned %d cookies, want 2", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStaticCookieJar_Get(t *testing.T) {
|
||||||
|
jar := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
Cookie{Host: "other.com", Path: "/", Name: "b", Value: "2"},
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, err := jar.Get("https://example.com/page")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Get() error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("Get() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
if cookies[0].Name != "a" {
|
||||||
|
t.Errorf("Get() cookie name = %q, want %q", cookies[0].Name, "a")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStaticCookieJar_Set_New(t *testing.T) {
|
||||||
|
jar := &staticCookieJar{}
|
||||||
|
err := jar.Set(Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Set() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("after Set, GetAll() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
if cookies[0].Value != "1" {
|
||||||
|
t.Errorf("cookie value = %q, want %q", cookies[0].Value, "1")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStaticCookieJar_Set_Update(t *testing.T) {
|
||||||
|
jar := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
}
|
||||||
|
err := jar.Set(Cookie{Host: "example.com", Path: "/", Name: "a", Value: "2"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Set() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("after update Set, GetAll() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
if cookies[0].Value != "2" {
|
||||||
|
t.Errorf("cookie value = %q, want %q", cookies[0].Value, "2")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStaticCookieJar_Delete(t *testing.T) {
|
||||||
|
jar := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
Cookie{Host: "other.com", Path: "/", Name: "b", Value: "2"},
|
||||||
|
}
|
||||||
|
err := jar.Delete(Cookie{Host: "example.com", Path: "/", Name: "a"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Delete() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("after Delete, GetAll() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
if cookies[0].Name != "b" {
|
||||||
|
t.Errorf("remaining cookie name = %q, want %q", cookies[0].Name, "b")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStaticCookieJar_Delete_NotFound(t *testing.T) {
|
||||||
|
jar := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
}
|
||||||
|
err := jar.Delete(Cookie{Host: "nonexistent.com", Path: "/", Name: "x"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Delete() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("after no-op Delete, GetAll() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadOnlyCookieJar_SetIsNoop(t *testing.T) {
|
||||||
|
inner := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
}
|
||||||
|
ro := ReadOnlyCookieJar{Jar: inner}
|
||||||
|
|
||||||
|
err := ro.Set(Cookie{Host: "example.com", Path: "/", Name: "new", Value: "val"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Set() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := inner.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("ReadOnlyCookieJar.Set should be noop, but inner jar has %d cookies", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadOnlyCookieJar_DeleteIsNoop(t *testing.T) {
|
||||||
|
inner := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
}
|
||||||
|
ro := ReadOnlyCookieJar{Jar: inner}
|
||||||
|
|
||||||
|
err := ro.Delete(Cookie{Host: "example.com", Path: "/", Name: "a"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Delete() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := inner.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("ReadOnlyCookieJar.Delete should be noop, but inner jar has %d cookies", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadOnlyCookieJar_GetAll(t *testing.T) {
|
||||||
|
inner := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
}
|
||||||
|
ro := ReadOnlyCookieJar{Jar: inner}
|
||||||
|
|
||||||
|
cookies, err := ro.GetAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetAll() error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("ReadOnlyCookieJar.GetAll() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadOnlyCookieJar_Get(t *testing.T) {
|
||||||
|
inner := &staticCookieJar{
|
||||||
|
Cookie{Host: "example.com", Path: "/", Name: "a", Value: "1"},
|
||||||
|
}
|
||||||
|
ro := ReadOnlyCookieJar{Jar: inner}
|
||||||
|
|
||||||
|
cookies, err := ro.Get("https://example.com/page")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Get() error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("ReadOnlyCookieJar.Get() returned %d cookies, want 1", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
189
cookies_txt_test.go
Normal file
189
cookies_txt_test.go
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// writeTempCookieFile writes content to a file named "cookies.txt" inside a
// fresh per-test temporary directory and returns the file's path. The test is
// failed immediately if the file cannot be written.
func writeTempCookieFile(t *testing.T, content string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), "cookies.txt")
	err := os.WriteFile(target, []byte(content), 0644)
	if err != nil {
		t.Fatalf("failed to write temp cookie file: %v", err)
	}
	return target
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_Valid(t *testing.T) {
|
||||||
|
content := ".example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("expected 1 cookie, got %d", len(cookies))
|
||||||
|
}
|
||||||
|
|
||||||
|
c := cookies[0]
|
||||||
|
if c.Host != ".example.com" {
|
||||||
|
t.Errorf("Host = %q, want %q", c.Host, ".example.com")
|
||||||
|
}
|
||||||
|
if !c.HttpOnly {
|
||||||
|
t.Error("HttpOnly = false, want true")
|
||||||
|
}
|
||||||
|
if c.Path != "/" {
|
||||||
|
t.Errorf("Path = %q, want %q", c.Path, "/")
|
||||||
|
}
|
||||||
|
if c.Secure {
|
||||||
|
t.Error("Secure = true, want false")
|
||||||
|
}
|
||||||
|
if c.Name != "session" {
|
||||||
|
t.Errorf("Name = %q, want %q", c.Name, "session")
|
||||||
|
}
|
||||||
|
if c.Value != "abc123" {
|
||||||
|
t.Errorf("Value = %q, want %q", c.Value, "abc123")
|
||||||
|
}
|
||||||
|
if c.Expires.Unix() != 1700000000 {
|
||||||
|
t.Errorf("Expires = %d, want 1700000000", c.Expires.Unix())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_Comments(t *testing.T) {
|
||||||
|
content := "# This is a comment\n.example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("expected 1 cookie (comment skipped), got %d", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_EmptyLines(t *testing.T) {
|
||||||
|
content := "\n\n.example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("expected 1 cookie (empty lines skipped), got %d", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_ShortLines(t *testing.T) {
|
||||||
|
content := "too\tfew\tfields\n.example.com\tTRUE\t/\tFALSE\t1700000000\tsession\tabc123\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Errorf("expected 1 cookie (short line skipped), got %d", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_InvalidExpiry(t *testing.T) {
|
||||||
|
content := ".example.com\tTRUE\t/\tFALSE\tnotanumber\tsession\tabc123\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 1 {
|
||||||
|
t.Fatalf("expected 1 cookie, got %d", len(cookies))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should have a default expiry ~180 days from now
|
||||||
|
now := time.Now()
|
||||||
|
expected := now.Add(180 * 24 * time.Hour)
|
||||||
|
diff := cookies[0].Expires.Sub(expected)
|
||||||
|
if diff < -time.Minute || diff > time.Minute {
|
||||||
|
t.Errorf("invalid expiry default: got %v, expected ~%v", cookies[0].Expires, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_HttpOnly(t *testing.T) {
|
||||||
|
content := ".example.com\tTRUE\t/\tFALSE\t1700000000\ta\t1\n.other.com\tFALSE\t/\tFALSE\t1700000000\tb\t2\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 2 {
|
||||||
|
t.Fatalf("expected 2 cookies, got %d", len(cookies))
|
||||||
|
}
|
||||||
|
|
||||||
|
if !cookies[0].HttpOnly {
|
||||||
|
t.Error("first cookie HttpOnly = false, want true")
|
||||||
|
}
|
||||||
|
if cookies[1].HttpOnly {
|
||||||
|
t.Error("second cookie HttpOnly = true, want false")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_Secure(t *testing.T) {
|
||||||
|
content := ".example.com\tFALSE\t/\tTRUE\t1700000000\ta\t1\n.other.com\tFALSE\t/\tFALSE\t1700000000\tb\t2\n"
|
||||||
|
path := writeTempCookieFile(t, content)
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 2 {
|
||||||
|
t.Fatalf("expected 2 cookies, got %d", len(cookies))
|
||||||
|
}
|
||||||
|
|
||||||
|
if !cookies[0].Secure {
|
||||||
|
t.Error("first cookie Secure = false, want true")
|
||||||
|
}
|
||||||
|
if cookies[1].Secure {
|
||||||
|
t.Error("second cookie Secure = true, want false")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_NonexistentFile(t *testing.T) {
|
||||||
|
_, err := LoadCookiesFile("/nonexistent/path/cookies.txt")
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for nonexistent file")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadCookiesFile_Empty(t *testing.T) {
|
||||||
|
path := writeTempCookieFile(t, "")
|
||||||
|
|
||||||
|
jar, err := LoadCookiesFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadCookiesFile() error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies, _ := jar.GetAll()
|
||||||
|
if len(cookies) != 0 {
|
||||||
|
t.Errorf("expected 0 cookies from empty file, got %d", len(cookies))
|
||||||
|
}
|
||||||
|
}
|
||||||
28
document.go
28
document.go
@@ -25,30 +25,21 @@ type document struct {
|
|||||||
pw *playwright.Playwright
|
pw *playwright.Playwright
|
||||||
browser playwright.Browser
|
browser playwright.Browser
|
||||||
page playwright.Page
|
page playwright.Page
|
||||||
root playwright.ElementHandle
|
|
||||||
locator playwright.Locator
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
func newDocument(pw *playwright.Playwright, browser playwright.Browser, page playwright.Page) (Document, error) {
|
||||||
root, err := page.QuerySelector("html")
|
locator := page.Locator("html")
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
root2 := page.Locator("html")
|
|
||||||
|
|
||||||
res := &document{
|
res := &document{
|
||||||
node: node{
|
node: node{
|
||||||
locator: root2,
|
locator: locator,
|
||||||
},
|
},
|
||||||
pw: pw,
|
pw: pw,
|
||||||
browser: browser,
|
browser: browser,
|
||||||
page: page,
|
page: page,
|
||||||
root: root,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("new document", "url", page.URL(), "root", root, "locator", root2)
|
slog.Info("new document", "url", page.URL(), "locator", locator)
|
||||||
|
|
||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
@@ -78,21 +69,14 @@ func (d *document) Refresh() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
|
||||||
|
|
||||||
var f *float64 = nil
|
|
||||||
if timeout == nil {
|
if timeout == nil {
|
||||||
t := 30 * time.Second
|
t := 30 * time.Second
|
||||||
timeout = &t
|
timeout = &t
|
||||||
}
|
}
|
||||||
|
|
||||||
if timeout != nil {
|
ms := float64(timeout.Milliseconds())
|
||||||
ms := float64(timeout.Milliseconds())
|
return d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||||
f = &ms
|
|
||||||
}
|
|
||||||
|
|
||||||
err := d.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
||||||
State: playwright.LoadStateNetworkidle,
|
State: playwright.LoadStateNetworkidle,
|
||||||
Timeout: f,
|
Timeout: &ms,
|
||||||
})
|
})
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|||||||
10
go.mod
10
go.mod
@@ -1,12 +1,14 @@
|
|||||||
module gitea.stevedudenhoeffer.com/steve/go-extractor
|
module gitea.stevedudenhoeffer.com/steve/go-extractor
|
||||||
|
|
||||||
go 1.23.2
|
go 1.24.0
|
||||||
|
|
||||||
|
toolchain go1.24.1
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
|
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
|
||||||
github.com/playwright-community/playwright-go v0.5001.0
|
github.com/playwright-community/playwright-go v0.5200.0
|
||||||
github.com/urfave/cli/v3 v3.0.0-beta1
|
github.com/urfave/cli/v3 v3.0.0-beta1
|
||||||
golang.org/x/text v0.23.0
|
golang.org/x/text v0.29.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
@@ -17,5 +19,5 @@ require (
|
|||||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||||
github.com/go-stack/stack v1.8.1 // indirect
|
github.com/go-stack/stack v1.8.1 // indirect
|
||||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||||
golang.org/x/net v0.37.0 // indirect
|
golang.org/x/net v0.44.0 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
110
go.sum
Normal file
110
go.sum
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||||
|
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||||
|
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
|
||||||
|
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
|
||||||
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/deckarep/golang-set/v2 v2.8.0 h1:swm0rlPCmdWn9mESxKOjWk8hXSqoxOp+ZlfuyaAdFlQ=
|
||||||
|
github.com/deckarep/golang-set/v2 v2.8.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
|
||||||
|
github.com/go-jose/go-jose/v3 v3.0.4 h1:Wp5HA7bLQcKnf6YYao/4kpRpVMp/yf6+pJKV8WFSaNY=
|
||||||
|
github.com/go-jose/go-jose/v3 v3.0.4/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ=
|
||||||
|
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
|
||||||
|
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
|
||||||
|
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 h1:BYLNYdZaepitbZreRIa9xeCQZocWmy/wj4cGIH0qyw0=
|
||||||
|
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612/go.mod h1:wgqthQa8SAYs0yyljVeCOQlZ027VW5CmLsbi9jWC08c=
|
||||||
|
github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw=
|
||||||
|
github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4=
|
||||||
|
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
|
||||||
|
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
|
||||||
|
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
|
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
|
||||||
|
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
|
||||||
|
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
|
||||||
|
github.com/playwright-community/playwright-go v0.5200.0 h1:z/5LGuX2tBrg3ug1HupMXLjIG93f1d2MWdDsNhkMQ9c=
|
||||||
|
github.com/playwright-community/playwright-go v0.5200.0/go.mod h1:UnnyQZaqUOO5ywAZu60+N4EiWReUqX1MQBBA3Oofvf8=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||||
|
github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
|
||||||
|
github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0=
|
||||||
|
github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
|
||||||
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
|
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||||
|
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||||
|
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
|
github.com/urfave/cli/v3 v3.0.0-beta1 h1:6DTaaUarcM0wX7qj5Hcvs+5Dm3dyUTBbEwIWAjcw9Zg=
|
||||||
|
github.com/urfave/cli/v3 v3.0.0-beta1/go.mod h1:FnIeEMYu+ko8zP1F9Ypr3xkZMIDqW3DR92yUtY39q1Y=
|
||||||
|
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||||
|
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||||
|
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||||
|
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
|
||||||
|
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||||
|
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||||
|
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||||
|
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||||
|
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||||
|
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||||
|
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||||
|
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||||
|
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||||
|
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
|
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||||
|
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||||
|
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||||
|
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||||
|
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||||
|
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||||
|
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||||
|
golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I=
|
||||||
|
golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
|
||||||
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||||
|
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
|
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
|
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
|
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||||
|
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
|
||||||
|
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||||
|
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||||
|
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||||
|
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||||
|
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||||
|
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||||
|
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||||
|
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
||||||
|
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||||
|
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
|
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||||
|
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||||
|
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||||
|
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||||
|
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
|
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
|
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||||
|
golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
|
||||||
|
golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
|
||||||
|
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||||
|
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||||
|
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||||
|
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||||
|
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
|
||||||
|
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||||
|
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
222
interactive.go
Normal file
222
interactive.go
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/playwright-community/playwright-go"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InteractiveBrowser provides low-level page control for interactive browser sessions.
|
||||||
|
// Unlike Browser which is designed for scraping, InteractiveBrowser exposes mouse, keyboard,
|
||||||
|
// screenshot, and navigation APIs suitable for remote browser control.
|
||||||
|
type InteractiveBrowser interface {
|
||||||
|
// Navigate goes to the given URL and returns the final URL after any redirects.
|
||||||
|
Navigate(url string) (string, error)
|
||||||
|
// GoBack navigates back in history. Returns the final URL.
|
||||||
|
GoBack() (string, error)
|
||||||
|
// GoForward navigates forward in history. Returns the final URL.
|
||||||
|
GoForward() (string, error)
|
||||||
|
// URL returns the current page URL.
|
||||||
|
URL() string
|
||||||
|
|
||||||
|
// MouseClick clicks at the given coordinates with the specified button ("left", "middle", "right").
|
||||||
|
MouseClick(x, y float64, button string) error
|
||||||
|
// MouseMove moves the mouse to the given coordinates.
|
||||||
|
MouseMove(x, y float64) error
|
||||||
|
// MouseWheel scrolls by the given delta.
|
||||||
|
MouseWheel(deltaX, deltaY float64) error
|
||||||
|
|
||||||
|
// KeyboardType types the given text as if it were entered character by character.
|
||||||
|
KeyboardType(text string) error
|
||||||
|
// KeyboardPress presses a special key (e.g. "Enter", "Tab", "Backspace").
|
||||||
|
KeyboardPress(key string) error
|
||||||
|
// KeyboardInsertText inserts text directly into the focused element by dispatching
|
||||||
|
// only an input event (no keydown, keypress, or keyup). This is more reliable than
|
||||||
|
// KeyboardType for pasting into password fields and custom input components.
|
||||||
|
KeyboardInsertText(text string) error
|
||||||
|
|
||||||
|
// Screenshot takes a full-page screenshot as JPEG with the given quality (0-100).
|
||||||
|
Screenshot(quality int) ([]byte, error)
|
||||||
|
|
||||||
|
// Cookies returns all cookies from the browser context.
|
||||||
|
Cookies() ([]Cookie, error)
|
||||||
|
|
||||||
|
// Close tears down the browser.
|
||||||
|
Close() error
|
||||||
|
}
|
||||||
|
|
||||||
|
type interactiveBrowser struct {
|
||||||
|
pw *playwright.Playwright
|
||||||
|
browser playwright.Browser
|
||||||
|
ctx playwright.BrowserContext
|
||||||
|
page playwright.Page
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewInteractiveBrowser creates a headless browser with a page ready for interactive control.
|
||||||
|
// The context is only used for cancellation during setup.
|
||||||
|
func NewInteractiveBrowser(ctx context.Context, opts ...BrowserOptions) (InteractiveBrowser, error) {
|
||||||
|
var thirtySeconds = 30 * time.Second
|
||||||
|
opt := mergeOptions(BrowserOptions{
|
||||||
|
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||||
|
Browser: BrowserChromium,
|
||||||
|
Timeout: &thirtySeconds,
|
||||||
|
Dimensions: Size{
|
||||||
|
Width: 1280,
|
||||||
|
Height: 720,
|
||||||
|
},
|
||||||
|
}, opts)
|
||||||
|
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
type result struct {
|
||||||
|
ib InteractiveBrowser
|
||||||
|
err error
|
||||||
|
}
|
||||||
|
|
||||||
|
ch := make(chan result, 1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
res, err := initBrowser(opt)
|
||||||
|
if err != nil {
|
||||||
|
ch <- result{nil, err}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
page, err := res.bctx.NewPage()
|
||||||
|
if err != nil {
|
||||||
|
ch <- result{nil, fmt.Errorf("failed to create page: %w", err)}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
ch <- result{
|
||||||
|
ib: &interactiveBrowser{
|
||||||
|
pw: res.pw,
|
||||||
|
browser: res.browser,
|
||||||
|
ctx: res.bctx,
|
||||||
|
page: page,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return nil, ctx.Err()
|
||||||
|
case r := <-ch:
|
||||||
|
return r.ib, r.err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) Navigate(url string) (string, error) {
|
||||||
|
_, err := ib.page.Goto(url, playwright.PageGotoOptions{
|
||||||
|
WaitUntil: playwright.WaitUntilStateLoad,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("navigation failed: %w", err)
|
||||||
|
}
|
||||||
|
return ib.page.URL(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) GoBack() (string, error) {
|
||||||
|
_, err := ib.page.GoBack()
|
||||||
|
if err != nil {
|
||||||
|
return ib.page.URL(), fmt.Errorf("go back failed: %w", err)
|
||||||
|
}
|
||||||
|
return ib.page.URL(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) GoForward() (string, error) {
|
||||||
|
_, err := ib.page.GoForward()
|
||||||
|
if err != nil {
|
||||||
|
return ib.page.URL(), fmt.Errorf("go forward failed: %w", err)
|
||||||
|
}
|
||||||
|
return ib.page.URL(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) URL() string {
|
||||||
|
return ib.page.URL()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) MouseClick(x, y float64, button string) error {
|
||||||
|
var btn *playwright.MouseButton
|
||||||
|
switch button {
|
||||||
|
case "right":
|
||||||
|
btn = playwright.MouseButtonRight
|
||||||
|
case "middle":
|
||||||
|
btn = playwright.MouseButtonMiddle
|
||||||
|
default:
|
||||||
|
btn = playwright.MouseButtonLeft
|
||||||
|
}
|
||||||
|
return ib.page.Mouse().Click(x, y, playwright.MouseClickOptions{Button: btn})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) MouseMove(x, y float64) error {
|
||||||
|
return ib.page.Mouse().Move(x, y)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) MouseWheel(deltaX, deltaY float64) error {
|
||||||
|
return ib.page.Mouse().Wheel(deltaX, deltaY)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) KeyboardType(text string) error {
|
||||||
|
return ib.page.Keyboard().Type(text)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) KeyboardPress(key string) error {
|
||||||
|
return ib.page.Keyboard().Press(key)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) KeyboardInsertText(text string) error {
|
||||||
|
return ib.page.Keyboard().InsertText(text)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) Screenshot(quality int) ([]byte, error) {
|
||||||
|
return ib.page.Screenshot(playwright.PageScreenshotOptions{
|
||||||
|
Type: playwright.ScreenshotTypeJpeg,
|
||||||
|
Quality: playwright.Int(quality),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) Cookies() ([]Cookie, error) {
|
||||||
|
pwCookies, err := ib.ctx.Cookies()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get cookies: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookies := make([]Cookie, len(pwCookies))
|
||||||
|
for i, c := range pwCookies {
|
||||||
|
cookies[i] = playwrightCookieToCookie(c)
|
||||||
|
}
|
||||||
|
return cookies, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ib *interactiveBrowser) Close() error {
|
||||||
|
var errs []error
|
||||||
|
if ib.page != nil {
|
||||||
|
if err := ib.page.Close(); err != nil {
|
||||||
|
errs = append(errs, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ib.ctx != nil {
|
||||||
|
if err := ib.ctx.Close(); err != nil {
|
||||||
|
errs = append(errs, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ib.browser != nil {
|
||||||
|
if err := ib.browser.Close(); err != nil {
|
||||||
|
errs = append(errs, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ib.pw != nil {
|
||||||
|
if err := ib.pw.Stop(); err != nil {
|
||||||
|
errs = append(errs, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(errs) > 0 {
|
||||||
|
return fmt.Errorf("errors during close: %v", errs)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
16
mock_test.go
Normal file
16
mock_test.go
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// mockDocument implements the Document interface for testing without Playwright.
|
||||||
|
type mockDocument struct {
|
||||||
|
mockNode
|
||||||
|
url string
|
||||||
|
content string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m mockDocument) URL() string { return m.url }
|
||||||
|
func (m mockDocument) Refresh() error { return nil }
|
||||||
|
func (m mockDocument) Content() (string, error) { return m.content, nil }
|
||||||
|
func (m mockDocument) Close() error { return nil }
|
||||||
|
func (m mockDocument) WaitForNetworkIdle(_ *time.Duration) error { return nil }
|
||||||
23
node_test.go
Normal file
23
node_test.go
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestEscapeJavaScript(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"hello", "hello"},
|
||||||
|
{"it's", "it\\'s"},
|
||||||
|
{`back\slash`, `back\\slash`},
|
||||||
|
{`both\'`, `both\\\'`},
|
||||||
|
{"", ""},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
got := escapeJavaScript(tt.input)
|
||||||
|
if got != tt.want {
|
||||||
|
t.Errorf("escapeJavaScript(%q) = %q, want %q", tt.input, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
3
nodes.go
3
nodes.go
@@ -13,6 +13,9 @@ func (n Nodes) Select(selector string) Nodes {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (d Nodes) First() Node {
|
func (d Nodes) First() Node {
|
||||||
|
if len(d) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
return d[0]
|
return d[0]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
111
nodes_test.go
Normal file
111
nodes_test.go
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockNode implements the Node interface for testing.
|
||||||
|
type mockNode struct {
|
||||||
|
text string
|
||||||
|
textErr error
|
||||||
|
content string
|
||||||
|
children Nodes
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m mockNode) Content() (string, error) { return m.content, nil }
|
||||||
|
func (m mockNode) Text() (string, error) { return m.text, m.textErr }
|
||||||
|
func (m mockNode) Attr(_ string) (string, error) { return "", nil }
|
||||||
|
func (m mockNode) Screenshot() ([]byte, error) { return nil, nil }
|
||||||
|
func (m mockNode) Type(_ string) error { return nil }
|
||||||
|
func (m mockNode) Click() error { return nil }
|
||||||
|
func (m mockNode) Select(_ string) Nodes { return m.children }
|
||||||
|
func (m mockNode) SelectFirst(_ string) Node { return m.children.First() }
|
||||||
|
func (m mockNode) ForEach(_ string, _ func(Node) error) error { return nil }
|
||||||
|
func (m mockNode) SetHidden(_ bool) error { return nil }
|
||||||
|
func (m mockNode) SetAttribute(_, _ string) error { return nil }
|
||||||
|
|
||||||
|
func TestNodes_First_Empty(t *testing.T) {
|
||||||
|
var nodes Nodes
|
||||||
|
got := nodes.First()
|
||||||
|
if got != nil {
|
||||||
|
t.Errorf("First() on empty Nodes = %v, want nil", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNodes_First_NonEmpty(t *testing.T) {
|
||||||
|
n1 := mockNode{text: "first"}
|
||||||
|
n2 := mockNode{text: "second"}
|
||||||
|
nodes := Nodes{n1, n2}
|
||||||
|
|
||||||
|
got := nodes.First()
|
||||||
|
if got == nil {
|
||||||
|
t.Fatal("First() on non-empty Nodes returned nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
text, _ := got.Text()
|
||||||
|
if text != "first" {
|
||||||
|
t.Errorf("First().Text() = %q, want %q", text, "first")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNodes_Select(t *testing.T) {
|
||||||
|
child1 := mockNode{text: "child1"}
|
||||||
|
child2 := mockNode{text: "child2"}
|
||||||
|
child3 := mockNode{text: "child3"}
|
||||||
|
|
||||||
|
n1 := mockNode{children: Nodes{child1, child2}}
|
||||||
|
n2 := mockNode{children: Nodes{child3}}
|
||||||
|
|
||||||
|
nodes := Nodes{n1, n2}
|
||||||
|
result := nodes.Select("anything")
|
||||||
|
|
||||||
|
if len(result) != 3 {
|
||||||
|
t.Errorf("Select() returned %d nodes, want 3", len(result))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNodes_Select_Empty(t *testing.T) {
|
||||||
|
var nodes Nodes
|
||||||
|
result := nodes.Select("anything")
|
||||||
|
if len(result) != 0 {
|
||||||
|
t.Errorf("Select() on empty Nodes returned %d nodes, want 0", len(result))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNodes_ExtractText(t *testing.T) {
|
||||||
|
n1 := mockNode{text: "hello"}
|
||||||
|
n2 := mockNode{text: "world"}
|
||||||
|
nodes := Nodes{n1, n2}
|
||||||
|
|
||||||
|
texts, err := nodes.ExtractText()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractText() error = %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(texts) != 2 || texts[0] != "hello" || texts[1] != "world" {
|
||||||
|
t.Errorf("ExtractText() = %v, want [hello world]", texts)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNodes_ExtractText_Error(t *testing.T) {
|
||||||
|
n1 := mockNode{text: "hello"}
|
||||||
|
n2 := mockNode{textErr: fmt.Errorf("text error")}
|
||||||
|
nodes := Nodes{n1, n2}
|
||||||
|
|
||||||
|
_, err := nodes.ExtractText()
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("ExtractText() expected error, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNodes_ExtractText_Empty(t *testing.T) {
|
||||||
|
var nodes Nodes
|
||||||
|
texts, err := nodes.ExtractText()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractText() error = %v", err)
|
||||||
|
}
|
||||||
|
if len(texts) != 0 {
|
||||||
|
t.Errorf("ExtractText() on empty = %v, want empty", texts)
|
||||||
|
}
|
||||||
|
}
|
||||||
203
playwright.go
203
playwright.go
@@ -4,9 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
@@ -24,7 +22,7 @@ type playWrightBrowser struct {
|
|||||||
|
|
||||||
var _ Browser = playWrightBrowser{}
|
var _ Browser = playWrightBrowser{}
|
||||||
|
|
||||||
type PlayWrightBrowserSelection string
|
type BrowserSelection string
|
||||||
|
|
||||||
var (
|
var (
|
||||||
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
|
ErrInvalidBrowserSelection = errors.New("invalid browser selection")
|
||||||
@@ -33,18 +31,18 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
PlayWrightBrowserSelectionChromium PlayWrightBrowserSelection = "chromium"
|
BrowserChromium BrowserSelection = "chromium"
|
||||||
PlayWrightBrowserSelectionFirefox PlayWrightBrowserSelection = "firefox"
|
BrowserFirefox BrowserSelection = "firefox"
|
||||||
PlayWrightBrowserSelectionWebKit PlayWrightBrowserSelection = "webkit"
|
BrowserWebKit BrowserSelection = "webkit"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Size struct {
|
type Size struct {
|
||||||
Width int
|
Width int
|
||||||
Height int
|
Height int
|
||||||
}
|
}
|
||||||
type PlayWrightBrowserOptions struct {
|
type BrowserOptions struct {
|
||||||
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
UserAgent string // If empty, defaults to "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"
|
||||||
Browser PlayWrightBrowserSelection // If unset defaults to Firefox.
|
Browser BrowserSelection // If unset defaults to Firefox.
|
||||||
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
Timeout *time.Duration // If unset defaults to 30 seconds timeout. If set to 0, no timeout
|
||||||
|
|
||||||
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
// CookieJar will, if set, load all cookies from the cookie jar into the browser and save all cookies from the
|
||||||
@@ -56,9 +54,16 @@ type PlayWrightBrowserOptions struct {
|
|||||||
Dimensions Size
|
Dimensions Size
|
||||||
DarkMode bool
|
DarkMode bool
|
||||||
|
|
||||||
// PlayWrightServerAddress is the address of a PlayWright server to connect to.
|
// ServerAddress is the address of a Playwright server to connect to.
|
||||||
// Defaults to the value of the environment variable PLAYWRIGHT_SERVER_ADDRESS.
|
// Defaults to the value of the environment variable PLAYWRIGHT_SERVER_ADDRESS.
|
||||||
PlayWrightServerAddress string
|
ServerAddress string
|
||||||
|
|
||||||
|
// RequireServer will, if set, return an error if the connection to the
|
||||||
|
// Playwright server fails instead of falling back to a local browser launch.
|
||||||
|
RequireServer bool
|
||||||
|
|
||||||
|
// UseLocalOnly will, if set, not connect to the Playwright server, and instead launch a local browser.
|
||||||
|
UseLocalOnly bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
func cookieToPlaywrightOptionalCookie(cookie Cookie) playwright.OptionalCookie {
|
||||||
@@ -83,139 +88,51 @@ func playwrightCookieToCookie(cookie playwright.Cookie) Cookie {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewPlayWrightBrowser(opts ...PlayWrightBrowserOptions) (Browser, error) {
|
func NewBrowser(ctx context.Context, opts ...BrowserOptions) (Browser, error) {
|
||||||
var thirtySeconds = 30 * time.Second
|
var thirtySeconds = 30 * time.Second
|
||||||
opt := PlayWrightBrowserOptions{
|
opt := mergeOptions(BrowserOptions{
|
||||||
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||||
Browser: PlayWrightBrowserSelectionFirefox,
|
Browser: BrowserFirefox,
|
||||||
Timeout: &thirtySeconds,
|
Timeout: &thirtySeconds,
|
||||||
DarkMode: false,
|
}, opts)
|
||||||
PlayWrightServerAddress: os.Getenv("PLAYWRIGHT_SERVER_ADDRESS"),
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, o := range opts {
|
if err := ctx.Err(); err != nil {
|
||||||
if o.UserAgent != "" {
|
|
||||||
opt.UserAgent = o.UserAgent
|
|
||||||
}
|
|
||||||
if o.Browser != "" {
|
|
||||||
opt.Browser = o.Browser
|
|
||||||
}
|
|
||||||
if o.Timeout != nil {
|
|
||||||
opt.Timeout = o.Timeout
|
|
||||||
}
|
|
||||||
if o.CookieJar != nil {
|
|
||||||
opt.CookieJar = o.CookieJar
|
|
||||||
}
|
|
||||||
if o.Dimensions.Width > 0 && o.Dimensions.Height > 0 {
|
|
||||||
opt.Dimensions = o.Dimensions
|
|
||||||
}
|
|
||||||
if o.DarkMode {
|
|
||||||
opt.DarkMode = true
|
|
||||||
}
|
|
||||||
|
|
||||||
opt.ShowBrowser = o.ShowBrowser
|
|
||||||
}
|
|
||||||
|
|
||||||
pw, err := playwright.Run()
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
err = playwright.Install()
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
pw, err = playwright.Run()
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var bt playwright.BrowserType
|
|
||||||
|
|
||||||
switch opt.Browser {
|
|
||||||
case PlayWrightBrowserSelectionChromium:
|
|
||||||
bt = pw.Chromium
|
|
||||||
|
|
||||||
case PlayWrightBrowserSelectionFirefox:
|
|
||||||
bt = pw.Firefox
|
|
||||||
|
|
||||||
case PlayWrightBrowserSelectionWebKit:
|
|
||||||
bt = pw.WebKit
|
|
||||||
|
|
||||||
default:
|
|
||||||
return nil, ErrInvalidBrowserSelection
|
|
||||||
}
|
|
||||||
var browser playwright.Browser
|
|
||||||
|
|
||||||
if opt.PlayWrightServerAddress != "" {
|
|
||||||
browser, err = bt.Connect(opt.PlayWrightServerAddress, playwright.BrowserTypeConnectOptions{})
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
browser, err = bt.Launch(playwright.BrowserTypeLaunchOptions{
|
|
||||||
Headless: playwright.Bool(!opt.ShowBrowser),
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var viewport *playwright.Size
|
|
||||||
if opt.Dimensions.Width > 0 && opt.Dimensions.Height > 0 {
|
|
||||||
viewport = &playwright.Size{
|
|
||||||
Width: opt.Dimensions.Width,
|
|
||||||
Height: opt.Dimensions.Height,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var scheme *playwright.ColorScheme
|
|
||||||
|
|
||||||
if opt.DarkMode {
|
|
||||||
scheme = playwright.ColorSchemeDark
|
|
||||||
} else {
|
|
||||||
scheme = playwright.ColorSchemeNoPreference
|
|
||||||
}
|
|
||||||
|
|
||||||
c, err := browser.NewContext(playwright.BrowserNewContextOptions{
|
|
||||||
UserAgent: playwright.String(opt.UserAgent),
|
|
||||||
Viewport: viewport,
|
|
||||||
ColorScheme: scheme,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if opt.CookieJar != nil {
|
type browserResult struct {
|
||||||
cookies, err := opt.CookieJar.GetAll()
|
browser Browser
|
||||||
if err != nil {
|
err error
|
||||||
return nil, fmt.Errorf("error getting cookies from cookie jar: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
pwCookies := make([]playwright.OptionalCookie, len(cookies))
|
|
||||||
|
|
||||||
for i, cookie := range cookies {
|
|
||||||
pwCookies[i] = cookieToPlaywrightOptionalCookie(cookie)
|
|
||||||
}
|
|
||||||
|
|
||||||
err = c.AddCookies(pwCookies)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error adding cookies to browser: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return playWrightBrowser{
|
resultCh := make(chan browserResult, 1)
|
||||||
pw: pw,
|
|
||||||
browser: browser,
|
go func() {
|
||||||
userAgent: opt.UserAgent,
|
res, err := initBrowser(opt)
|
||||||
timeout: *opt.Timeout,
|
if err != nil {
|
||||||
cookieJar: opt.CookieJar,
|
resultCh <- browserResult{nil, err}
|
||||||
ctx: c,
|
return
|
||||||
serverAddr: opt.PlayWrightServerAddress,
|
}
|
||||||
}, nil
|
|
||||||
|
resultCh <- browserResult{
|
||||||
|
browser: playWrightBrowser{
|
||||||
|
pw: res.pw,
|
||||||
|
browser: res.browser,
|
||||||
|
userAgent: res.opt.UserAgent,
|
||||||
|
timeout: *res.opt.Timeout,
|
||||||
|
cookieJar: res.opt.CookieJar,
|
||||||
|
ctx: res.bctx,
|
||||||
|
serverAddr: res.opt.ServerAddress,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return nil, ctx.Err()
|
||||||
|
case result := <-resultCh:
|
||||||
|
return result.browser, result.err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
|
func (b playWrightBrowser) updateCookies(_ context.Context, page playwright.Page) error {
|
||||||
@@ -261,10 +178,7 @@ func (b playWrightBrowser) openPage(_ context.Context, target string, opts OpenP
|
|||||||
|
|
||||||
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
slog.Info("opened document", "url", target, "status", resp.Status(), "request", resp.Request())
|
||||||
|
|
||||||
if resp.Status() != 200 {
|
if resp.Status() < 200 || resp.Status() >= 300 {
|
||||||
time.Sleep(999 * time.Hour * 24)
|
|
||||||
time.Sleep(25 * time.Second)
|
|
||||||
|
|
||||||
_ = page.Close()
|
_ = page.Close()
|
||||||
|
|
||||||
if resp.Status() == 404 {
|
if resp.Status() == 404 {
|
||||||
@@ -300,12 +214,9 @@ func (b playWrightBrowser) Close() error {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]byte, error) {
|
||||||
browser, err := NewPlayWrightBrowser(PlayWrightBrowserOptions{
|
browser, err := NewBrowser(ctx, BrowserOptions{
|
||||||
Timeout: &timeout,
|
Timeout: &timeout,
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -313,14 +224,14 @@ func Screenshot(ctx context.Context, target string, timeout time.Duration) ([]by
|
|||||||
return nil, fmt.Errorf("error creating browser: %w", err)
|
return nil, fmt.Errorf("error creating browser: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(browser)
|
defer DeferClose(browser)
|
||||||
|
|
||||||
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
doc, err := browser.Open(ctx, target, OpenPageOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error opening page: %w", err)
|
return nil, fmt.Errorf("error opening page: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(doc)
|
defer DeferClose(doc)
|
||||||
|
|
||||||
return doc.Screenshot()
|
return doc.Screenshot()
|
||||||
}
|
}
|
||||||
|
|||||||
72
readability_test.go
Normal file
72
readability_test.go
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestReadability_ValidHTML(t *testing.T) {
|
||||||
|
html := `<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Test Article</title></head>
|
||||||
|
<body>
|
||||||
|
<article>
|
||||||
|
<h1>Test Article</h1>
|
||||||
|
<p>This is a test article with enough content to be parsed by readability.
|
||||||
|
It needs to have a reasonable amount of text so the algorithm considers it
|
||||||
|
a valid article. Let us add several sentences to make sure this works
|
||||||
|
correctly. The readability library requires a minimum amount of content
|
||||||
|
to successfully extract an article from a page.</p>
|
||||||
|
<p>Here is another paragraph to add more content. We want to make sure
|
||||||
|
that the content is substantial enough for the readability algorithm to
|
||||||
|
consider this a valid article and extract the text properly.</p>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>`
|
||||||
|
|
||||||
|
doc := mockDocument{
|
||||||
|
url: "https://example.com/article",
|
||||||
|
content: html,
|
||||||
|
}
|
||||||
|
|
||||||
|
article, err := Readability(context.Background(), doc)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Readability() error = %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if article.Title != "Test Article" {
|
||||||
|
t.Errorf("Title = %q, want %q", article.Title, "Test Article")
|
||||||
|
}
|
||||||
|
|
||||||
|
if article.TextContent == "" {
|
||||||
|
t.Error("TextContent should not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadability_EmptyContent(t *testing.T) {
|
||||||
|
doc := mockDocument{
|
||||||
|
url: "https://example.com/empty",
|
||||||
|
content: "",
|
||||||
|
}
|
||||||
|
|
||||||
|
article, err := Readability(context.Background(), doc)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Readability() unexpected error = %v", err)
|
||||||
|
}
|
||||||
|
// Empty content should produce an empty article.
|
||||||
|
if article.Title != "" && article.TextContent != "" {
|
||||||
|
t.Error("expected empty article from empty content")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadability_InvalidURL(t *testing.T) {
|
||||||
|
doc := mockDocument{
|
||||||
|
url: "://invalid",
|
||||||
|
content: "<html><body><p>text</p></body></html>",
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := Readability(context.Background(), doc)
|
||||||
|
if err == nil {
|
||||||
|
t.Error("Readability() expected error for invalid URL, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,7 +4,6 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"net/url"
|
"net/url"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -27,11 +26,6 @@ type Item struct {
|
|||||||
Price float64
|
Price float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
func GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||||
return DefaultConfig.GetItemFromURL(ctx, b, u)
|
return DefaultConfig.GetItemFromURL(ctx, b, u)
|
||||||
}
|
}
|
||||||
@@ -57,18 +51,18 @@ func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.
|
|||||||
res.ID, _ = strconv.Atoi(a[3])
|
res.ID, _ = strconv.Atoi(a[3])
|
||||||
|
|
||||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return res, fmt.Errorf("failed to open page: %w", err)
|
return res, fmt.Errorf("failed to open page: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
names := doc.Select("h2.h4")
|
names := doc.Select(".h4")
|
||||||
|
|
||||||
if len(names) > 0 {
|
if len(names) > 0 {
|
||||||
res.Name, _ = names[0].Text()
|
res.Name, _ = names[0].Text()
|
||||||
}
|
}
|
||||||
|
|
||||||
prices := doc.Select("h4.h2")
|
prices := doc.Select(".h2")
|
||||||
|
|
||||||
if len(prices) > 0 {
|
if len(prices) > 0 {
|
||||||
priceStr, _ := prices[0].Text()
|
priceStr, _ := prices[0].Text()
|
||||||
|
|||||||
39
sites/aislegopher/aislegopher_test.go
Normal file
39
sites/aislegopher/aislegopher_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package aislegopher
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"net/url"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGetItemFromURL_InvalidHost(t *testing.T) {
|
||||||
|
u, _ := url.Parse("https://example.com/p/slug/123")
|
||||||
|
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||||
|
if err != ErrInvalidURL {
|
||||||
|
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetItemFromURL_InvalidPath_NoP(t *testing.T) {
|
||||||
|
u, _ := url.Parse("https://aislegopher.com/x/slug/123")
|
||||||
|
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||||
|
if err != ErrInvalidURL {
|
||||||
|
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetItemFromURL_InvalidPath_TooShort(t *testing.T) {
|
||||||
|
u, _ := url.Parse("https://aislegopher.com/p/slug")
|
||||||
|
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||||
|
if err != ErrInvalidURL {
|
||||||
|
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetItemFromURL_InvalidPath_TooLong(t *testing.T) {
|
||||||
|
u, _ := url.Parse("https://aislegopher.com/p/slug/123/extra")
|
||||||
|
_, err := GetItemFromURL(context.Background(), nil, u)
|
||||||
|
if err != ErrInvalidURL {
|
||||||
|
t.Errorf("expected ErrInvalidURL, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,10 +3,10 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/aislegopher"
|
||||||
"github.com/urfave/cli/v3"
|
"github.com/urfave/cli/v3"
|
||||||
@@ -22,11 +22,6 @@ func (f AisleGopherFlags) ToConfig(_ *cli.Command) aislegopher.Config {
|
|||||||
return res
|
return res
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func main() {
|
func main() {
|
||||||
var flags []cli.Flag
|
var flags []cli.Flag
|
||||||
flags = append(flags, browser.Flags...)
|
flags = append(flags, browser.Flags...)
|
||||||
@@ -44,7 +39,7 @@ func main() {
|
|||||||
return fmt.Errorf("failed to create browser: %w", err)
|
return fmt.Errorf("failed to create browser: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(b)
|
defer extractor.DeferClose(b)
|
||||||
|
|
||||||
arg := c.Args().First()
|
arg := c.Args().First()
|
||||||
|
|
||||||
@@ -72,6 +67,7 @@ func main() {
|
|||||||
err := cli.Run(context.Background(), os.Args)
|
err := cli.Run(context.Background(), os.Args)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -39,12 +38,6 @@ func (c Config) validate() Config {
|
|||||||
|
|
||||||
var DefaultConfig = Config{}
|
var DefaultConfig = Config{}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
|
// IsArchived checks if a url is archived. It returns the archived url if it is archived, or an empty string if it is not.
|
||||||
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
func (c Config) IsArchived(ctx context.Context, b extractor.Browser, target string) (extractor.Document, error) {
|
||||||
c = c.validate()
|
c = c.validate()
|
||||||
@@ -130,10 +123,9 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
|||||||
|
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
fmt.Println("context already done before entering the loop:", ctx.Err())
|
slog.Debug("context already done before entering the loop", "err", ctx.Err())
|
||||||
return nil, ctx.Err()
|
return nil, ctx.Err()
|
||||||
default:
|
default:
|
||||||
fmt.Println("context not done yet")
|
|
||||||
// Proceed with the loop
|
// Proceed with the loop
|
||||||
}
|
}
|
||||||
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
|
// now we are waiting for archive.ph to archive the page and redirect us to the archived page
|
||||||
@@ -141,6 +133,9 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
|||||||
// if the page path starts with /wip/ then we are still waiting
|
// if the page path starts with /wip/ then we are still waiting
|
||||||
// also periodically refresh the page just in case
|
// also periodically refresh the page just in case
|
||||||
|
|
||||||
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
keepGoing := true
|
keepGoing := true
|
||||||
for keepGoing {
|
for keepGoing {
|
||||||
select {
|
select {
|
||||||
@@ -148,14 +143,14 @@ func (c Config) Archive(ctx context.Context, b extractor.Browser, target string)
|
|||||||
slog.Info("context done")
|
slog.Info("context done")
|
||||||
keepGoing = false
|
keepGoing = false
|
||||||
|
|
||||||
case <-time.NewTicker(5 * time.Second).C:
|
case <-ticker.C:
|
||||||
archivedUrl, err := url.Parse(doc.URL())
|
archivedUrl, err := url.Parse(doc.URL())
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println("checking url:", archivedUrl.String())
|
slog.Debug("checking url", "url", archivedUrl.String())
|
||||||
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
|
// if the url is not the same as the endpoint, or the path does not start with /wip/ or /submit then we are done
|
||||||
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
|
if archivedUrl.Hostname() != endpoint.Hostname() || (!strings.HasPrefix(archivedUrl.Path, "/wip/") && !strings.HasPrefix(archivedUrl.Path, "/submit")) {
|
||||||
keepGoing = false
|
keepGoing = false
|
||||||
|
|||||||
37
sites/archive/archive_test.go
Normal file
37
sites/archive/archive_test.go
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
package archive
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestConfig_Validate_Defaults(t *testing.T) {
|
||||||
|
c := Config{}
|
||||||
|
c = c.validate()
|
||||||
|
|
||||||
|
if c.Endpoint != "https://archive.ph" {
|
||||||
|
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.ph")
|
||||||
|
}
|
||||||
|
if c.Timeout == nil {
|
||||||
|
t.Fatal("Timeout should not be nil after validate")
|
||||||
|
}
|
||||||
|
if *c.Timeout != 1*time.Hour {
|
||||||
|
t.Errorf("Timeout = %v, want %v", *c.Timeout, 1*time.Hour)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_Validate_Preserves(t *testing.T) {
|
||||||
|
timeout := 5 * time.Minute
|
||||||
|
c := Config{
|
||||||
|
Endpoint: "https://archive.org",
|
||||||
|
Timeout: &timeout,
|
||||||
|
}
|
||||||
|
c = c.validate()
|
||||||
|
|
||||||
|
if c.Endpoint != "https://archive.org" {
|
||||||
|
t.Errorf("Endpoint = %q, want %q", c.Endpoint, "https://archive.org")
|
||||||
|
}
|
||||||
|
if *c.Timeout != 5*time.Minute {
|
||||||
|
t.Errorf("Timeout = %v, want %v", *c.Timeout, 5*time.Minute)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -28,7 +28,7 @@ var Flags = ArchiveFlags{
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Config {
|
func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) (archive.Config, error) {
|
||||||
c := archive.DefaultConfig
|
c := archive.DefaultConfig
|
||||||
|
|
||||||
if e := cmd.String("endpoint"); e != "" {
|
if e := cmd.String("endpoint"); e != "" {
|
||||||
@@ -38,12 +38,12 @@ func (f ArchiveFlags) ToConfig(_ context.Context, cmd *cli.Command) archive.Conf
|
|||||||
if t := cmd.String("timeout"); t != "" {
|
if t := cmd.String("timeout"); t != "" {
|
||||||
d, err := time.ParseDuration(t)
|
d, err := time.ParseDuration(t)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
return c, fmt.Errorf("invalid timeout duration: %w", err)
|
||||||
}
|
}
|
||||||
c.Timeout = &d
|
c.Timeout = &d
|
||||||
}
|
}
|
||||||
|
|
||||||
return c
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -122,7 +122,8 @@ func main() {
|
|||||||
err := cli.Run(context.Background(), os.Args)
|
err := cli.Run(context.Background(), os.Args)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,12 +3,13 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/urfave/cli/v3"
|
|
||||||
"io"
|
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/urfave/cli/v3"
|
||||||
|
|
||||||
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
||||||
)
|
)
|
||||||
@@ -26,7 +27,7 @@ var Flags = DuckDuckGoFlags{
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) duckduckgo.Config {
|
func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) (duckduckgo.Config, error) {
|
||||||
var res = duckduckgo.DefaultConfig
|
var res = duckduckgo.DefaultConfig
|
||||||
|
|
||||||
if r := cmd.String("region"); r != "" {
|
if r := cmd.String("region"); r != "" {
|
||||||
@@ -42,17 +43,11 @@ func (f DuckDuckGoFlags) ToConfig(cmd *cli.Command) duckduckgo.Config {
|
|||||||
case "off":
|
case "off":
|
||||||
res.SafeSearch = duckduckgo.SafeSearchOff
|
res.SafeSearch = duckduckgo.SafeSearchOff
|
||||||
default:
|
default:
|
||||||
panic("invalid safe search value")
|
return res, fmt.Errorf("invalid safe search value: %s", s)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return res
|
return res, nil
|
||||||
}
|
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -66,8 +61,10 @@ func main() {
|
|||||||
Usage: "Search DuckDuckGo",
|
Usage: "Search DuckDuckGo",
|
||||||
Flags: flags,
|
Flags: flags,
|
||||||
Action: func(ctx context.Context, command *cli.Command) error {
|
Action: func(ctx context.Context, command *cli.Command) error {
|
||||||
c := Flags.ToConfig(command)
|
c, err := Flags.ToConfig(command)
|
||||||
defer deferClose(nil)
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
query := strings.TrimSpace(strings.Join(command.Args().Slice(), " "))
|
query := strings.TrimSpace(strings.Join(command.Args().Slice(), " "))
|
||||||
|
|
||||||
@@ -76,7 +73,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
b, err := browser.FromCommand(ctx, command)
|
b, err := browser.FromCommand(ctx, command)
|
||||||
defer deferClose(b)
|
defer extractor.DeferClose(b)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to create browser: %w", err)
|
return fmt.Errorf("failed to create browser: %w", err)
|
||||||
@@ -87,7 +84,7 @@ func main() {
|
|||||||
return fmt.Errorf("failed to open search: %w", err)
|
return fmt.Errorf("failed to open search: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(search)
|
defer extractor.DeferClose(search)
|
||||||
|
|
||||||
res := search.GetResults()
|
res := search.GetResults()
|
||||||
fmt.Println("Results:", res)
|
fmt.Println("Results:", res)
|
||||||
@@ -105,9 +102,8 @@ func main() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
err := cli.Run(context.Background(), os.Args)
|
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
if err != nil {
|
os.Exit(1)
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ package duckduckgo
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
|
||||||
@@ -71,12 +70,6 @@ type Result struct {
|
|||||||
Description string
|
Description string
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
|
func (c Config) OpenSearch(ctx context.Context, b extractor.Browser, query string) (SearchPage, error) {
|
||||||
u := c.ToSearchURL(query)
|
u := c.ToSearchURL(query)
|
||||||
|
|
||||||
@@ -97,7 +90,7 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
|||||||
|
|
||||||
slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
|
slog.Info("searching", "url", u, "query", query, "config", c, "browser", b)
|
||||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||||
|
|||||||
116
sites/duckduckgo/duckduckgo_test.go
Normal file
116
sites/duckduckgo/duckduckgo_test.go
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
package duckduckgo
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_Default(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff}
|
||||||
|
u := c.ToSearchURL("test query")
|
||||||
|
|
||||||
|
if u.Host != "duckduckgo.com" {
|
||||||
|
t.Errorf("Host = %q, want %q", u.Host, "duckduckgo.com")
|
||||||
|
}
|
||||||
|
|
||||||
|
if u.Query().Get("q") != "test query" {
|
||||||
|
t.Errorf("q = %q, want %q", u.Query().Get("q"), "test query")
|
||||||
|
}
|
||||||
|
|
||||||
|
if u.Query().Get("kp") != "-2" {
|
||||||
|
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "-2")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_SafeSearchOn(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOn}
|
||||||
|
u := c.ToSearchURL("test")
|
||||||
|
|
||||||
|
if u.Query().Get("kp") != "1" {
|
||||||
|
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "1")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_SafeSearchModerate(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchModerate}
|
||||||
|
u := c.ToSearchURL("test")
|
||||||
|
|
||||||
|
if u.Query().Get("kp") != "-1" {
|
||||||
|
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "-1")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_SafeSearchOff(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff}
|
||||||
|
u := c.ToSearchURL("test")
|
||||||
|
|
||||||
|
if u.Query().Get("kp") != "-2" {
|
||||||
|
t.Errorf("kp = %q, want %q", u.Query().Get("kp"), "-2")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_WithRegion(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff, Region: "us-en"}
|
||||||
|
u := c.ToSearchURL("test")
|
||||||
|
|
||||||
|
if u.Query().Get("kl") != "us-en" {
|
||||||
|
t.Errorf("kl = %q, want %q", u.Query().Get("kl"), "us-en")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_WithQuery(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff}
|
||||||
|
u := c.ToSearchURL("golang testing")
|
||||||
|
|
||||||
|
if u.Query().Get("q") != "golang testing" {
|
||||||
|
t.Errorf("q = %q, want %q", u.Query().Get("q"), "golang testing")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_Validate_DefaultsSafeSearch(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: 0}
|
||||||
|
c = c.validate()
|
||||||
|
|
||||||
|
if c.SafeSearch != SafeSearchOff {
|
||||||
|
t.Errorf("validate() SafeSearch = %d, want %d (SafeSearchOff)", c.SafeSearch, SafeSearchOff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_NoRegion(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff}
|
||||||
|
u := c.ToSearchURL("test")
|
||||||
|
|
||||||
|
if u.Query().Get("kl") != "" {
|
||||||
|
t.Errorf("kl should be empty when no region, got %q", u.Query().Get("kl"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_Scheme(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff}
|
||||||
|
u := c.ToSearchURL("test")
|
||||||
|
|
||||||
|
if u.Scheme != "https" {
|
||||||
|
t.Errorf("Scheme = %q, want %q", u.Scheme, "https")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_ToSearchURL_SpecialChars(t *testing.T) {
|
||||||
|
c := Config{SafeSearch: SafeSearchOff}
|
||||||
|
u := c.ToSearchURL("go lang & testing")
|
||||||
|
|
||||||
|
if u.Query().Get("q") != "go lang & testing" {
|
||||||
|
t.Errorf("q = %q, want %q", u.Query().Get("q"), "go lang & testing")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResult_ZeroValue(t *testing.T) {
|
||||||
|
var r Result
|
||||||
|
if r.URL != "" || r.Title != "" || r.Description != "" {
|
||||||
|
t.Error("zero-value Result should have empty fields")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDefaultConfig_SafeSearch(t *testing.T) {
|
||||||
|
if DefaultConfig.SafeSearch != SafeSearchOff {
|
||||||
|
t.Errorf("DefaultConfig.SafeSearch = %d, want %d", DefaultConfig.SafeSearch, SafeSearchOff)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,12 +3,12 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/urfave/cli/v3"
|
"github.com/urfave/cli/v3"
|
||||||
|
|
||||||
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/google"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/google"
|
||||||
)
|
)
|
||||||
@@ -42,12 +42,6 @@ func (f GoogleFlags) ToConfig(_ context.Context, cmd *cli.Command) google.Config
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
var flags []cli.Flag
|
var flags []cli.Flag
|
||||||
|
|
||||||
@@ -67,7 +61,7 @@ func main() {
|
|||||||
|
|
||||||
b, err := browser.FromCommand(ctx, cli)
|
b, err := browser.FromCommand(ctx, cli)
|
||||||
|
|
||||||
defer deferClose(b)
|
defer extractor.DeferClose(b)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -87,9 +81,8 @@ func main() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
err := cli.Run(context.Background(), os.Args)
|
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
if err != nil {
|
os.Exit(1)
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ package google
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"net/url"
|
"net/url"
|
||||||
|
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
@@ -48,23 +47,20 @@ type Result struct {
|
|||||||
Description string
|
Description string
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]Result, error) {
|
||||||
c = c.validate()
|
c = c.validate()
|
||||||
|
|
||||||
u, err := url.Parse(fmt.Sprintf("https://%s/search?q=%s", c.BaseURL, query))
|
u, err := url.Parse(fmt.Sprintf("https://%s/search", c.BaseURL))
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("invalid url: %w", err)
|
return nil, fmt.Errorf("invalid url: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vals := u.Query()
|
||||||
|
vals.Set("q", query)
|
||||||
|
|
||||||
if c.Language != "" {
|
if c.Language != "" {
|
||||||
u.Query().Set("hl", c.Language)
|
vals.Set("hl", c.Language)
|
||||||
}
|
}
|
||||||
|
|
||||||
if c.Country != "" {
|
if c.Country != "" {
|
||||||
@@ -84,17 +80,19 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
|||||||
}
|
}
|
||||||
|
|
||||||
if country != "" {
|
if country != "" {
|
||||||
u.Query().Set("cr", country)
|
vals.Set("cr", country)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
u.RawQuery = vals.Encode()
|
||||||
|
|
||||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to open url: %w", err)
|
return nil, fmt.Errorf("failed to open url: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
var res []Result
|
var res []Result
|
||||||
|
|
||||||
|
|||||||
39
sites/google/google_test.go
Normal file
39
sites/google/google_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package google
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestConfig_Validate_Defaults(t *testing.T) {
|
||||||
|
c := Config{}
|
||||||
|
c = c.validate()
|
||||||
|
|
||||||
|
if c.BaseURL != "google.com" {
|
||||||
|
t.Errorf("BaseURL = %q, want %q", c.BaseURL, "google.com")
|
||||||
|
}
|
||||||
|
if c.Language != "en" {
|
||||||
|
t.Errorf("Language = %q, want %q", c.Language, "en")
|
||||||
|
}
|
||||||
|
if c.Country != "us" {
|
||||||
|
t.Errorf("Country = %q, want %q", c.Country, "us")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_Validate_Preserves(t *testing.T) {
|
||||||
|
c := Config{
|
||||||
|
BaseURL: "google.co.uk",
|
||||||
|
Language: "fr",
|
||||||
|
Country: "uk",
|
||||||
|
}
|
||||||
|
c = c.validate()
|
||||||
|
|
||||||
|
if c.BaseURL != "google.co.uk" {
|
||||||
|
t.Errorf("BaseURL = %q, want %q", c.BaseURL, "google.co.uk")
|
||||||
|
}
|
||||||
|
if c.Language != "fr" {
|
||||||
|
t.Errorf("Language = %q, want %q", c.Language, "fr")
|
||||||
|
}
|
||||||
|
if c.Country != "uk" {
|
||||||
|
t.Errorf("Country = %q, want %q", c.Country, "uk")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -51,10 +51,8 @@ func main() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
err := cli.Run(context.Background(), os.Args)
|
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
if err != nil {
|
os.Exit(1)
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ package megamillions
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -33,12 +32,6 @@ type NextDrawing struct {
|
|||||||
Jackpot currency.Amount
|
Jackpot currency.Amount
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func netTicksToTime(t int64) time.Time {
|
func netTicksToTime(t int64) time.Time {
|
||||||
return time.Unix(0, t*100).Add(-621355968000000000)
|
return time.Unix(0, t*100).Add(-621355968000000000)
|
||||||
}
|
}
|
||||||
@@ -64,7 +57,6 @@ func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
|||||||
return nil, fmt.Errorf("failed to parse date: %w", err)
|
return nil, fmt.Errorf("failed to parse date: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println("ticks", ticks)
|
|
||||||
drawing.Date = netTicksToTime(ticks)
|
drawing.Date = netTicksToTime(ticks)
|
||||||
|
|
||||||
err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error {
|
err = doc.ForEach("ul.numbers li.ball", func(n extractor.Node) error {
|
||||||
@@ -199,23 +191,12 @@ func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, er
|
|||||||
|
|
||||||
numeric := numericOnly(txt)
|
numeric := numericOnly(txt)
|
||||||
|
|
||||||
set := false
|
|
||||||
if strings.Contains(txt, "Billion") {
|
if strings.Contains(txt, "Billion") {
|
||||||
amt := currency.USD.Amount(numeric * 1000000000)
|
nextDrawing.Jackpot = currency.USD.Amount(numeric * 1000000000)
|
||||||
nextDrawing.Jackpot = amt
|
|
||||||
set = true
|
|
||||||
} else if strings.Contains(txt, "Million") {
|
} else if strings.Contains(txt, "Million") {
|
||||||
amt := currency.USD.Amount(numeric * 1000000)
|
nextDrawing.Jackpot = currency.USD.Amount(numeric * 1000000)
|
||||||
nextDrawing.Jackpot = amt
|
|
||||||
set = true
|
|
||||||
} else {
|
} else {
|
||||||
amt := currency.USD.Amount(numeric)
|
nextDrawing.Jackpot = currency.USD.Amount(numeric)
|
||||||
nextDrawing.Jackpot = amt
|
|
||||||
set = true
|
|
||||||
}
|
|
||||||
|
|
||||||
if !set {
|
|
||||||
return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &nextDrawing, nil
|
return &nextDrawing, nil
|
||||||
@@ -230,7 +211,7 @@ func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing,
|
|||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
d, err := getDrawing(ctx, doc)
|
d, err := getDrawing(ctx, doc)
|
||||||
|
|
||||||
|
|||||||
73
sites/megamillions/megamillions_test.go
Normal file
73
sites/megamillions/megamillions_test.go
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
package megamillions
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNetTicksToTime_Consistency(t *testing.T) {
|
||||||
|
// netTicksToTime converts .NET ticks to Go time.
|
||||||
|
// Verify it produces consistent results for the same input.
|
||||||
|
ticks := int64(638396256000000000)
|
||||||
|
t1 := netTicksToTime(ticks)
|
||||||
|
t2 := netTicksToTime(ticks)
|
||||||
|
|
||||||
|
if !t1.Equal(t2) {
|
||||||
|
t.Errorf("netTicksToTime is not consistent: %v != %v", t1, t2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNetTicksToTime_Ordering(t *testing.T) {
|
||||||
|
// A larger ticks value should produce a later time.
|
||||||
|
earlier := netTicksToTime(638396256000000000)
|
||||||
|
later := netTicksToTime(638396256100000000) // 10 seconds later in ticks
|
||||||
|
|
||||||
|
if !later.After(earlier) {
|
||||||
|
t.Errorf("expected later ticks to produce later time: %v vs %v", earlier, later)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNetTicksToTime_DifferenceIsCorrect(t *testing.T) {
|
||||||
|
// .NET ticks are 100-nanosecond intervals.
|
||||||
|
// 10,000,000 ticks = 1 second.
|
||||||
|
ticks1 := int64(638396256000000000)
|
||||||
|
ticks2 := ticks1 + 10000000 // 1 second later
|
||||||
|
|
||||||
|
t1 := netTicksToTime(ticks1)
|
||||||
|
t2 := netTicksToTime(ticks2)
|
||||||
|
|
||||||
|
diff := t2.Sub(t1)
|
||||||
|
if diff != time.Second {
|
||||||
|
t.Errorf("expected 1 second difference, got %v", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNetTicksToTime_NotZero(t *testing.T) {
|
||||||
|
// Verify the function produces a non-zero time for typical ticks values.
|
||||||
|
ticks := int64(638396256000000000)
|
||||||
|
result := netTicksToTime(ticks)
|
||||||
|
|
||||||
|
if result.IsZero() {
|
||||||
|
t.Error("netTicksToTime should not return zero time for valid ticks")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConfig_Validate(t *testing.T) {
|
||||||
|
c := Config{}
|
||||||
|
c = c.validate()
|
||||||
|
_ = c // validate is a no-op, just verify no panic
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDrawing_ZeroValue(t *testing.T) {
|
||||||
|
var d Drawing
|
||||||
|
if d.MegaBall != 0 || d.Megaplier != 0 {
|
||||||
|
t.Error("zero-value Drawing should have zero fields")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNextDrawing_ZeroValue(t *testing.T) {
|
||||||
|
var nd NextDrawing
|
||||||
|
if nd.Date != "" {
|
||||||
|
t.Error("zero-value NextDrawing should have empty date")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -51,10 +51,8 @@ func main() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
err := cli.Run(context.Background(), os.Args)
|
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
if err != nil {
|
os.Exit(1)
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,14 +3,11 @@ package powerball
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
|
|
||||||
"golang.org/x/text/currency"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
@@ -30,19 +27,28 @@ type Drawing struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type NextDrawing struct {
|
type NextDrawing struct {
|
||||||
Date string
|
Date string
|
||||||
Jackpot currency.Amount
|
JackpotDollars int
|
||||||
}
|
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
func getDrawing(_ context.Context, doc extractor.Document) (*Drawing, error) {
|
||||||
var drawing Drawing
|
var drawing Drawing
|
||||||
|
|
||||||
|
dateNode := doc.SelectFirst("#numbers .title-date")
|
||||||
|
if dateNode == nil {
|
||||||
|
return nil, fmt.Errorf("failed to find date element")
|
||||||
|
}
|
||||||
|
|
||||||
|
dateStr, err := dateNode.Text()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get date text: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
drawing.Date, err = time.Parse("Mon, Jan 2, 2006", dateStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse date %q: %w", dateStr, err)
|
||||||
|
}
|
||||||
|
|
||||||
nums := doc.Select("div.game-ball-group div.white-balls")
|
nums := doc.Select("div.game-ball-group div.white-balls")
|
||||||
|
|
||||||
if len(nums) != 5 {
|
if len(nums) != 5 {
|
||||||
@@ -163,23 +169,12 @@ func getNextDrawing(_ context.Context, doc extractor.Document) (*NextDrawing, er
|
|||||||
|
|
||||||
numeric := numericOnly(txt)
|
numeric := numericOnly(txt)
|
||||||
|
|
||||||
set := false
|
|
||||||
if strings.Contains(txt, "Billion") {
|
if strings.Contains(txt, "Billion") {
|
||||||
amt := currency.USD.Amount(numeric * 1000000000)
|
nextDrawing.JackpotDollars = int(numeric * 1000000000)
|
||||||
nextDrawing.Jackpot = amt
|
|
||||||
set = true
|
|
||||||
} else if strings.Contains(txt, "Million") {
|
} else if strings.Contains(txt, "Million") {
|
||||||
amt := currency.USD.Amount(numeric * 1000000)
|
nextDrawing.JackpotDollars = int(numeric * 1000000)
|
||||||
nextDrawing.Jackpot = amt
|
|
||||||
set = true
|
|
||||||
} else {
|
} else {
|
||||||
amt := currency.USD.Amount(numeric)
|
nextDrawing.JackpotDollars = int(numeric)
|
||||||
nextDrawing.Jackpot = amt
|
|
||||||
set = true
|
|
||||||
}
|
|
||||||
|
|
||||||
if !set {
|
|
||||||
return nil, fmt.Errorf("failed to convert jackpot to currency: %w", err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &nextDrawing, nil
|
return &nextDrawing, nil
|
||||||
@@ -194,7 +189,7 @@ func (c Config) GetCurrent(ctx context.Context, b extractor.Browser) (*Drawing,
|
|||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
d, err := getDrawing(ctx, doc)
|
d, err := getDrawing(ctx, doc)
|
||||||
|
|
||||||
|
|||||||
34
sites/powerball/powerball_test.go
Normal file
34
sites/powerball/powerball_test.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package powerball
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestConfig_Validate(t *testing.T) {
|
||||||
|
c := Config{}
|
||||||
|
c = c.validate()
|
||||||
|
// validate is a no-op for powerball Config, just verify it doesn't panic.
|
||||||
|
_ = c
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDefaultConfig(t *testing.T) {
|
||||||
|
c := DefaultConfig
|
||||||
|
_ = c
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDrawing_ZeroValue(t *testing.T) {
|
||||||
|
var d Drawing
|
||||||
|
if d.PowerBall != 0 || d.PowerPlay != 0 {
|
||||||
|
t.Error("zero-value Drawing should have zero fields")
|
||||||
|
}
|
||||||
|
for i, n := range d.Numbers {
|
||||||
|
if n != 0 {
|
||||||
|
t.Errorf("Numbers[%d] = %d, want 0", i, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNextDrawing_ZeroValue(t *testing.T) {
|
||||||
|
var nd NextDrawing
|
||||||
|
if nd.Date != "" || nd.JackpotDollars != 0 {
|
||||||
|
t.Error("zero-value NextDrawing should have empty/zero fields")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -49,10 +49,8 @@ func main() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
err := cli.Run(context.Background(), os.Args)
|
if err := cli.Run(context.Background(), os.Args); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
if err != nil {
|
os.Exit(1)
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,8 +4,6 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
|
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -13,12 +11,6 @@ type Config struct{}
|
|||||||
|
|
||||||
var DefaultConfig = Config{}
|
var DefaultConfig = Config{}
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.Browser) (string, error) {
|
func GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.Browser) (string, error) {
|
||||||
return DefaultConfig.GetMostCommonDesktopUserAgent(ctx, b)
|
return DefaultConfig.GetMostCommonDesktopUserAgent(ctx, b)
|
||||||
}
|
}
|
||||||
@@ -30,7 +22,7 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
|||||||
return "", fmt.Errorf("failed to open useragents.me: %w", err)
|
return "", fmt.Errorf("failed to open useragents.me: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
s := doc.Select("#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea:nth-child(4)")
|
s := doc.Select("#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea:nth-child(4)")
|
||||||
|
|
||||||
text := ""
|
text := ""
|
||||||
@@ -44,8 +36,6 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
|||||||
}
|
}
|
||||||
data := []map[string]any{}
|
data := []map[string]any{}
|
||||||
|
|
||||||
fmt.Println("text", text)
|
|
||||||
|
|
||||||
err = json.Unmarshal([]byte(text), &data)
|
err = json.Unmarshal([]byte(text), &data)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -63,8 +53,12 @@ func (c Config) GetMostCommonDesktopUserAgent(ctx context.Context, b extractor.B
|
|||||||
}
|
}
|
||||||
|
|
||||||
if pct > highestPct {
|
if pct > highestPct {
|
||||||
|
ua, ok := agent["ua"].(string)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
highestPct = pct
|
highestPct = pct
|
||||||
highestAgent = agent["ua"].(string)
|
highestAgent = ua
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
9
sites/useragents/useragents_test.go
Normal file
9
sites/useragents/useragents_test.go
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
package useragents
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestDefaultConfig(t *testing.T) {
|
||||||
|
// DefaultConfig should be a zero-value Config.
|
||||||
|
c := DefaultConfig
|
||||||
|
_ = c // Just verify it exists and is usable.
|
||||||
|
}
|
||||||
@@ -3,10 +3,10 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/cmd/browser/pkg/browser"
|
||||||
|
|
||||||
"github.com/urfave/cli/v3"
|
"github.com/urfave/cli/v3"
|
||||||
@@ -14,12 +14,6 @@ import (
|
|||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/wegmans"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/wegmans"
|
||||||
)
|
)
|
||||||
|
|
||||||
func deferClose(cl io.Closer) {
|
|
||||||
if cl != nil {
|
|
||||||
_ = cl.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type WegmansFlags []cli.Flag
|
type WegmansFlags []cli.Flag
|
||||||
|
|
||||||
var Flags = WegmansFlags{}
|
var Flags = WegmansFlags{}
|
||||||
@@ -44,7 +38,7 @@ func main() {
|
|||||||
cfg := Flags.ToConfig(cmd)
|
cfg := Flags.ToConfig(cmd)
|
||||||
|
|
||||||
b, err := browser.FromCommand(ctx, cmd)
|
b, err := browser.FromCommand(ctx, cmd)
|
||||||
defer deferClose(b)
|
defer extractor.DeferClose(b)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error creating browser: %w", err)
|
return fmt.Errorf("error creating browser: %w", err)
|
||||||
@@ -73,9 +67,8 @@ func main() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
err := app.Run(context.Background(), os.Args)
|
if err := app.Run(context.Background(), os.Args); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
if err != nil {
|
os.Exit(1)
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ package wegmans
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"io"
|
"log/slog"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -29,12 +29,6 @@ type Item struct {
|
|||||||
Unit string
|
Unit string
|
||||||
}
|
}
|
||||||
|
|
||||||
func deferClose(c io.Closer) {
|
|
||||||
if c != nil {
|
|
||||||
_ = c.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.URL) (Item, error) {
|
||||||
|
|
||||||
if b == nil {
|
if b == nil {
|
||||||
@@ -67,7 +61,7 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
}
|
}
|
||||||
|
|
||||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||||
defer deferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Item{}, err
|
return Item{}, err
|
||||||
@@ -80,23 +74,28 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
ID: id,
|
ID: id,
|
||||||
}
|
}
|
||||||
|
|
||||||
titles := doc.Select("h1[data-test]")
|
titles := doc.Select("h1[data-testid]")
|
||||||
|
|
||||||
if len(titles) != 0 {
|
if len(titles) != 0 {
|
||||||
res.Name, _ = titles[0].Text()
|
res.Name, _ = titles[0].Text()
|
||||||
}
|
}
|
||||||
|
|
||||||
prices := doc.Select("span[data-test=\"amount\"] span:nth-child(1)")
|
prices := doc.Select("div.component--product-price:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(2)")
|
||||||
|
|
||||||
|
slog.Info("prices", "len", len(prices))
|
||||||
if len(prices) != 0 {
|
if len(prices) != 0 {
|
||||||
priceStr, _ := prices[0].Text()
|
priceStr, _ := prices[0].Text()
|
||||||
|
slog.Info("price", "0", prices[0], "text", priceStr)
|
||||||
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
||||||
priceStr = strings.ReplaceAll(priceStr, ",", "")
|
priceStr = strings.ReplaceAll(priceStr, ",", "")
|
||||||
|
// if there's a "/" in the price, then it's in the format of like "1.99/ea", so split it off
|
||||||
|
priceStr = strings.Split(priceStr, "/")[0]
|
||||||
price, _ := strconv.ParseFloat(priceStr, 64)
|
price, _ := strconv.ParseFloat(priceStr, 64)
|
||||||
|
slog.Info("price", "0", prices[0], "text", priceStr, "price", price)
|
||||||
res.Price = price
|
res.Price = price
|
||||||
}
|
}
|
||||||
|
|
||||||
unitPrices := doc.Select(`span[data-test="per-unit-price"]`)
|
unitPrices := doc.Select(`div.component--product-price:nth-child(1) span.price-per-unit`)
|
||||||
|
|
||||||
if len(unitPrices) != 0 {
|
if len(unitPrices) != 0 {
|
||||||
unitPriceStr, _ := unitPrices[0].Text()
|
unitPriceStr, _ := unitPrices[0].Text()
|
||||||
@@ -111,8 +110,15 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
if len(units) > 1 {
|
if len(units) > 1 {
|
||||||
res.Unit = strings.TrimSpace(units[1])
|
res.Unit = strings.TrimSpace(units[1])
|
||||||
res.UnitPrice, _ = strconv.ParseFloat(units[0], 64)
|
res.UnitPrice, _ = strconv.ParseFloat(units[0], 64)
|
||||||
|
|
||||||
|
// the unit might be like "lb.", so if it ends in a period, then just strip it off
|
||||||
|
if strings.HasSuffix(res.Unit, ".") {
|
||||||
|
res.Unit = strings.TrimSuffix(res.Unit, ".")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slog.Info("res", "res", res)
|
||||||
|
|
||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
|
|||||||
39
sites/wegmans/wegmans_test.go
Normal file
39
sites/wegmans/wegmans_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package wegmans
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"net/url"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGetItemPrice_NilBrowser(t *testing.T) {
|
||||||
|
u, _ := url.Parse("https://shop.wegmans.com/product/24921")
|
||||||
|
_, err := DefaultConfig.GetItemPrice(context.Background(), nil, u)
|
||||||
|
if err != ErrNilBrowser {
|
||||||
|
t.Errorf("expected ErrNilBrowser, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetItemPrice_NilURL(t *testing.T) {
|
||||||
|
// NilBrowser check comes before NilURL, so we can't test NilURL
|
||||||
|
// independently without a real browser. Verify the error sentinel exists.
|
||||||
|
if ErrNilURL.Error() != "url is nil" {
|
||||||
|
t.Errorf("ErrNilURL = %q, want %q", ErrNilURL.Error(), "url is nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetItemPrice_ErrorSentinels(t *testing.T) {
|
||||||
|
if ErrInvalidURL.Error() != "invalid url" {
|
||||||
|
t.Errorf("ErrInvalidURL = %q, want %q", ErrInvalidURL.Error(), "invalid url")
|
||||||
|
}
|
||||||
|
if ErrNilBrowser.Error() != "browser is nil" {
|
||||||
|
t.Errorf("ErrNilBrowser = %q, want %q", ErrNilBrowser.Error(), "browser is nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestItem_ZeroValue(t *testing.T) {
|
||||||
|
var item Item
|
||||||
|
if item.ID != 0 || item.Name != "" || item.Price != 0 || item.UnitPrice != 0 || item.Unit != "" {
|
||||||
|
t.Error("zero-value Item should have empty/zero fields")
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user