docs: add README.md and CLAUDE.md
Add project documentation: README.md with installation, usage examples, API reference, and project structure; and CLAUDE.md with developer guide, architecture overview, conventions, and issue label docs. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
253
README.md
Normal file
253
README.md
Normal file
@@ -0,0 +1,253 @@
|
||||
# go-extractor
|
||||
|
||||
A Go library for browser-based web scraping and content extraction, powered by [Playwright](https://playwright.dev/).
|
||||
|
||||
## Features
|
||||
|
||||
- **Browser automation** via Playwright (Chromium, Firefox, WebKit)
|
||||
- **Readability extraction** — extract article content from any page using Mozilla's readability algorithm
|
||||
- **Interactive browser control** — mouse, keyboard, screenshots for remote browser sessions
|
||||
- **Cookie management** — load/save cookies from `cookies.txt` files, read-only cookie jars
|
||||
- **Remote browser support** — connect to Playwright server instances or fall back to local browsers
|
||||
- **Site-specific extractors** for:
|
||||
- DuckDuckGo search (with pagination)
|
||||
- Google search
|
||||
- Powerball lottery results
|
||||
- Mega Millions lottery results
|
||||
- Wegmans grocery prices
|
||||
- AisleGopher grocery prices
|
||||
- archive.ph archival
|
||||
- useragents.me user-agent lookup
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
go get gitea.stevedudenhoeffer.com/steve/go-extractor
|
||||
```
|
||||
|
||||
The Playwright browsers must also be installed before first use:
|
||||
|
||||
```bash
|
||||
go run github.com/playwright-community/playwright-go/cmd/playwright install
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Extract article content from a URL
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
extractor "gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
func main() {
|
||||
ctx := context.Background()
|
||||
|
||||
browser, err := extractor.NewBrowser(ctx)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer browser.Close()
|
||||
|
||||
doc, err := browser.Open(ctx, "https://example.com/article", extractor.OpenPageOptions{})
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer doc.Close()
|
||||
|
||||
article, err := extractor.Readability(ctx, doc)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Println("Title:", article.Title)
|
||||
fmt.Println("Content:", article.TextContent)
|
||||
}
|
||||
```
|
||||
|
||||
### Take a screenshot
|
||||
|
||||
```go
|
||||
data, err := extractor.Screenshot(ctx, "https://example.com", 30*time.Second)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
os.WriteFile("screenshot.png", data, 0644)
|
||||
```
|
||||
|
||||
### Search DuckDuckGo
|
||||
|
||||
```go
|
||||
import "gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
||||
|
||||
results, err := duckduckgo.DefaultConfig.Search(ctx, browser, "golang web scraping")
|
||||
for _, r := range results {
|
||||
fmt.Printf("%s - %s\n", r.Title, r.URL)
|
||||
}
|
||||
```
|
||||
|
||||
### Use with a Playwright server
|
||||
|
||||
Set environment variables to connect to a remote Playwright instance:
|
||||
|
||||
```bash
|
||||
export PLAYWRIGHT_SERVER_ADDRESS_FIREFOX=ws://playwright-server:3000
|
||||
export PLAYWRIGHT_SERVER_ADDRESS_CHROMIUM=ws://playwright-server:3001
|
||||
```
|
||||
|
||||
Or pass the address directly:
|
||||
|
||||
```go
|
||||
browser, err := extractor.NewBrowser(ctx, extractor.BrowserOptions{
|
||||
ServerAddress: "ws://playwright-server:3000",
|
||||
RequireServer: true, // fail instead of falling back to local
|
||||
})
|
||||
```
|
||||
|
||||
## Browser Options
|
||||
|
||||
```go
|
||||
extractor.BrowserOptions{
|
||||
UserAgent: "custom-agent", // defaults to a recent Firefox UA
|
||||
Browser: extractor.BrowserFirefox, // or BrowserChromium, BrowserWebKit
|
||||
Timeout: &timeout, // default 30s, 0 for no timeout
|
||||
CookieJar: jar, // load/save cookies automatically
|
||||
ShowBrowser: true, // show browser window (non-headless)
|
||||
Dimensions: extractor.Size{1280, 720},
|
||||
DarkMode: true,
|
||||
ServerAddress: "ws://...", // remote Playwright server
|
||||
RequireServer: true, // don't fall back to local browser
|
||||
UseLocalOnly: true, // don't try remote server
|
||||
}
|
||||
```
|
||||
|
||||
## DOM Interaction
|
||||
|
||||
Documents and Nodes expose methods for querying and manipulating the DOM using CSS selectors:
|
||||
|
||||
```go
|
||||
// Select elements
|
||||
nodes := doc.Select("div.results a")
|
||||
first := doc.SelectFirst("h1")
|
||||
|
||||
// Extract text
|
||||
text, err := first.Text()
|
||||
content, err := first.Content()
|
||||
href, err := first.Attr("href")
|
||||
|
||||
// Interact
|
||||
err = first.Click()
|
||||
err = first.Type("hello world")
|
||||
|
||||
// Iterate
|
||||
err = doc.ForEach("li.item", func(n extractor.Node) error {
|
||||
text, _ := n.Text()
|
||||
fmt.Println(text)
|
||||
return nil
|
||||
})
|
||||
|
||||
// Modify
|
||||
err = first.SetHidden(true)
|
||||
err = first.SetAttribute("data-processed", "true")
|
||||
```
|
||||
|
||||
## Cookie Management
|
||||
|
||||
Load cookies from a Netscape `cookies.txt` file:
|
||||
|
||||
```go
|
||||
jar, err := extractor.LoadCookiesFile("cookies.txt")
|
||||
browser, err := extractor.NewBrowser(ctx, extractor.BrowserOptions{
|
||||
CookieJar: jar,
|
||||
})
|
||||
```
|
||||
|
||||
Use a read-only cookie jar (cookies are loaded but changes aren't saved back):
|
||||
|
||||
```go
|
||||
roJar := extractor.ReadOnlyCookieJar{Jar: jar}
|
||||
```
|
||||
|
||||
## Interactive Browser
|
||||
|
||||
For remote browser control with mouse/keyboard:
|
||||
|
||||
```go
|
||||
ib, err := extractor.NewInteractiveBrowser(ctx)
|
||||
defer ib.Close()
|
||||
|
||||
url, err := ib.Navigate("https://example.com")
|
||||
err = ib.MouseClick(100, 200, "left")
|
||||
err = ib.KeyboardType("search query")
|
||||
err = ib.KeyboardPress("Enter")
|
||||
screenshot, err := ib.Screenshot(80) // JPEG quality 0-100
|
||||
```
|
||||
|
||||
## Command-Line Tools
|
||||
|
||||
The `cmd/` and `sites/*/cmd/` directories contain CLI tools:
|
||||
|
||||
```bash
|
||||
# Extract article from URL
|
||||
go run ./cmd/browser https://example.com/article
|
||||
|
||||
# Search DuckDuckGo
|
||||
go run ./sites/duckduckgo/cmd/duckduckgo "golang tutorial"
|
||||
|
||||
# Search Google
|
||||
go run ./sites/google/cmd/google "golang tutorial"
|
||||
|
||||
# Get Powerball results
|
||||
go run ./sites/powerball/cmd/powerball
|
||||
|
||||
# Get Mega Millions results
|
||||
go run ./sites/megamillions/cmd/megamillions
|
||||
|
||||
# Archive a page
|
||||
go run ./sites/archive/cmd/archive https://example.com/page
|
||||
|
||||
# Get the most common user agent
|
||||
go run ./sites/useragents/cmd/useragents
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
go-extractor/
|
||||
├── article.go # Article struct (readability output)
|
||||
├── browser.go # Browser interface and Playwright implementation
|
||||
├── browser_init.go # Browser initialization and option merging
|
||||
├── close.go # DeferClose helper
|
||||
├── cookiejar.go # Cookie/CookieJar types and ReadOnlyCookieJar
|
||||
├── cookies_txt.go # cookies.txt file parser and staticCookieJar
|
||||
├── document.go # Document interface (page wrapper)
|
||||
├── interactive.go # InteractiveBrowser for remote control
|
||||
├── node.go # Node interface (DOM element wrapper)
|
||||
├── nodes.go # Nodes collection type
|
||||
├── playwright.go # Playwright browser implementation
|
||||
├── readability.go # Readability article extraction
|
||||
├── cmd/
|
||||
│ └── browser/ # CLI tool for article extraction
|
||||
├── sites/
|
||||
│ ├── aislegopher/ # AisleGopher price extraction
|
||||
│ ├── archive/ # archive.ph integration
|
||||
│ ├── duckduckgo/ # DuckDuckGo search
|
||||
│ ├── google/ # Google search
|
||||
│ ├── megamillions/ # Mega Millions lottery
|
||||
│ ├── powerball/ # Powerball lottery
|
||||
│ ├── useragents/ # useragents.me lookup
|
||||
│ └── wegmans/ # Wegmans price extraction
|
||||
└── *_test.go # Unit tests
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Go 1.24+
|
||||
- Playwright browsers installed (see [Installation](#installation) for the install command)
|
||||
- Optional: Playwright server for remote browser execution
|
||||
Reference in New Issue
Block a user