diff --git a/go.mod b/go.mod
index 3f0f183..f17b743 100644
--- a/go.mod
+++ b/go.mod
@@ -8,10 +8,11 @@ require (
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
github.com/playwright-community/playwright-go v0.5200.0
github.com/urfave/cli/v3 v3.0.0-beta1
- golang.org/x/text v0.29.0
+ golang.org/x/text v0.31.0
)
require (
+ github.com/PuerkitoBio/goquery v1.11.0
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
github.com/deckarep/golang-set/v2 v2.8.0 // indirect
@@ -19,5 +20,5 @@ require (
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
- golang.org/x/net v0.44.0 // indirect
+ golang.org/x/net v0.47.0 // indirect
)
diff --git a/go.sum b/go.sum
index 3cbc801..de29e4b 100644
--- a/go.sum
+++ b/go.sum
@@ -1,3 +1,5 @@
+github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
+github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
@@ -59,6 +61,8 @@ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I=
golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
+golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
+golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -97,6 +101,8 @@ golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
+golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
+golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
diff --git a/readability.go b/readability.go
index 74aff38..933dac4 100644
--- a/readability.go
+++ b/readability.go
@@ -3,25 +3,50 @@ package extractor
import (
"bytes"
"context"
+ "fmt"
"net/url"
+ "strings"
+ "github.com/PuerkitoBio/goquery"
"github.com/go-shiori/go-readability"
)
+// ReadabilityOptions configures the readability extraction process.
+//
+// The zero value is valid and leaves the document untouched:
+// ReadabilityWithOptions only rewrites the HTML when RemoveSelectors is
+// non-empty.
+type ReadabilityOptions struct {
+ // RemoveSelectors is a list of CSS selectors for elements to remove from
+ // the DOM before readability extraction. This is useful for stripping
+ // infinite-scroll content, related articles, or other elements that
+ // pollute the extracted article. Selectors are applied one after another
+ // in slice order; a nil or empty slice removes nothing.
+ RemoveSelectors []string
+}
+
+// Readability extracts article content from doc using the readability
+// algorithm with default options, i.e. without removing any elements first.
+// It is shorthand for ReadabilityWithOptions with a zero ReadabilityOptions.
+//
+// ctx is passed through to ReadabilityWithOptions instead of substituting
+// context.Background(), so the caller's context is not lost should the
+// extraction start honouring cancellation or deadlines.
-func Readability(_ context.Context, doc Document) (Article, error) {
+func Readability(ctx context.Context, doc Document) (Article, error) {
+	return ReadabilityWithOptions(ctx, doc, ReadabilityOptions{})
+}
+
+// ReadabilityWithOptions extracts article content from a document, applying
+// the provided options before extraction. Use RemoveSelectors to strip
+// elements (e.g. infinite-scroll articles) from the DOM before parsing.
+func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
data, err := doc.Content()
if err != nil {
return Article{}, err
}
u, err := url.Parse(doc.URL())
-
if err != nil {
return Article{}, err
}
- a, err := readability.FromReader(bytes.NewBufferString(data), u)
+ if len(opts.RemoveSelectors) > 0 {
+ data, err = removeSelectors(data, opts.RemoveSelectors)
+ if err != nil {
+ return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
+ }
+ }
+ a, err := readability.FromReader(bytes.NewBufferString(data), u)
if err != nil {
return Article{}, err
}
@@ -42,5 +67,23 @@ func Readability(_ context.Context, doc Document) (Article, error) {
Lang: a.Language,
PublishedTime: pubTime,
}, nil
-
+}
+
+// removeSelectors returns html with every element matched by any of the given
+// CSS selectors deleted. The markup is parsed once, pruned selector by
+// selector in slice order, and then rendered back to a string.
+//
+// An error is returned only when parsing or rendering fails.
+// NOTE(review): the result of Find is not inspected, so a selector that
+// matches nothing is indistinguishable from a successful removal.
+func removeSelectors(html string, selectors []string) (string, error) {
+	root, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if parseErr != nil {
+		return "", fmt.Errorf("failed to parse HTML: %w", parseErr)
+	}
+
+	for i := range selectors {
+		matched := root.Find(selectors[i])
+		matched.Remove()
+	}
+
+	cleaned, renderErr := root.Html()
+	if renderErr != nil {
+		return "", fmt.Errorf("failed to serialize HTML: %w", renderErr)
+	}
+	return cleaned, nil
}
diff --git a/readability_test.go b/readability_test.go
index 6033bfc..a5ca55f 100644
--- a/readability_test.go
+++ b/readability_test.go
@@ -2,6 +2,7 @@ package extractor
import (
"context"
+ "strings"
"testing"
)
@@ -70,3 +71,135 @@ func TestReadability_InvalidURL(t *testing.T) {
t.Error("Readability() expected error for invalid URL, got nil")
}
}
+
+// TestReadabilityWithOptions_RemoveSelectors verifies that elements matching
+// RemoveSelectors are stripped before extraction while the main article text
+// is still returned.
+func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) {
+	html := `<!DOCTYPE html>
+<html>
+<head><title>Main Article</title></head>
+<body>
+<article>
+<h1>Main Article</h1>
+<p>This is the main article content that we want to extract properly.
+It contains several sentences about the main topic of interest. The
+readability algorithm should pick this up as the primary content of
+the page without any interference from other elements.</p>
+<p>Here is a second paragraph with more relevant content about the
+main topic. This paragraph adds depth and detail to the article.</p>
+</article>
+<div class="infinite-scroll"><h2>Unrelated Article</h2><p>Unrelated Article text loaded by infinite scroll.</p></div>
+<aside class="sidebar"><p>Sidebar content</p></aside>
+</body>
+</html>`
+
+	// Guard against a vacuous test: the text we assert is absent from the
+	// output must be present in the input to begin with.
+	for _, marker := range []string{"Unrelated Article", "Sidebar content"} {
+		if !strings.Contains(html, marker) {
+			t.Fatalf("fixture is missing %q; removal assertions would be vacuous", marker)
+		}
+	}
+
+	doc := mockDocument{
+		url:     "https://example.com/article",
+		content: html,
+	}
+
+	article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{
+		RemoveSelectors: []string{".infinite-scroll", ".sidebar"},
+	})
+	if err != nil {
+		t.Fatalf("ReadabilityWithOptions() error = %v", err)
+	}
+
+	if article.TextContent == "" {
+		t.Fatal("TextContent should not be empty")
+	}
+
+	if strings.Contains(article.TextContent, "Unrelated Article") {
+		t.Error("TextContent should not contain content from removed .infinite-scroll element")
+	}
+
+	if strings.Contains(article.TextContent, "Sidebar content") {
+		t.Error("TextContent should not contain content from removed .sidebar element")
+	}
+
+	if !strings.Contains(article.TextContent, "main article content") {
+		t.Error("TextContent should still contain the main article content")
+	}
+}
+
+// TestReadabilityWithOptions_NoSelectors verifies that zero-value options
+// leave extraction unchanged: the title and text are still returned.
+func TestReadabilityWithOptions_NoSelectors(t *testing.T) {
+	html := `<!DOCTYPE html>
+<html>
+<head><title>Test Article</title></head>
+<body>
+<article>
+<h1>Test Article</h1>
+<p>This is a test article with enough content to be parsed by readability.
+It needs to have a reasonable amount of text so the algorithm considers it
+a valid article. Let us add several sentences to make sure this works
+correctly. The readability library requires a minimum amount of content
+to successfully extract an article from a page.</p>
+<p>Here is another paragraph to add more content. We want to make sure
+that the content is substantial enough for the readability algorithm to
+consider this a valid article and extract the text properly.</p>
+</article>
+</body>
+</html>`
+
+	doc := mockDocument{
+		url:     "https://example.com/article",
+		content: html,
+	}
+
+	// With empty options, should behave identically to Readability().
+	article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
+	if err != nil {
+		t.Fatalf("ReadabilityWithOptions() error = %v", err)
+	}
+
+	if article.Title != "Test Article" {
+		t.Errorf("Title = %q, want %q", article.Title, "Test Article")
+	}
+
+	if article.TextContent == "" {
+		t.Error("TextContent should not be empty")
+	}
+}
+
+// TestRemoveSelectors verifies that an element matching the selector is
+// dropped from the serialized HTML while its sibling is preserved.
+func TestRemoveSelectors(t *testing.T) {
+	html := `<html><body><div class="keep">Keep this</div><div class="remove">Remove this</div></body></html>`
+
+	result, err := removeSelectors(html, []string{".remove"})
+	if err != nil {
+		t.Fatalf("removeSelectors() error = %v", err)
+	}
+
+	if strings.Contains(result, "Remove this") {
+		t.Error("result should not contain removed element content")
+	}
+
+	if !strings.Contains(result, "Keep this") {
+		t.Error("result should still contain kept element content")
+	}
+}
+
+// TestRemoveSelectors_MultipleSelectors verifies that every selector in the
+// list is applied and that elements matching none of them are left in place.
+func TestRemoveSelectors_MultipleSelectors(t *testing.T) {
+	html := `<html><body><div class="a">A</div><div class="b">B</div><div class="c">C</div></body></html>`
+
+	result, err := removeSelectors(html, []string{".a", ".c"})
+	if err != nil {
+		t.Fatalf("removeSelectors() error = %v", err)
+	}
+
+	// The ">X<" form matches element text only, not the class attribute.
+	if strings.Contains(result, ">A<") {
+		t.Error("result should not contain element .a")
+	}
+	if strings.Contains(result, ">C<") {
+		t.Error("result should not contain element .c")
+	}
+	if !strings.Contains(result, ">B<") {
+		t.Error("result should still contain element .b")
+	}
+}