From c1a5814732812fd5dc42b983a60fcdce9f5bf3c9 Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Thu, 19 Feb 2026 01:09:28 +0000 Subject: [PATCH] feat: add ReadabilityWithOptions for DOM cleanup before extraction Sites with infinite scroll (e.g. The Verge) load additional articles into the DOM, which get included in readability extraction. Add ReadabilityOptions.RemoveSelectors to strip elements by CSS selector before parsing, avoiding the need to reimplement the readability pipeline downstream. Closes #60 Co-Authored-By: Claude Opus 4.6 --- go.mod | 5 +- go.sum | 6 ++ readability.go | 49 +++++++++++++++- readability_test.go | 133 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 188 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 3f0f183..f17b743 100644 --- a/go.mod +++ b/go.mod @@ -8,10 +8,11 @@ require ( github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 github.com/playwright-community/playwright-go v0.5200.0 github.com/urfave/cli/v3 v3.0.0-beta1 - golang.org/x/text v0.29.0 + golang.org/x/text v0.31.0 ) require ( + github.com/PuerkitoBio/goquery v1.11.0 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect @@ -19,5 +20,5 @@ require ( github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect - golang.org/x/net v0.44.0 // indirect + golang.org/x/net v0.47.0 // indirect ) diff --git a/go.sum b/go.sum index 3cbc801..de29e4b 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= @@ -59,6 +61,8 @@ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -97,6 +101,8 @@ golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= diff --git a/readability.go b/readability.go index 74aff38..933dac4 100644 --- a/readability.go +++ b/readability.go @@ -3,25 +3,50 @@ package extractor import ( "bytes" "context" + "fmt" "net/url" + "strings" + "github.com/PuerkitoBio/goquery" "github.com/go-shiori/go-readability" ) +// ReadabilityOptions configures the readability extraction process. +type ReadabilityOptions struct { + // RemoveSelectors is a list of CSS selectors for elements to remove from + // the DOM before readability extraction. This is useful for stripping + // infinite-scroll content, related articles, or other elements that + // pollute the extracted article. + RemoveSelectors []string +} + +// Readability extracts article content from a document using the readability algorithm. func Readability(_ context.Context, doc Document) (Article, error) { + return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) +} + +// ReadabilityWithOptions extracts article content from a document, applying +// the provided options before extraction. Use RemoveSelectors to strip +// elements (e.g. infinite-scroll articles) from the DOM before parsing. +func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) { data, err := doc.Content() if err != nil { return Article{}, err } u, err := url.Parse(doc.URL()) - if err != nil { return Article{}, err } - a, err := readability.FromReader(bytes.NewBufferString(data), u) + if len(opts.RemoveSelectors) > 0 { + data, err = removeSelectors(data, opts.RemoveSelectors) + if err != nil { + return Article{}, fmt.Errorf("failed to clean DOM: %w", err) + } + } + a, err := readability.FromReader(bytes.NewBufferString(data), u) if err != nil { return Article{}, err } @@ -42,5 +67,23 @@ func Readability(_ context.Context, doc Document) (Article, error) { Lang: a.Language, PublishedTime: pubTime, }, nil - +} + +// removeSelectors parses HTML and removes all elements matching the given CSS selectors. +func removeSelectors(html string, selectors []string) (string, error) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return "", fmt.Errorf("failed to parse HTML: %w", err) + } + + for _, sel := range selectors { + doc.Find(sel).Remove() + } + + result, err := doc.Html() + if err != nil { + return "", fmt.Errorf("failed to serialize HTML: %w", err) + } + + return result, nil } diff --git a/readability_test.go b/readability_test.go index 6033bfc..a5ca55f 100644 --- a/readability_test.go +++ b/readability_test.go @@ -2,6 +2,7 @@ package extractor import ( "context" + "strings" "testing" ) @@ -70,3 +71,135 @@ func TestReadability_InvalidURL(t *testing.T) { t.Error("Readability() expected error for invalid URL, got nil") } } + +func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) { + html := ` + +Main Article + +
+

Main Article

+

This is the main article content that we want to extract properly. +It contains several sentences about the main topic of interest. The +readability algorithm should pick this up as the primary content of +the page without any interference from other elements.

+

Here is a second paragraph with more relevant content about the +main topic. This paragraph adds depth and detail to the article.

+
+
+
+

Unrelated Article

+

This is content from an unrelated article loaded via infinite scroll. +It should not appear in the extracted content because we will remove it +using the RemoveSelectors option before readability extraction.

+
+
+ + +` + + doc := mockDocument{ + url: "https://example.com/article", + content: html, + } + + article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{ + RemoveSelectors: []string{".infinite-scroll", ".sidebar"}, + }) + if err != nil { + t.Fatalf("ReadabilityWithOptions() error = %v", err) + } + + if article.TextContent == "" { + t.Fatal("TextContent should not be empty") + } + + if strings.Contains(article.TextContent, "Unrelated Article") { + t.Error("TextContent should not contain content from removed .infinite-scroll element") + } + + if strings.Contains(article.TextContent, "Sidebar content") { + t.Error("TextContent should not contain content from removed .sidebar element") + } + + if !strings.Contains(article.TextContent, "main article content") { + t.Error("TextContent should still contain the main article content") + } +} + +func TestReadabilityWithOptions_NoSelectors(t *testing.T) { + html := ` + +Test Article + +
+

Test Article

+

This is a test article with enough content to be parsed by readability. +It needs to have a reasonable amount of text so the algorithm considers it +a valid article. Let us add several sentences to make sure this works +correctly. The readability library requires a minimum amount of content +to successfully extract an article from a page.

+

Here is another paragraph to add more content. We want to make sure +that the content is substantial enough for the readability algorithm to +consider this a valid article and extract the text properly.

+
+ +` + + doc := mockDocument{ + url: "https://example.com/article", + content: html, + } + + // With empty options, should behave identically to Readability(). + article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) + if err != nil { + t.Fatalf("ReadabilityWithOptions() error = %v", err) + } + + if article.Title != "Test Article" { + t.Errorf("Title = %q, want %q", article.Title, "Test Article") + } + + if article.TextContent == "" { + t.Error("TextContent should not be empty") + } +} + +func TestRemoveSelectors(t *testing.T) { + html := `
Keep this
Remove this
` + + result, err := removeSelectors(html, []string{".remove"}) + if err != nil { + t.Fatalf("removeSelectors() error = %v", err) + } + + if strings.Contains(result, "Remove this") { + t.Error("result should not contain removed element content") + } + + if !strings.Contains(result, "Keep this") { + t.Error("result should still contain kept element content") + } +} + +func TestRemoveSelectors_MultipleSelectors(t *testing.T) { + html := `
A
B
C
` + + result, err := removeSelectors(html, []string{".a", ".c"}) + if err != nil { + t.Fatalf("removeSelectors() error = %v", err) + } + + if strings.Contains(result, ">A<") { + t.Error("result should not contain element .a") + } + if strings.Contains(result, ">C<") { + t.Error("result should not contain element .c") + } + if !strings.Contains(result, ">B<") { + t.Error("result should still contain element .b") + } +}