Merge pull request 'feat: add ReadabilityWithOptions for DOM cleanup' (#61) from feature/readability-remove-selectors into main

2026-02-19 01:11:38 +00:00
parent 3357972246 c1a5814732
commit c982b61bab
4 changed files with 188 additions and 5 deletions
@@ -8,10 +8,11 @@ require (
 	github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
 	github.com/playwright-community/playwright-go v0.5200.0
 	github.com/urfave/cli/v3 v3.0.0-beta1
-	golang.org/x/text v0.29.0
+	golang.org/x/text v0.31.0
 )

 require (
+	github.com/PuerkitoBio/goquery v1.11.0 // indirect
 	github.com/andybalholm/cascadia v1.3.3 // indirect
 	github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
 	github.com/deckarep/golang-set/v2 v2.8.0 // indirect
@@ -19,5 +20,5 @@ require (
 	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
 	github.com/go-stack/stack v1.8.1 // indirect
 	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
-	golang.org/x/net v0.44.0 // indirect
+	golang.org/x/net v0.47.0 // indirect
 )
@@ -1,3 +1,5 @@
+github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
+github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
 github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
 github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
 github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
@@ -59,6 +61,8 @@ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
 golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
 golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I=
 golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
+golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
+golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -97,6 +101,8 @@ golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
 golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
 golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
 golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
+golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
+golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
@@ -3,25 +3,50 @@ package extractor
 import (
 	"bytes"
 	"context"
+	"fmt"
 	"net/url"
+	"strings"

+	"github.com/PuerkitoBio/goquery"
 	"github.com/go-shiori/go-readability"
 )

+// ReadabilityOptions holds the optional settings accepted by ReadabilityWithOptions.
+type ReadabilityOptions struct {
+	// RemoveSelectors lists CSS selectors; every matching element is removed
+	// from the HTML before readability extraction runs (e.g. infinite-scroll
+	// content, related articles, sidebars). A nil or empty slice skips the
+	// cleanup pass and the document is handed to readability unmodified.
+	RemoveSelectors []string
+}
+
+// Readability extracts article content from doc; it is ReadabilityWithOptions called with zero-value ReadabilityOptions.
 func Readability(_ context.Context, doc Document) (Article, error) {
+	return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) // NOTE(review): the caller's context is discarded and a fresh context.Background() is forwarded instead
+}
+
+// ReadabilityWithOptions extracts article content from a document, applying
+// the provided options before extraction. Use RemoveSelectors to strip
+// elements (e.g. infinite-scroll articles) from the DOM before parsing.
+func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
 	data, err := doc.Content()
 	if err != nil {
 		return Article{}, err
 	}

 	u, err := url.Parse(doc.URL())
-
 	if err != nil {
 		return Article{}, err
 	}

-	a, err := readability.FromReader(bytes.NewBufferString(data), u)
+	if len(opts.RemoveSelectors) > 0 {
+		data, err = removeSelectors(data, opts.RemoveSelectors)
+		if err != nil {
+			return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
+		}
+	}

+	a, err := readability.FromReader(bytes.NewBufferString(data), u)
 	if err != nil {
 		return Article{}, err
 	}
@@ -42,5 +67,23 @@ func Readability(_ context.Context, doc Document) (Article, error) {
 		Lang:          a.Language,
 		PublishedTime: pubTime,
 	}, nil
-
+}
+
+// removeSelectors parses html with goquery, removes every element matched by any entry in selectors, and returns the re-serialized markup; parse and serialization errors are returned wrapped.
+func removeSelectors(html string, selectors []string) (string, error) {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return "", fmt.Errorf("failed to parse HTML: %w", err)
+	}
+
+	for _, sel := range selectors { // selectors are applied one after another to the same parsed tree
+		doc.Find(sel).Remove() // NOTE(review): the result is not inspected, so a selector that matches nothing is a silent no-op; presumably an unparsable selector behaves the same — confirm against goquery
+	}
+
+	result, err := doc.Html() // serialize the tree as it stands after the removals
+	if err != nil {
+		return "", fmt.Errorf("failed to serialize HTML: %w", err)
+	}
+
+	return result, nil
 }
@@ -2,6 +2,7 @@ package extractor

 import (
 	"context"
+	"strings"
 	"testing"
 )

@@ -70,3 +71,135 @@ func TestReadability_InvalidURL(t *testing.T) {
 		t.Error("Readability() expected error for invalid URL, got nil")
 	}
 }
+
+func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) { // end-to-end: text under the removed selectors must be absent from TextContent while the main article text remains
+	html := `<!DOCTYPE html>
+<html>
+<head><title>Main Article</title></head>
+<body>
+<article class="main-article">
+<h1>Main Article</h1>
+<p>This is the main article content that we want to extract properly.
+It contains several sentences about the main topic of interest. The
+readability algorithm should pick this up as the primary content of
+the page without any interference from other elements.</p>
+<p>Here is a second paragraph with more relevant content about the
+main topic. This paragraph adds depth and detail to the article.</p>
+</article>
+<div class="infinite-scroll">
+<article class="next-article">
+<h2>Unrelated Article</h2>
+<p>This is content from an unrelated article loaded via infinite scroll.
+It should not appear in the extracted content because we will remove it
+using the RemoveSelectors option before readability extraction.</p>
+</article>
+</div>
+<aside class="sidebar">
+<p>Sidebar content that should also be removed from extraction.</p>
+</aside>
+</body>
+</html>`
+
+	doc := mockDocument{ // mockDocument is declared outside the lines shown in this diff
+		url:     "https://example.com/article",
+		content: html,
+	}
+
+	article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{
+		RemoveSelectors: []string{".infinite-scroll", ".sidebar"}, // the two non-article containers in the fixture above
+	})
+	if err != nil {
+		t.Fatalf("ReadabilityWithOptions() error = %v", err)
+	}
+
+	if article.TextContent == "" { // fatal: the substring checks below are meaningless on empty output
+		t.Fatal("TextContent should not be empty")
+	}
+
+	if strings.Contains(article.TextContent, "Unrelated Article") { // heading nested inside .infinite-scroll
+		t.Error("TextContent should not contain content from removed .infinite-scroll element")
+	}
+
+	if strings.Contains(article.TextContent, "Sidebar content") { // paragraph inside .sidebar
+		t.Error("TextContent should not contain content from removed .sidebar element")
+	}
+
+	if !strings.Contains(article.TextContent, "main article content") { // text from the .main-article element, which no selector targets
+		t.Error("TextContent should still contain the main article content")
+	}
+}
+
+func TestReadabilityWithOptions_NoSelectors(t *testing.T) { // zero-value options: extraction must still yield the title and some text
+	html := `<!DOCTYPE html>
+<html>
+<head><title>Test Article</title></head>
+<body>
+<article>
+<h1>Test Article</h1>
+<p>This is a test article with enough content to be parsed by readability.
+It needs to have a reasonable amount of text so the algorithm considers it
+a valid article. Let us add several sentences to make sure this works
+correctly. The readability library requires a minimum amount of content
+to successfully extract an article from a page.</p>
+<p>Here is another paragraph to add more content. We want to make sure
+that the content is substantial enough for the readability algorithm to
+consider this a valid article and extract the text properly.</p>
+</article>
+</body>
+</html>`
+
+	doc := mockDocument{ // mockDocument is declared outside the lines shown in this diff
+		url:     "https://example.com/article",
+		content: html,
+	}
+
+	// Zero-value options: no selectors are removed. Only Title and a non-empty TextContent are asserted; the result is not compared against Readability().
+	article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
+	if err != nil {
+		t.Fatalf("ReadabilityWithOptions() error = %v", err)
+	}
+
+	if article.Title != "Test Article" {
+		t.Errorf("Title = %q, want %q", article.Title, "Test Article")
+	}
+
+	if article.TextContent == "" {
+		t.Error("TextContent should not be empty")
+	}
+}
+
+func TestRemoveSelectors(t *testing.T) { // unit test of the helper alone: one selector, one element removed, one element kept
+	html := `<html><body><div class="keep">Keep this</div><div class="remove">Remove this</div></body></html>`
+
+	result, err := removeSelectors(html, []string{".remove"})
+	if err != nil {
+		t.Fatalf("removeSelectors() error = %v", err)
+	}
+
+	if strings.Contains(result, "Remove this") { // text of the .remove div must be gone from the serialized output
+		t.Error("result should not contain removed element content")
+	}
+
+	if !strings.Contains(result, "Keep this") { // the sibling that no selector targets must survive
+		t.Error("result should still contain kept element content")
+	}
+}
+
+func TestRemoveSelectors_MultipleSelectors(t *testing.T) { // two selectors in one call: .a and .c are removed, .b is kept
+	html := `<html><body><div class="a">A</div><div class="b">B</div><div class="c">C</div></body></html>`
+
+	result, err := removeSelectors(html, []string{".a", ".c"})
+	if err != nil {
+		t.Fatalf("removeSelectors() error = %v", err)
+	}
+
+	if strings.Contains(result, ">A<") { // matches the text node together with its surrounding tag delimiters, not the bare letter
+		t.Error("result should not contain element .a")
+	}
+	if strings.Contains(result, ">C<") {
+		t.Error("result should not contain element .c")
+	}
+	if !strings.Contains(result, ">B<") { // the element between the two removed ones must be left in place
+		t.Error("result should still contain element .b")
+	}
+}