feat: add ReadabilityWithOptions for DOM cleanup before extraction
All checks were successful
CI / build (pull_request) Successful in 46s
CI / test (pull_request) Successful in 48s
CI / vet (pull_request) Successful in 1m50s

Sites with infinite scroll (e.g. The Verge) load additional articles
into the DOM, which get included in readability extraction. Add
ReadabilityOptions.RemoveSelectors to strip elements by CSS selector
before parsing, avoiding the need to reimplement the readability
pipeline downstream.

Closes #60

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 01:09:28 +00:00
parent 3357972246
commit c1a5814732
4 changed files with 188 additions and 5 deletions

View File

@@ -2,6 +2,7 @@ package extractor
import (
"context"
"strings"
"testing"
)
@@ -70,3 +71,135 @@ func TestReadability_InvalidURL(t *testing.T) {
t.Error("Readability() expected error for invalid URL, got nil")
}
}
// TestReadabilityWithOptions_RemoveSelectors feeds a page containing a main
// article, an ".infinite-scroll" container and a ".sidebar" aside through
// ReadabilityWithOptions with both selectors listed in RemoveSelectors. It
// asserts that the extracted TextContent is non-empty, still contains the
// main article text, and contains none of the text from the removed elements.
func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) {
	const page = `<!DOCTYPE html>
<html>
<head><title>Main Article</title></head>
<body>
<article class="main-article">
<h1>Main Article</h1>
<p>This is the main article content that we want to extract properly.
It contains several sentences about the main topic of interest. The
readability algorithm should pick this up as the primary content of
the page without any interference from other elements.</p>
<p>Here is a second paragraph with more relevant content about the
main topic. This paragraph adds depth and detail to the article.</p>
</article>
<div class="infinite-scroll">
<article class="next-article">
<h2>Unrelated Article</h2>
<p>This is content from an unrelated article loaded via infinite scroll.
It should not appear in the extracted content because we will remove it
using the RemoveSelectors option before readability extraction.</p>
</article>
</div>
<aside class="sidebar">
<p>Sidebar content that should also be removed from extraction.</p>
</aside>
</body>
</html>`

	src := mockDocument{
		url:     "https://example.com/article",
		content: page,
	}
	opts := ReadabilityOptions{
		RemoveSelectors: []string{".infinite-scroll", ".sidebar"},
	}

	article, err := ReadabilityWithOptions(context.Background(), src, opts)
	if err != nil {
		t.Fatalf("ReadabilityWithOptions() error = %v", err)
	}

	// Everything below inspects the text, so an empty result is fatal.
	if article.TextContent == "" {
		t.Fatal("TextContent should not be empty")
	}

	// Each entry names a substring, whether it must be present in the
	// extracted text, and the message reported when that expectation fails.
	expectations := []struct {
		substr  string
		present bool
		failure string
	}{
		{"Unrelated Article", false, "TextContent should not contain content from removed .infinite-scroll element"},
		{"Sidebar content", false, "TextContent should not contain content from removed .sidebar element"},
		{"main article content", true, "TextContent should still contain the main article content"},
	}
	for _, e := range expectations {
		if strings.Contains(article.TextContent, e.substr) != e.present {
			t.Error(e.failure)
		}
	}
}
// TestReadabilityWithOptions_NoSelectors runs ReadabilityWithOptions with a
// zero-value ReadabilityOptions on a simple single-article page and asserts
// that extraction succeeds, the title is "Test Article", and TextContent is
// non-empty.
func TestReadabilityWithOptions_NoSelectors(t *testing.T) {
	const page = `<!DOCTYPE html>
<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1>Test Article</h1>
<p>This is a test article with enough content to be parsed by readability.
It needs to have a reasonable amount of text so the algorithm considers it
a valid article. Let us add several sentences to make sure this works
correctly. The readability library requires a minimum amount of content
to successfully extract an article from a page.</p>
<p>Here is another paragraph to add more content. We want to make sure
that the content is substantial enough for the readability algorithm to
consider this a valid article and extract the text properly.</p>
</article>
</body>
</html>`
	const wantTitle = "Test Article"

	src := mockDocument{
		url:     "https://example.com/article",
		content: page,
	}

	// No options are set; the intent is that this matches plain Readability().
	var opts ReadabilityOptions
	article, err := ReadabilityWithOptions(context.Background(), src, opts)
	if err != nil {
		t.Fatalf("ReadabilityWithOptions() error = %v", err)
	}

	if article.Title != wantTitle {
		t.Errorf("Title = %q, want %q", article.Title, wantTitle)
	}
	if article.TextContent == "" {
		t.Error("TextContent should not be empty")
	}
}
// TestRemoveSelectors calls removeSelectors with a single class selector and
// asserts that the matched element's text is gone from the result while the
// unmatched element's text remains.
func TestRemoveSelectors(t *testing.T) {
	const input = `<html><body><div class="keep">Keep this</div><div class="remove">Remove this</div></body></html>`

	output, err := removeSelectors(input, []string{".remove"})
	if err != nil {
		t.Fatalf("removeSelectors() error = %v", err)
	}

	// Each entry names a text fragment, whether it must survive, and the
	// message reported when that expectation fails.
	for _, c := range []struct {
		text    string
		present bool
		failure string
	}{
		{"Remove this", false, "result should not contain removed element content"},
		{"Keep this", true, "result should still contain kept element content"},
	} {
		if strings.Contains(output, c.text) != c.present {
			t.Error(c.failure)
		}
	}
}
// TestRemoveSelectors_MultipleSelectors calls removeSelectors with two class
// selectors and asserts that both matched elements are gone from the result
// while the unmatched element is still there.
func TestRemoveSelectors_MultipleSelectors(t *testing.T) {
	const input = `<html><body><div class="a">A</div><div class="b">B</div><div class="c">C</div></body></html>`

	output, err := removeSelectors(input, []string{".a", ".c"})
	if err != nil {
		t.Fatalf("removeSelectors() error = %v", err)
	}

	// Markers include the surrounding angle brackets so that a single letter
	// is only matched as element text, not inside tag or attribute names.
	cases := []struct {
		marker  string
		present bool
		failure string
	}{
		{">A<", false, "result should not contain element .a"},
		{">C<", false, "result should not contain element .c"},
		{">B<", true, "result should still contain element .b"},
	}
	for _, c := range cases {
		if strings.Contains(output, c.marker) != c.present {
			t.Error(c.failure)
		}
	}
}