Merge pull request 'feat: add ReadabilityWithOptions for DOM cleanup' (#61) from feature/readability-remove-selectors into main
This commit was merged in pull request #61.
This commit is contained in:
5
go.mod
5
go.mod
@@ -8,10 +8,11 @@ require (
|
||||
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612
|
||||
github.com/playwright-community/playwright-go v0.5200.0
|
||||
github.com/urfave/cli/v3 v3.0.0-beta1
|
||||
golang.org/x/text v0.29.0
|
||||
golang.org/x/text v0.31.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.11.0 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
|
||||
github.com/deckarep/golang-set/v2 v2.8.0 // indirect
|
||||
@@ -19,5 +20,5 @@ require (
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||
github.com/go-stack/stack v1.8.1 // indirect
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||
golang.org/x/net v0.44.0 // indirect
|
||||
golang.org/x/net v0.47.0 // indirect
|
||||
)
|
||||
|
||||
6
go.sum
6
go.sum
@@ -1,3 +1,5 @@
|
||||
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
|
||||
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
|
||||
@@ -59,6 +61,8 @@ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I=
|
||||
golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
|
||||
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
|
||||
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
@@ -97,6 +101,8 @@ golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
|
||||
golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
|
||||
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
|
||||
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
|
||||
@@ -3,25 +3,50 @@ package extractor
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/go-shiori/go-readability"
|
||||
)
|
||||
|
||||
// ReadabilityOptions holds the tunable settings for a readability
// extraction run. The zero value applies no pre-processing.
type ReadabilityOptions struct {
	// RemoveSelectors lists CSS selectors. Every element matching any of
	// them is deleted from the DOM before the readability algorithm runs,
	// which keeps noise such as infinite-scroll content or related-article
	// blocks out of the extracted article.
	RemoveSelectors []string
}
|
||||
|
||||
// Readability extracts article content from a document using the readability algorithm.
|
||||
func Readability(_ context.Context, doc Document) (Article, error) {
|
||||
return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
|
||||
}
|
||||
|
||||
// ReadabilityWithOptions extracts article content from a document, applying
|
||||
// the provided options before extraction. Use RemoveSelectors to strip
|
||||
// elements (e.g. infinite-scroll articles) from the DOM before parsing.
|
||||
func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
|
||||
data, err := doc.Content()
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
u, err := url.Parse(doc.URL())
|
||||
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
||||
if len(opts.RemoveSelectors) > 0 {
|
||||
data, err = removeSelectors(data, opts.RemoveSelectors)
|
||||
if err != nil {
|
||||
return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
@@ -42,5 +67,23 @@ func Readability(_ context.Context, doc Document) (Article, error) {
|
||||
Lang: a.Language,
|
||||
PublishedTime: pubTime,
|
||||
}, nil
|
||||
|
||||
}
|
||||
|
||||
// removeSelectors parses HTML and removes all elements matching the given CSS selectors.
|
||||
func removeSelectors(html string, selectors []string) (string, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to parse HTML: %w", err)
|
||||
}
|
||||
|
||||
for _, sel := range selectors {
|
||||
doc.Find(sel).Remove()
|
||||
}
|
||||
|
||||
result, err := doc.Html()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to serialize HTML: %w", err)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package extractor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -70,3 +71,135 @@ func TestReadability_InvalidURL(t *testing.T) {
|
||||
t.Error("Readability() expected error for invalid URL, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) {
|
||||
html := `<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Main Article</title></head>
|
||||
<body>
|
||||
<article class="main-article">
|
||||
<h1>Main Article</h1>
|
||||
<p>This is the main article content that we want to extract properly.
|
||||
It contains several sentences about the main topic of interest. The
|
||||
readability algorithm should pick this up as the primary content of
|
||||
the page without any interference from other elements.</p>
|
||||
<p>Here is a second paragraph with more relevant content about the
|
||||
main topic. This paragraph adds depth and detail to the article.</p>
|
||||
</article>
|
||||
<div class="infinite-scroll">
|
||||
<article class="next-article">
|
||||
<h2>Unrelated Article</h2>
|
||||
<p>This is content from an unrelated article loaded via infinite scroll.
|
||||
It should not appear in the extracted content because we will remove it
|
||||
using the RemoveSelectors option before readability extraction.</p>
|
||||
</article>
|
||||
</div>
|
||||
<aside class="sidebar">
|
||||
<p>Sidebar content that should also be removed from extraction.</p>
|
||||
</aside>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
doc := mockDocument{
|
||||
url: "https://example.com/article",
|
||||
content: html,
|
||||
}
|
||||
|
||||
article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{
|
||||
RemoveSelectors: []string{".infinite-scroll", ".sidebar"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ReadabilityWithOptions() error = %v", err)
|
||||
}
|
||||
|
||||
if article.TextContent == "" {
|
||||
t.Fatal("TextContent should not be empty")
|
||||
}
|
||||
|
||||
if strings.Contains(article.TextContent, "Unrelated Article") {
|
||||
t.Error("TextContent should not contain content from removed .infinite-scroll element")
|
||||
}
|
||||
|
||||
if strings.Contains(article.TextContent, "Sidebar content") {
|
||||
t.Error("TextContent should not contain content from removed .sidebar element")
|
||||
}
|
||||
|
||||
if !strings.Contains(article.TextContent, "main article content") {
|
||||
t.Error("TextContent should still contain the main article content")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadabilityWithOptions_NoSelectors(t *testing.T) {
|
||||
html := `<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test Article</title></head>
|
||||
<body>
|
||||
<article>
|
||||
<h1>Test Article</h1>
|
||||
<p>This is a test article with enough content to be parsed by readability.
|
||||
It needs to have a reasonable amount of text so the algorithm considers it
|
||||
a valid article. Let us add several sentences to make sure this works
|
||||
correctly. The readability library requires a minimum amount of content
|
||||
to successfully extract an article from a page.</p>
|
||||
<p>Here is another paragraph to add more content. We want to make sure
|
||||
that the content is substantial enough for the readability algorithm to
|
||||
consider this a valid article and extract the text properly.</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
doc := mockDocument{
|
||||
url: "https://example.com/article",
|
||||
content: html,
|
||||
}
|
||||
|
||||
// With empty options, should behave identically to Readability().
|
||||
article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
|
||||
if err != nil {
|
||||
t.Fatalf("ReadabilityWithOptions() error = %v", err)
|
||||
}
|
||||
|
||||
if article.Title != "Test Article" {
|
||||
t.Errorf("Title = %q, want %q", article.Title, "Test Article")
|
||||
}
|
||||
|
||||
if article.TextContent == "" {
|
||||
t.Error("TextContent should not be empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveSelectors(t *testing.T) {
|
||||
html := `<html><body><div class="keep">Keep this</div><div class="remove">Remove this</div></body></html>`
|
||||
|
||||
result, err := removeSelectors(html, []string{".remove"})
|
||||
if err != nil {
|
||||
t.Fatalf("removeSelectors() error = %v", err)
|
||||
}
|
||||
|
||||
if strings.Contains(result, "Remove this") {
|
||||
t.Error("result should not contain removed element content")
|
||||
}
|
||||
|
||||
if !strings.Contains(result, "Keep this") {
|
||||
t.Error("result should still contain kept element content")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveSelectors_MultipleSelectors(t *testing.T) {
|
||||
html := `<html><body><div class="a">A</div><div class="b">B</div><div class="c">C</div></body></html>`
|
||||
|
||||
result, err := removeSelectors(html, []string{".a", ".c"})
|
||||
if err != nil {
|
||||
t.Fatalf("removeSelectors() error = %v", err)
|
||||
}
|
||||
|
||||
if strings.Contains(result, ">A<") {
|
||||
t.Error("result should not contain element .a")
|
||||
}
|
||||
if strings.Contains(result, ">C<") {
|
||||
t.Error("result should not contain element .c")
|
||||
}
|
||||
if !strings.Contains(result, ">B<") {
|
||||
t.Error("result should still contain element .b")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user