diff --git a/go.mod b/go.mod index 3f0f183..f17b743 100644 --- a/go.mod +++ b/go.mod @@ -8,10 +8,11 @@ require ( github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 github.com/playwright-community/playwright-go v0.5200.0 github.com/urfave/cli/v3 v3.0.0-beta1 - golang.org/x/text v0.29.0 + golang.org/x/text v0.31.0 ) require ( + github.com/PuerkitoBio/goquery v1.11.0 github.com/andybalholm/cascadia v1.3.3 // indirect github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect @@ -19,5 +20,5 @@ require ( github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect - golang.org/x/net v0.44.0 // indirect + golang.org/x/net v0.47.0 // indirect ) diff --git a/go.sum b/go.sum index 3cbc801..de29e4b 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= @@ -59,6 +61,8 @@ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -97,6 +101,8 @@ golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= diff --git a/readability.go b/readability.go index 74aff38..933dac4 100644 --- a/readability.go +++ b/readability.go @@ -3,25 +3,50 @@ package extractor import ( "bytes" "context" + "fmt" "net/url" + "strings" + "github.com/PuerkitoBio/goquery" "github.com/go-shiori/go-readability" ) +// ReadabilityOptions configures the readability extraction process. +type ReadabilityOptions struct { + // RemoveSelectors is a list of CSS selectors for elements to remove from + // the DOM before readability extraction. This is useful for stripping + // infinite-scroll content, related articles, or other elements that + // pollute the extracted article. + RemoveSelectors []string +} + +// Readability extracts article content from a document using the readability algorithm. 
func Readability(_ context.Context, doc Document) (Article, error) { + return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) +} + +// ReadabilityWithOptions extracts article content from a document, applying +// the provided options before extraction. Use RemoveSelectors to strip +// elements (e.g. infinite-scroll articles) from the DOM before parsing. +func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) { data, err := doc.Content() if err != nil { return Article{}, err } u, err := url.Parse(doc.URL()) - if err != nil { return Article{}, err } - a, err := readability.FromReader(bytes.NewBufferString(data), u) + if len(opts.RemoveSelectors) > 0 { + data, err = removeSelectors(data, opts.RemoveSelectors) + if err != nil { + return Article{}, fmt.Errorf("failed to clean DOM: %w", err) + } + } + a, err := readability.FromReader(bytes.NewBufferString(data), u) if err != nil { return Article{}, err } @@ -42,5 +67,23 @@ func Readability(_ context.Context, doc Document) (Article, error) { Lang: a.Language, PublishedTime: pubTime, }, nil - +} + +// removeSelectors parses HTML and removes all elements matching the given CSS selectors. 
+func removeSelectors(html string, selectors []string) (string, error) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return "", fmt.Errorf("failed to parse HTML: %w", err) + } + + for _, sel := range selectors { + doc.Find(sel).Remove() + } + + result, err := doc.Html() + if err != nil { + return "", fmt.Errorf("failed to serialize HTML: %w", err) + } + + return result, nil } diff --git a/readability_test.go b/readability_test.go index 6033bfc..a5ca55f 100644 --- a/readability_test.go +++ b/readability_test.go @@ -2,6 +2,7 @@ package extractor import ( "context" + "strings" "testing" ) @@ -70,3 +71,135 @@ func TestReadability_InvalidURL(t *testing.T) { t.Error("Readability() expected error for invalid URL, got nil") } } + +func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) { + html := ` + +Main Article + +
+

Main Article

+

This is the main article content that we want to extract properly. +It contains several sentences about the main topic of interest. The +readability algorithm should pick this up as the primary content of +the page without any interference from other elements.

+

Here is a second paragraph with more relevant content about the +main topic. This paragraph adds depth and detail to the article.

+
+
+
+

Unrelated Article

+

This is content from an unrelated article loaded via infinite scroll. +It should not appear in the extracted content because we will remove it +using the RemoveSelectors option before readability extraction.

+
+
+ + +` + + doc := mockDocument{ + url: "https://example.com/article", + content: html, + } + + article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{ + RemoveSelectors: []string{".infinite-scroll", ".sidebar"}, + }) + if err != nil { + t.Fatalf("ReadabilityWithOptions() error = %v", err) + } + + if article.TextContent == "" { + t.Fatal("TextContent should not be empty") + } + + if strings.Contains(article.TextContent, "Unrelated Article") { + t.Error("TextContent should not contain content from removed .infinite-scroll element") + } + + if strings.Contains(article.TextContent, "Sidebar content") { + t.Error("TextContent should not contain content from removed .sidebar element") + } + + if !strings.Contains(article.TextContent, "main article content") { + t.Error("TextContent should still contain the main article content") + } +} + +func TestReadabilityWithOptions_NoSelectors(t *testing.T) { + html := ` + +Test Article + +
+

Test Article

+

This is a test article with enough content to be parsed by readability. +It needs to have a reasonable amount of text so the algorithm considers it +a valid article. Let us add several sentences to make sure this works +correctly. The readability library requires a minimum amount of content +to successfully extract an article from a page.

+

Here is another paragraph to add more content. We want to make sure +that the content is substantial enough for the readability algorithm to +consider this a valid article and extract the text properly.

+
+ +` + + doc := mockDocument{ + url: "https://example.com/article", + content: html, + } + + // With empty options, should behave identically to Readability(). + article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) + if err != nil { + t.Fatalf("ReadabilityWithOptions() error = %v", err) + } + + if article.Title != "Test Article" { + t.Errorf("Title = %q, want %q", article.Title, "Test Article") + } + + if article.TextContent == "" { + t.Error("TextContent should not be empty") + } +} + +func TestRemoveSelectors(t *testing.T) { + html := `
Keep this
Remove this
` + + result, err := removeSelectors(html, []string{".remove"}) + if err != nil { + t.Fatalf("removeSelectors() error = %v", err) + } + + if strings.Contains(result, "Remove this") { + t.Error("result should not contain removed element content") + } + + if !strings.Contains(result, "Keep this") { + t.Error("result should still contain kept element content") + } +} + +func TestRemoveSelectors_MultipleSelectors(t *testing.T) { + html := `
A
B
C
` + + result, err := removeSelectors(html, []string{".a", ".c"}) + if err != nil { + t.Fatalf("removeSelectors() error = %v", err) + } + + if strings.Contains(result, ">A<") { + t.Error("result should not contain element .a") + } + if strings.Contains(result, ">C<") { + t.Error("result should not contain element .c") + } + if !strings.Contains(result, ">B<") { + t.Error("result should still contain element .b") + } +}