feat: add ReadabilityWithOptions for DOM cleanup before extraction
Sites with infinite scroll (e.g. The Verge) load additional articles into the DOM, which get included in readability extraction. Add ReadabilityOptions.RemoveSelectors to strip elements by CSS selector before parsing, avoiding the need to reimplement the readability pipeline downstream. Closes #60 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,25 +3,50 @@ package extractor
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/go-shiori/go-readability"
|
||||
)
|
||||
|
||||
// ReadabilityOptions holds the optional settings applied to a document
// before the readability algorithm runs. The zero value requests no
// pre-processing.
type ReadabilityOptions struct {
	// RemoveSelectors lists CSS selectors. Every element matching one of
	// them is removed from the DOM before extraction, which keeps
	// infinite-scroll content, related-article blocks, and similar noise
	// out of the extracted article.
	RemoveSelectors []string
}
|
||||
|
||||
// Readability extracts article content from a document using the readability algorithm.
|
||||
func Readability(_ context.Context, doc Document) (Article, error) {
|
||||
return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
|
||||
}
|
||||
|
||||
// ReadabilityWithOptions extracts article content from a document, applying
|
||||
// the provided options before extraction. Use RemoveSelectors to strip
|
||||
// elements (e.g. infinite-scroll articles) from the DOM before parsing.
|
||||
func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
|
||||
data, err := doc.Content()
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
u, err := url.Parse(doc.URL())
|
||||
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
||||
if len(opts.RemoveSelectors) > 0 {
|
||||
data, err = removeSelectors(data, opts.RemoveSelectors)
|
||||
if err != nil {
|
||||
return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
@@ -42,5 +67,23 @@ func Readability(_ context.Context, doc Document) (Article, error) {
|
||||
Lang: a.Language,
|
||||
PublishedTime: pubTime,
|
||||
}, nil
|
||||
|
||||
}
|
||||
|
||||
// removeSelectors parses HTML and removes all elements matching the given CSS selectors.
|
||||
func removeSelectors(html string, selectors []string) (string, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to parse HTML: %w", err)
|
||||
}
|
||||
|
||||
for _, sel := range selectors {
|
||||
doc.Find(sel).Remove()
|
||||
}
|
||||
|
||||
result, err := doc.Html()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to serialize HTML: %w", err)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user