When RemoveHidden is true, JavaScript is evaluated on the live page to remove all elements with computed display:none before readability extraction. This defends against anti-scraping honeypots that embed prompt injections in hidden DOM elements. The implementation uses an optional pageEvaluator interface so that the concrete document (backed by Playwright) supports it while the Document interface remains unchanged. Closes #62 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
129 lines
3.9 KiB
Go
129 lines
3.9 KiB
Go
package extractor
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/go-shiori/go-readability"
|
|
)
|
|
|
|
// ReadabilityOptions selects the DOM clean-up steps that run before the
// readability algorithm is applied.
type ReadabilityOptions struct {
	// RemoveSelectors holds CSS selectors; elements matching any of them are
	// deleted from the DOM before readability extraction. Typical targets
	// are infinite-scroll content, related-article blocks, and other
	// elements that would pollute the extracted article.
	RemoveSelectors []string

	// RemoveHidden requests that JavaScript be evaluated on the live page to
	// delete every element whose computed display is "none" before content
	// is extracted. It is meant for stripping anti-scraping honeypots that
	// hide prompt injections in invisible DOM elements.
	//
	// Note: enabling this modifies the live page DOM. The Document must
	// support page-level JavaScript evaluation (the concrete document type
	// returned by Browser.Open does); if it does not, an error is returned.
	RemoveHidden bool
}
|
|
|
|
// pageEvaluator is an optional capability of Document implementations: those
// that can evaluate JavaScript at page level satisfy it. It is kept separate
// from Document so that callers can probe for it with a type assertion.
type pageEvaluator interface {
	// PageEvaluate evaluates expression on the page and reports the
	// evaluation result together with any error.
	PageEvaluate(expression string) (interface{}, error)
}
|
|
|
|
// removeHiddenJS is the JavaScript function evaluated on the live page to
// remove hidden content: it deletes <body>, or any element beneath it, whose
// computed display is "none".
//
// The scan is deliberately limited to <body> and its descendants. Browsers
// give <head>, <title>, <meta>, <link>, <script> and <style> a default
// display of "none", so a document-wide scan would delete the whole <head>,
// and with it the title and metadata present in the page snapshot.
//
// The isConnected check skips descendants of an element that was already
// removed earlier in the same pass.
const removeHiddenJS = `() => {
	document.querySelectorAll('body, body *').forEach(el => {
		if (el.isConnected && window.getComputedStyle(el).display === 'none') {
			el.remove();
		}
	});
}`
|
|
|
|
// Readability extracts article content from a document using the readability algorithm.
|
|
func Readability(_ context.Context, doc Document) (Article, error) {
|
|
return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
|
|
}
|
|
|
|
// ReadabilityWithOptions extracts article content from a document, applying
|
|
// the provided options before extraction. Use RemoveSelectors to strip
|
|
// elements (e.g. infinite-scroll articles) from the DOM before parsing.
|
|
func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
|
|
// RemoveHidden must run on the live page before we snapshot the HTML,
|
|
// because computed styles are only available via JavaScript.
|
|
if opts.RemoveHidden {
|
|
pe, ok := doc.(pageEvaluator)
|
|
if !ok {
|
|
return Article{}, fmt.Errorf("RemoveHidden requires a Document that supports page-level JavaScript evaluation")
|
|
}
|
|
if _, err := pe.PageEvaluate(removeHiddenJS); err != nil {
|
|
return Article{}, fmt.Errorf("failed to remove hidden elements: %w", err)
|
|
}
|
|
}
|
|
|
|
data, err := doc.Content()
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
u, err := url.Parse(doc.URL())
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
if len(opts.RemoveSelectors) > 0 {
|
|
data, err = removeSelectors(data, opts.RemoveSelectors)
|
|
if err != nil {
|
|
return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
|
|
}
|
|
}
|
|
|
|
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
pubTime := ""
|
|
|
|
if a.PublishedTime != nil {
|
|
pubTime = a.PublishedTime.Format("2006-01-02T15:04:05Z")
|
|
}
|
|
return Article{
|
|
Title: a.Title,
|
|
Content: a.Content,
|
|
TextContent: a.TextContent,
|
|
Length: a.Length,
|
|
Excerpt: a.Excerpt,
|
|
Byline: a.Byline,
|
|
SiteName: a.SiteName,
|
|
Lang: a.Language,
|
|
PublishedTime: pubTime,
|
|
}, nil
|
|
}
|
|
|
|
// removeSelectors parses HTML and removes all elements matching the given CSS selectors.
|
|
func removeSelectors(html string, selectors []string) (string, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to parse HTML: %w", err)
|
|
}
|
|
|
|
for _, sel := range selectors {
|
|
doc.Find(sel).Remove()
|
|
}
|
|
|
|
result, err := doc.Html()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to serialize HTML: %w", err)
|
|
}
|
|
|
|
return result, nil
|
|
}
|