diff --git a/document.go b/document.go
index c0089fd..532758c 100644
--- a/document.go
+++ b/document.go
@@ -68,6 +68,11 @@ func (d *document) Refresh() error {
 	return nil
 }
 
+// PageEvaluate evaluates expression on the underlying page and returns its result.
+func (d *document) PageEvaluate(expression string) (interface{}, error) {
+	return d.page.Evaluate(expression)
+}
+
 func (d *document) WaitForNetworkIdle(timeout *time.Duration) error {
 	if timeout == nil {
 		t := 30 * time.Second
diff --git a/readability.go b/readability.go
index 933dac4..43c1f0f 100644
--- a/readability.go
+++ b/readability.go
@@ -18,8 +18,35 @@ type ReadabilityOptions struct {
 	// infinite-scroll content, related articles, or other elements that
 	// pollute the extracted article.
 	RemoveSelectors []string
+
+	// RemoveHidden, when true, evaluates JavaScript on the live page to remove
+	// every element inside <body> whose computed display is "none" before
+	// extracting content. This is useful for stripping anti-scraping honeypots
+	// that hide prompt injections in invisible DOM elements.
+	//
+	// Note: this modifies the live page DOM. The Document must support
+	// page-level JavaScript evaluation (the concrete document type returned
+	// by Browser.Open does). If the Document does not support evaluation,
+	// an error is returned.
+	RemoveHidden bool
 }
 
+// pageEvaluator is an optional interface that Document implementations can
+// satisfy to support page-level JavaScript evaluation.
+type pageEvaluator interface {
+	PageEvaluate(expression string) (interface{}, error)
+}
+
+// removeHiddenJS removes every display:none element inside <body>. <head> is
+// skipped: browsers compute display:none for it, yet it holds title/metadata.
+const removeHiddenJS = `() => {
+	document.querySelectorAll('body *').forEach(el => {
+		if (el.isConnected && window.getComputedStyle(el).display === 'none') {
+			el.remove();
+		}
+	});
+}`
+
 // Readability extracts article content from a document using the readability algorithm.
func Readability(_ context.Context, doc Document) (Article, error) { return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) @@ -29,6 +56,18 @@ func Readability(_ context.Context, doc Document) (Article, error) { // the provided options before extraction. Use RemoveSelectors to strip // elements (e.g. infinite-scroll articles) from the DOM before parsing. func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) { + // RemoveHidden must run on the live page before we snapshot the HTML, + // because computed styles are only available via JavaScript. + if opts.RemoveHidden { + pe, ok := doc.(pageEvaluator) + if !ok { + return Article{}, fmt.Errorf("RemoveHidden requires a Document that supports page-level JavaScript evaluation") + } + if _, err := pe.PageEvaluate(removeHiddenJS); err != nil { + return Article{}, fmt.Errorf("failed to remove hidden elements: %w", err) + } + } + data, err := doc.Content() if err != nil { return Article{}, err diff --git a/readability_test.go b/readability_test.go index a5ca55f..889acb2 100644 --- a/readability_test.go +++ b/readability_test.go @@ -2,6 +2,7 @@ package extractor import ( "context" + "fmt" "strings" "testing" ) @@ -203,3 +204,123 @@ func TestRemoveSelectors_MultipleSelectors(t *testing.T) { t.Error("result should still contain element .b") } } + +// mockPageEvalDocument is a mock Document that supports PageEvaluate. +// The evaluateFn callback simulates JavaScript evaluation by allowing +// tests to mutate the document's content field. +type mockPageEvalDocument struct { + mockDocument + evaluateFn func(expression string) (interface{}, error) +} + +func (m *mockPageEvalDocument) PageEvaluate(expression string) (interface{}, error) { + if m.evaluateFn != nil { + return m.evaluateFn(expression) + } + return nil, nil +} + +func TestReadabilityWithOptions_RemoveHidden(t *testing.T) { + htmlBefore := ` + +Article With Hidden Honeypot + +
+

Real Article

+

This is the real article content that should be extracted. It contains +several sentences about a real topic. The readability algorithm should +pick this up as the primary content of the page.

+

Here is another paragraph with more real content to make the article +substantial enough for readability extraction to work properly.

+ +
+ +` + + // After JS evaluation removes display:none elements, the content + // should no longer contain the honeypot div. + htmlAfter := ` + +Article With Hidden Honeypot + +
+

Real Article

+

This is the real article content that should be extracted. It contains +several sentences about a real topic. The readability algorithm should +pick this up as the primary content of the page.

+

Here is another paragraph with more real content to make the article +substantial enough for readability extraction to work properly.

+
+ +` + + doc := &mockPageEvalDocument{ + mockDocument: mockDocument{ + url: "https://example.com/article", + content: htmlBefore, + }, + } + + doc.evaluateFn = func(expression string) (interface{}, error) { + // Simulate the JS removing hidden elements by swapping content. + doc.content = htmlAfter + return nil, nil + } + + article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{ + RemoveHidden: true, + }) + if err != nil { + t.Fatalf("ReadabilityWithOptions() error = %v", err) + } + + if strings.Contains(article.TextContent, "step-by-step") { + t.Error("TextContent should not contain hidden honeypot content") + } + + if !strings.Contains(article.TextContent, "real article content") { + t.Error("TextContent should still contain the real article content") + } +} + +func TestReadabilityWithOptions_RemoveHidden_EvaluateError(t *testing.T) { + doc := &mockPageEvalDocument{ + mockDocument: mockDocument{ + url: "https://example.com/article", + content: "

text

", + }, + evaluateFn: func(expression string) (interface{}, error) { + return nil, fmt.Errorf("JS evaluation failed") + }, + } + + _, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{ + RemoveHidden: true, + }) + if err == nil { + t.Fatal("expected error when PageEvaluate fails, got nil") + } + if !strings.Contains(err.Error(), "failed to remove hidden elements") { + t.Errorf("error = %q, want it to contain %q", err.Error(), "failed to remove hidden elements") + } +} + +func TestReadabilityWithOptions_RemoveHidden_UnsupportedDocument(t *testing.T) { + // A plain mockDocument does not implement pageEvaluator. + doc := mockDocument{ + url: "https://example.com/article", + content: "

text

", + } + + _, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{ + RemoveHidden: true, + }) + if err == nil { + t.Fatal("expected error when Document does not support PageEvaluate, got nil") + } + if !strings.Contains(err.Error(), "RemoveHidden requires") { + t.Errorf("error = %q, want it to contain %q", err.Error(), "RemoveHidden requires") + } +}