go-extractor/readability.go

package extractor

import (
	"bytes"
	"context"
	"fmt"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/go-shiori/go-readability"
)

// ReadabilityOptions configures the optional clean-up steps that
// ReadabilityWithOptions applies before running the readability algorithm.
// The zero value disables every step.
type ReadabilityOptions struct {
	// RemoveSelectors is a list of CSS selectors. Every element matching one
	// of them is removed from the HTML snapshot of the document before
	// readability extraction; the live page is not touched by this option.
	// This is useful for stripping infinite-scroll content, related
	// articles, or other elements that pollute the extracted article.
	RemoveSelectors []string

	// RemoveHidden, when true, evaluates JavaScript on the live page to remove
	// all elements whose computed display is "none" before the HTML snapshot
	// is taken. This is useful for stripping anti-scraping honeypots that
	// hide prompt injections in invisible DOM elements.
	//
	// Note: this modifies the live page DOM. The Document must support
	// page-level JavaScript evaluation (the concrete document type returned
	// by Browser.Open does). If the Document does not support evaluation,
	// an error is returned.
	RemoveHidden bool
}

// pageEvaluator is an optional interface that Document implementations can
// satisfy to support page-level JavaScript evaluation. ReadabilityWithOptions
// type-asserts the Document against it when RemoveHidden is set and fails
// with an error if the assertion does not hold.
type pageEvaluator interface {
	// PageEvaluate runs the given JavaScript expression in the page and
	// returns its result, or an error if evaluation failed.
	PageEvaluate(expression string) (interface{}, error)
}

// removeHiddenJS is the JavaScript snippet that removes all elements with
// computed display:none from the DOM. Only the computed "display" property is
// inspected; elements hidden by other means (for example visibility or
// opacity) are left in place.
//
// The isConnected check skips elements that are no longer attached to the
// document, such as descendants of an element removed earlier in the same
// pass.
const removeHiddenJS = `() => {
	document.querySelectorAll('*').forEach(el => {
		if (el.isConnected && window.getComputedStyle(el).display === 'none') {
			el.remove();
		}
	});
}`

// Readability extracts article content from doc using the readability
// algorithm with default options (no selectors removed, hidden elements
// kept). It is shorthand for calling ReadabilityWithOptions with a zero
// ReadabilityOptions value.
//
// The caller's context is forwarded unchanged rather than being replaced by
// context.Background(), so any deadline, cancellation or values it carries
// remain available to the callee.
func Readability(ctx context.Context, doc Document) (Article, error) {
	return ReadabilityWithOptions(ctx, doc, ReadabilityOptions{})
}

// ReadabilityWithOptions extracts article content from a document, applying
// the provided options before extraction.
//
// Processing order:
//  1. If opts.RemoveHidden is set, removeHiddenJS is evaluated on the live
//     page (this mutates the page DOM).
//  2. The document's HTML is snapshotted via doc.Content().
//  3. If opts.RemoveSelectors is non-empty, matching elements are stripped
//     from the snapshot (e.g. infinite-scroll articles).
//  4. The readability algorithm runs on the resulting HTML, using the
//     document URL as the page URL.
//
// The context parameter is currently unused.
//
// It returns a zero Article and a non-nil error if the Document does not
// support JavaScript evaluation while RemoveHidden is set, if the script
// fails, if the content or URL cannot be obtained or parsed, if selector
// removal fails, or if readability extraction fails.
//
// Article.PublishedTime is empty when no publication time was detected;
// otherwise it is the instant expressed in UTC as "2006-01-02T15:04:05Z".
func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
	// RemoveHidden must run on the live page before we snapshot the HTML,
	// because computed styles are only available via JavaScript.
	if opts.RemoveHidden {
		pe, ok := doc.(pageEvaluator)
		if !ok {
			return Article{}, fmt.Errorf("RemoveHidden requires a Document that supports page-level JavaScript evaluation")
		}
		if _, err := pe.PageEvaluate(removeHiddenJS); err != nil {
			return Article{}, fmt.Errorf("failed to remove hidden elements: %w", err)
		}
	}

	data, err := doc.Content()
	if err != nil {
		return Article{}, err
	}

	u, err := url.Parse(doc.URL())
	if err != nil {
		return Article{}, err
	}

	if len(opts.RemoveSelectors) > 0 {
		data, err = removeSelectors(data, opts.RemoveSelectors)
		if err != nil {
			return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
		}
	}

	a, err := readability.FromReader(bytes.NewBufferString(data), u)
	if err != nil {
		return Article{}, err
	}

	pubTime := ""
	if a.PublishedTime != nil {
		// The layout ends in a literal "Z", so the instant has to be
		// converted to UTC first; otherwise a time carrying a non-UTC offset
		// would be printed as local wall-clock time mislabelled as UTC.
		pubTime = a.PublishedTime.UTC().Format("2006-01-02T15:04:05Z")
	}

	return Article{
		Title:         a.Title,
		Content:       a.Content,
		TextContent:   a.TextContent,
		Length:        a.Length,
		Excerpt:       a.Excerpt,
		Byline:        a.Byline,
		SiteName:      a.SiteName,
		Lang:          a.Language,
		PublishedTime: pubTime,
	}, nil
}

// removeSelectors parses the given HTML, deletes every element matched by any
// of the CSS selectors, and returns the re-serialized markup. An error is
// returned if the input cannot be parsed or the result cannot be serialized.
func removeSelectors(html string, selectors []string) (string, error) {
	dom, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parseErr != nil {
		return "", fmt.Errorf("failed to parse HTML: %w", parseErr)
	}

	// Apply the selectors one after another, in the order given.
	for i := range selectors {
		matched := dom.Find(selectors[i])
		matched.Remove()
	}

	cleaned, renderErr := dom.Html()
	if renderErr != nil {
		return "", fmt.Errorf("failed to serialize HTML: %w", renderErr)
	}
	return cleaned, nil
}