Files
go-extractor/readability.go
Steve Dudenhoeffer c1a5814732
All checks were successful
CI / build (pull_request) Successful in 46s
CI / test (pull_request) Successful in 48s
CI / vet (pull_request) Successful in 1m50s
feat: add ReadabilityWithOptions for DOM cleanup before extraction
Sites with infinite scroll (e.g. The Verge) load additional articles
into the DOM, which get included in readability extraction. Add
ReadabilityOptions.RemoveSelectors to strip elements by CSS selector
before parsing, avoiding the need to reimplement the readability
pipeline downstream.

Closes #60

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:09:28 +00:00

90 lines
2.4 KiB
Go

package extractor
import (
	"bytes"
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/go-shiori/go-readability"
)
// ReadabilityOptions configures the readability extraction process.
// The zero value applies no pre-processing, matching the behavior of
// plain Readability.
type ReadabilityOptions struct {
// RemoveSelectors is a list of CSS selectors (goquery/jQuery-style
// syntax) for elements to remove from the DOM before readability
// extraction. This is useful for stripping infinite-scroll content,
// related articles, or other elements that pollute the extracted
// article.
RemoveSelectors []string
}
// Readability extracts article content from a document using the readability
// algorithm with default options. It is equivalent to calling
// ReadabilityWithOptions with a zero ReadabilityOptions.
func Readability(ctx context.Context, doc Document) (Article, error) {
// Forward the caller's context rather than substituting
// context.Background(), so cancellation/deadlines propagate.
return ReadabilityWithOptions(ctx, doc, ReadabilityOptions{})
}
// ReadabilityWithOptions extracts article content from a document, applying
// the provided options before extraction. Use RemoveSelectors to strip
// elements (e.g. infinite-scroll articles) from the DOM before parsing.
//
// The context is currently unused because the underlying readability
// library does not accept one, but it is part of the signature so callers
// can rely on cancellation once supported.
func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
data, err := doc.Content()
if err != nil {
return Article{}, err
}
// The document URL is needed by readability to resolve relative links.
u, err := url.Parse(doc.URL())
if err != nil {
return Article{}, err
}
// Strip unwanted elements (e.g. infinite-scroll articles) before the
// readability pass so they cannot pollute the extracted content.
if len(opts.RemoveSelectors) > 0 {
data, err = removeSelectors(data, opts.RemoveSelectors)
if err != nil {
return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
}
}
a, err := readability.FromReader(bytes.NewBufferString(data), u)
if err != nil {
return Article{}, err
}
// Format the publish time as RFC 3339 in UTC. Using time.RFC3339 after
// converting to UTC avoids the bug of appending a literal "Z" to a
// timestamp that may not actually be in UTC.
pubTime := ""
if a.PublishedTime != nil {
pubTime = a.PublishedTime.UTC().Format(time.RFC3339)
}
return Article{
Title: a.Title,
Content: a.Content,
TextContent: a.TextContent,
Length: a.Length,
Excerpt: a.Excerpt,
Byline: a.Byline,
SiteName: a.SiteName,
Lang: a.Language,
PublishedTime: pubTime,
}, nil
}
// removeSelectors strips every element matching any of the given CSS
// selectors from the HTML document and returns the re-serialized markup.
// It reports an error if the input cannot be parsed or the cleaned
// document cannot be serialized.
func removeSelectors(html string, selectors []string) (string, error) {
parsed, err := goquery.NewDocumentFromReader(strings.NewReader(html))
if err != nil {
return "", fmt.Errorf("failed to parse HTML: %w", err)
}
// Each selector is applied independently; matches are removed in place.
for _, selector := range selectors {
parsed.Find(selector).Remove()
}
cleaned, err := parsed.Html()
if err != nil {
return "", fmt.Errorf("failed to serialize HTML: %w", err)
}
return cleaned, nil
}