package extractor import ( "bytes" "context" "fmt" "net/url" "strings" "github.com/PuerkitoBio/goquery" "github.com/go-shiori/go-readability" ) // ReadabilityOptions configures the readability extraction process. type ReadabilityOptions struct { // RemoveSelectors is a list of CSS selectors for elements to remove from // the DOM before readability extraction. This is useful for stripping // infinite-scroll content, related articles, or other elements that // pollute the extracted article. RemoveSelectors []string // RemoveHidden, when true, evaluates JavaScript on the live page to remove // all elements whose computed display is "none" before extracting content. // This is useful for stripping anti-scraping honeypots that hide prompt // injections in invisible DOM elements. // // Note: this modifies the live page DOM. The Document must support // page-level JavaScript evaluation (the concrete document type returned // by Browser.Open does). If the Document does not support evaluation, // an error is returned. RemoveHidden bool } // pageEvaluator is an optional interface that Document implementations can // satisfy to support page-level JavaScript evaluation. type pageEvaluator interface { PageEvaluate(expression string) (interface{}, error) } // removeHiddenJS is the JavaScript snippet that removes all elements with // computed display:none from the DOM. const removeHiddenJS = `() => { document.querySelectorAll('*').forEach(el => { if (el.isConnected && window.getComputedStyle(el).display === 'none') { el.remove(); } }); }` // Readability extracts article content from a document using the readability algorithm. func Readability(_ context.Context, doc Document) (Article, error) { return ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{}) } // ReadabilityWithOptions extracts article content from a document, applying // the provided options before extraction. Use RemoveSelectors to strip // elements (e.g. infinite-scroll articles) from the DOM before parsing. func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) { // RemoveHidden must run on the live page before we snapshot the HTML, // because computed styles are only available via JavaScript. if opts.RemoveHidden { pe, ok := doc.(pageEvaluator) if !ok { return Article{}, fmt.Errorf("RemoveHidden requires a Document that supports page-level JavaScript evaluation") } if _, err := pe.PageEvaluate(removeHiddenJS); err != nil { return Article{}, fmt.Errorf("failed to remove hidden elements: %w", err) } } data, err := doc.Content() if err != nil { return Article{}, err } u, err := url.Parse(doc.URL()) if err != nil { return Article{}, err } if len(opts.RemoveSelectors) > 0 { data, err = removeSelectors(data, opts.RemoveSelectors) if err != nil { return Article{}, fmt.Errorf("failed to clean DOM: %w", err) } } a, err := readability.FromReader(bytes.NewBufferString(data), u) if err != nil { return Article{}, err } pubTime := "" if a.PublishedTime != nil { pubTime = a.PublishedTime.Format("2006-01-02T15:04:05Z") } return Article{ Title: a.Title, Content: a.Content, TextContent: a.TextContent, Length: a.Length, Excerpt: a.Excerpt, Byline: a.Byline, SiteName: a.SiteName, Lang: a.Language, PublishedTime: pubTime, }, nil } // removeSelectors parses HTML and removes all elements matching the given CSS selectors. func removeSelectors(html string, selectors []string) (string, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return "", fmt.Errorf("failed to parse HTML: %w", err) } for _, sel := range selectors { doc.Find(sel).Remove() } result, err := doc.Html() if err != nil { return "", fmt.Errorf("failed to serialize HTML: %w", err) } return result, nil }