package extractor

import (
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/go-shiori/go-readability"
)

// ReadabilityOptions configures the readability extraction process.
type ReadabilityOptions struct {
	// RemoveSelectors is a list of CSS selectors for elements to remove from
	// the DOM before readability extraction. This is useful for stripping
	// infinite-scroll content, related articles, or other elements that
	// pollute the extracted article.
	RemoveSelectors []string
}

// Readability extracts article content from a document using the readability
// algorithm. It is shorthand for ReadabilityWithOptions with zero options.
func Readability(ctx context.Context, doc Document) (Article, error) {
	// Forward the caller's context instead of discarding it in favor of
	// context.Background(), so cancellation can propagate if the options
	// path ever becomes context-aware.
	return ReadabilityWithOptions(ctx, doc, ReadabilityOptions{})
}

// ReadabilityWithOptions extracts article content from a document, applying
// the provided options before extraction. Use RemoveSelectors to strip
// elements (e.g. infinite-scroll articles) from the DOM before parsing.
//
// The context parameter is currently unused: the underlying readability
// library operates on in-memory data. It is accepted for API symmetry and
// future cancellation support.
func ReadabilityWithOptions(_ context.Context, doc Document, opts ReadabilityOptions) (Article, error) {
	data, err := doc.Content()
	if err != nil {
		return Article{}, err
	}

	// The document URL is needed by readability to resolve relative links.
	u, err := url.Parse(doc.URL())
	if err != nil {
		return Article{}, err
	}

	// Strip unwanted elements before extraction so they cannot pollute the
	// readability scoring or the extracted article body.
	if len(opts.RemoveSelectors) > 0 {
		data, err = removeSelectors(data, opts.RemoveSelectors)
		if err != nil {
			return Article{}, fmt.Errorf("failed to clean DOM: %w", err)
		}
	}

	a, err := readability.FromReader(strings.NewReader(data), u)
	if err != nil {
		return Article{}, err
	}

	pubTime := ""
	if a.PublishedTime != nil {
		// Normalize to UTC before formatting. The previous hard-coded "Z"
		// suffix claimed UTC even when the parsed timestamp carried a
		// different zone offset; RFC3339 on a UTC time produces the same
		// "2006-01-02T15:04:05Z" shape while staying truthful.
		pubTime = a.PublishedTime.UTC().Format(time.RFC3339)
	}

	return Article{
		Title:         a.Title,
		Content:       a.Content,
		TextContent:   a.TextContent,
		Length:        a.Length,
		Excerpt:       a.Excerpt,
		Byline:        a.Byline,
		SiteName:      a.SiteName,
		Lang:          a.Language,
		PublishedTime: pubTime,
	}, nil
}

// removeSelectors parses HTML and removes all elements matching the given CSS selectors.
func removeSelectors(html string, selectors []string) (string, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return "", fmt.Errorf("failed to parse HTML: %w", err) } for _, sel := range selectors { doc.Find(sel).Remove() } result, err := doc.Html() if err != nil { return "", fmt.Errorf("failed to serialize HTML: %w", err) } return result, nil }