go-extractor/readability.go

47 lines
821 B
Go
Raw Permalink Normal View History

2024-12-07 03:53:46 -05:00
package extractor
import (
"bytes"
2024-12-07 03:53:46 -05:00
"context"
"net/url"
"github.com/go-shiori/go-readability"
)
func Readability(_ context.Context, doc Document) (Article, error) {
data, err := doc.Content()
if err != nil {
return Article{}, err
}
2024-12-07 03:53:46 -05:00
u, err := url.Parse(doc.URL())
2024-12-07 03:53:46 -05:00
if err != nil {
return Article{}, err
}
a, err := readability.FromReader(bytes.NewBufferString(data), u)
2024-12-07 03:53:46 -05:00
if err != nil {
return Article{}, err
}
pubTime := ""
if a.PublishedTime != nil {
pubTime = a.PublishedTime.Format("2006-01-02T15:04:05Z")
}
return Article{
Title: a.Title,
Content: a.Content,
TextContent: a.TextContent,
Length: a.Length,
Excerpt: a.Excerpt,
Byline: a.Byline,
SiteName: a.SiteName,
Lang: a.Language,
PublishedTime: pubTime,
}, nil
}