2024-12-07 03:53:46 -05:00
|
|
|
package extractor
|
|
|
|
|
|
|
|
import (
|
2024-12-23 03:18:50 -05:00
|
|
|
"bytes"
|
2024-12-07 03:53:46 -05:00
|
|
|
"context"
|
|
|
|
"net/url"
|
|
|
|
|
|
|
|
"github.com/go-shiori/go-readability"
|
|
|
|
)
|
|
|
|
|
2024-12-23 03:18:50 -05:00
|
|
|
func Readability(_ context.Context, doc Document) (Article, error) {
|
|
|
|
data, err := doc.Content()
|
|
|
|
if err != nil {
|
|
|
|
return Article{}, err
|
|
|
|
}
|
2024-12-07 03:53:46 -05:00
|
|
|
|
2024-12-23 03:18:50 -05:00
|
|
|
u, err := url.Parse(doc.URL())
|
2024-12-07 03:53:46 -05:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return Article{}, err
|
|
|
|
}
|
2024-12-23 03:18:50 -05:00
|
|
|
|
|
|
|
a, err := readability.FromReader(bytes.NewBufferString(data), u)
|
2024-12-07 03:53:46 -05:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return Article{}, err
|
|
|
|
}
|
|
|
|
|
|
|
|
pubTime := ""
|
|
|
|
|
|
|
|
if a.PublishedTime != nil {
|
|
|
|
pubTime = a.PublishedTime.Format("2006-01-02T15:04:05Z")
|
|
|
|
}
|
|
|
|
return Article{
|
|
|
|
Title: a.Title,
|
|
|
|
Content: a.Content,
|
|
|
|
TextContent: a.TextContent,
|
|
|
|
Length: a.Length,
|
|
|
|
Excerpt: a.Excerpt,
|
|
|
|
Byline: a.Byline,
|
|
|
|
SiteName: a.SiteName,
|
|
|
|
Lang: a.Language,
|
|
|
|
PublishedTime: pubTime,
|
|
|
|
}, nil
|
|
|
|
|
|
|
|
}
|