// File: go-extractor/readability.go (Go, 46 lines, 806 B)

package extractor
import (
"context"
"net/url"
"github.com/go-shiori/go-readability"
)
// Readability is an Extractor implementation built on the go-readability
// library.
//
// The Extractor interface is embedded in the struct, but Extract is declared
// directly on Readability and does not read the embedded value.
type Readability struct {
	Extractor
}

// Compile-time assertion that a Readability value satisfies Extractor.
var _ Extractor = Readability{}
// Extract runs go-readability over the document read from src and maps the
// result onto an Article.
//
// The context argument is currently ignored; readability.FromReader does not
// accept one.
//
// src.URL() is parsed and passed to go-readability as the page URL. An error
// from url.Parse or from readability.FromReader is returned unchanged together
// with a zero Article.
//
// Article.PublishedTime is the empty string when go-readability reports no
// publication time. Otherwise it is the instant converted to UTC and rendered
// with the layout "2006-01-02T15:04:05Z".
func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
	u, err := url.Parse(src.URL())
	if err != nil {
		return Article{}, err
	}

	a, err := readability.FromReader(src.Reader(), u)
	if err != nil {
		return Article{}, err
	}

	var pubTime string
	if a.PublishedTime != nil {
		// The layout ends in a literal "Z" (it is not a zone verb), so the
		// time must be converted to UTC first; otherwise a timestamp carrying
		// another offset would be printed with its local wall-clock fields and
		// mislabeled as UTC.
		pubTime = a.PublishedTime.UTC().Format("2006-01-02T15:04:05Z")
	}

	return Article{
		Title:         a.Title,
		Content:       a.Content,
		TextContent:   a.TextContent,
		Length:        a.Length,
		Excerpt:       a.Excerpt,
		Byline:        a.Byline,
		SiteName:      a.SiteName,
		Lang:          a.Language,
		PublishedTime: pubTime,
	}, nil
}