46 lines
806 B
Go
46 lines
806 B
Go
|
package extractor
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"net/url"
|
||
|
|
||
|
"github.com/go-shiori/go-readability"
|
||
|
)
|
||
|
|
||
|
type Readability struct {
|
||
|
Extractor
|
||
|
}
|
||
|
|
||
|
// Compile-time assertion that Readability satisfies the Extractor interface.
var _ Extractor = Readability{}
|
||
|
|
||
|
func (r Readability) Extract(_ context.Context, src Source) (Article, error) {
|
||
|
u, err := url.Parse(src.URL())
|
||
|
|
||
|
if err != nil {
|
||
|
return Article{}, err
|
||
|
}
|
||
|
a, err := readability.FromReader(src.Reader(), u)
|
||
|
|
||
|
if err != nil {
|
||
|
return Article{}, err
|
||
|
}
|
||
|
|
||
|
pubTime := ""
|
||
|
|
||
|
if a.PublishedTime != nil {
|
||
|
pubTime = a.PublishedTime.Format("2006-01-02T15:04:05Z")
|
||
|
}
|
||
|
return Article{
|
||
|
Title: a.Title,
|
||
|
Content: a.Content,
|
||
|
TextContent: a.TextContent,
|
||
|
Length: a.Length,
|
||
|
Excerpt: a.Excerpt,
|
||
|
Byline: a.Byline,
|
||
|
SiteName: a.SiteName,
|
||
|
Lang: a.Language,
|
||
|
PublishedTime: pubTime,
|
||
|
}, nil
|
||
|
|
||
|
}
|