88 lines
1.6 KiB
Go
88 lines
1.6 KiB
Go
package extractor
|
|
|
|
import (
|
|
"answer/pkg/cache"
|
|
"context"
|
|
"errors"
|
|
)
|
|
|
|
// ErrFailedToExtract is the sentinel error for an extraction that produced no
// article. The multi-extractor skips extractors that fail with it (matched via
// errors.Is) and returns it itself when no extractor succeeded and no other
// error was recorded.
var ErrFailedToExtract = errors.New("failed to extract")
|
|
|
|
// Article is the value produced by an Extractor for a URL.
type Article struct {
	// URL is the article's URL.
	// NOTE(review): presumably the URL that was passed to Extract — confirm
	// against the concrete extractors.
	URL string
	// Title is the article's title.
	Title string
	// Body is the article's body text.
	Body string
}
|
|
|
|
// Extractor is an interface of systems that can extract the contents of
|
|
type Extractor interface {
|
|
Extract(ctx context.Context, url string) (Article, error)
|
|
}
|
|
|
|
type multiExtractor struct {
|
|
extractors []Extractor
|
|
}
|
|
|
|
// Compile-time assertion that multiExtractor implements Extractor.
var _ Extractor = multiExtractor{}
|
|
|
|
// Extract will try to extract the contents of a URL using all the extractors, and return the first successful result.
|
|
func (m multiExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
|
var errs []error
|
|
for _, e := range m.extractors {
|
|
article, err := e.Extract(ctx, url)
|
|
if err == nil {
|
|
return article, nil
|
|
}
|
|
|
|
if errors.Is(err, ErrFailedToExtract) {
|
|
continue
|
|
}
|
|
|
|
errs = append(errs, err)
|
|
}
|
|
|
|
if len(errs) > 0 {
|
|
return Article{}, errors.Join(errs...)
|
|
}
|
|
return Article{}, ErrFailedToExtract
|
|
}
|
|
|
|
func MultiExtractor(e ...Extractor) Extractor {
|
|
return multiExtractor{extractors: e}
|
|
}
|
|
|
|
type CacheExtractor struct {
|
|
Cache cache.Cache
|
|
Tag string
|
|
Extractor Extractor
|
|
}
|
|
|
|
// Compile-time assertion that CacheExtractor implements Extractor.
var _ Extractor = CacheExtractor{}
|
|
|
|
func (c CacheExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
|
tag := c.Tag
|
|
if tag == "" {
|
|
tag = "defaultextractor:"
|
|
}
|
|
key := tag + ":" + url
|
|
|
|
var article Article
|
|
|
|
err := c.Cache.GetJSON(key, &article)
|
|
if err == nil {
|
|
return article, nil
|
|
}
|
|
|
|
article, err = c.Extractor.Extract(ctx, url)
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
err = c.Cache.SetJSON(key, article)
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
return article, nil
|
|
}
|