Renamed the Go module to align with the updated repository path for better clarity and organization. This ensures consistency across the project and prevents potential import conflicts.
89 lines
1.7 KiB
Go
89 lines
1.7 KiB
Go
package extractor
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/answer/pkg/cache"
|
|
)
|
|
|
|
// ErrFailedToExtract is the sentinel error for an extraction that produced no
// article. multiExtractor skips any extractor that fails with it and returns it
// when no extractor succeeds and no other error was reported.
var ErrFailedToExtract = errors.New("failed to extract")
|
|
|
|
// Article is the value produced by an Extractor: a URL together with a title
// and a body, all stored as plain strings.
type Article struct {
	// URL is presumably the address the article was extracted from; confirm
	// against the concrete extractors.
	URL string

	// Title is the article's title.
	Title string

	// Body is the article's body text.
	Body string
}
|
|
|
|
// Extractor is an interface of systems that can extract the contents of
|
|
type Extractor interface {
|
|
Extract(ctx context.Context, url string) (Article, error)
|
|
}
|
|
|
|
type multiExtractor struct {
|
|
extractors []Extractor
|
|
}
|
|
|
|
// Compile-time check that multiExtractor implements Extractor.
var _ Extractor = multiExtractor{}
|
|
|
|
// Extract will try to extract the contents of a URL using all the extractors, and return the first successful result.
|
|
func (m multiExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
|
var errs []error
|
|
for _, e := range m.extractors {
|
|
article, err := e.Extract(ctx, url)
|
|
if err == nil {
|
|
return article, nil
|
|
}
|
|
|
|
if errors.Is(err, ErrFailedToExtract) {
|
|
continue
|
|
}
|
|
|
|
errs = append(errs, err)
|
|
}
|
|
|
|
if len(errs) > 0 {
|
|
return Article{}, errors.Join(errs...)
|
|
}
|
|
return Article{}, ErrFailedToExtract
|
|
}
|
|
|
|
func MultiExtractor(e ...Extractor) Extractor {
|
|
return multiExtractor{extractors: e}
|
|
}
|
|
|
|
type CacheExtractor struct {
|
|
Cache cache.Cache
|
|
Tag string
|
|
Extractor Extractor
|
|
}
|
|
|
|
// Compile-time check that CacheExtractor implements Extractor.
var _ Extractor = CacheExtractor{}
|
|
|
|
func (c CacheExtractor) Extract(ctx context.Context, url string) (Article, error) {
|
|
tag := c.Tag
|
|
if tag == "" {
|
|
tag = "defaultextractor:"
|
|
}
|
|
key := tag + ":" + url
|
|
|
|
var article Article
|
|
|
|
err := c.Cache.GetJSON(key, &article)
|
|
if err == nil {
|
|
return article, nil
|
|
}
|
|
|
|
article, err = c.Extractor.Extract(ctx, url)
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
err = c.Cache.SetJSON(key, article)
|
|
if err != nil {
|
|
return Article{}, err
|
|
}
|
|
|
|
return article, nil
|
|
}
|