answer/pkg/extractor/extractor.go
Steve Dudenhoeffer 37c18b3b58 Update module name to match new repository URL
Renamed the Go module to align with the updated repository path for better clarity and organization. This ensures consistency across the project and prevents potential import conflicts.
2025-02-21 18:45:27 -05:00

89 lines
1.7 KiB
Go

package extractor
import (
"context"
"errors"
"gitea.stevedudenhoeffer.com/steve/answer/pkg/cache"
)
// ErrFailedToExtract is the sentinel error for an extraction that did not
// succeed. The multi-extractor in this file treats it as a signal to move on
// to the next extractor without recording an error, and returns it when no
// extractor succeeds and no other error was collected.
var ErrFailedToExtract = errors.New("failed to extract")
// Article is the result returned by an Extractor.
type Article struct {
// URL is presumably the address the article was extracted from; nothing in
// this file sets it, so confirm against the concrete extractors.
URL string
// Title is the article's title.
Title string
// Body is the article's extracted content. NOTE(review): whether this is
// plain text or markup is not visible here.
Body string
}
// Extractor is an interface of systems that can extract the contents of a URL
// as an Article.
type Extractor interface {
// Extract returns the Article for url, or an error if it could not be
// produced. Returning ErrFailedToExtract lets a multi-extractor fall through
// to its next extractor without recording the failure.
Extract(ctx context.Context, url string) (Article, error)
}
// multiExtractor is an Extractor that delegates to a list of other extractors.
type multiExtractor struct {
// extractors are tried by Extract in slice order.
extractors []Extractor
}
// Compile-time check that multiExtractor implements Extractor.
var _ Extractor = multiExtractor{}
// Extract tries each configured extractor in order and returns the first
// successful result.
//
// An extractor that reports ErrFailedToExtract is skipped silently; any other
// error is collected. If ctx is done before an extractor is tried, iteration
// stops and the context's error is returned, joined with the errors collected
// so far. When every extractor fails, the collected errors are returned
// joined, or ErrFailedToExtract if none were collected.
func (m multiExtractor) Extract(ctx context.Context, url string) (Article, error) {
	var errs []error
	for _, e := range m.extractors {
		// Stop early rather than handing an already-cancelled or expired
		// context to every remaining extractor.
		if err := ctx.Err(); err != nil {
			return Article{}, errors.Join(append(errs, err)...)
		}

		article, err := e.Extract(ctx, url)
		if err == nil {
			return article, nil
		}
		if errors.Is(err, ErrFailedToExtract) {
			// "Could not extract" is expected; fall through to the next one.
			continue
		}
		errs = append(errs, err)
	}

	if len(errs) > 0 {
		return Article{}, errors.Join(errs...)
	}
	return Article{}, ErrFailedToExtract
}
// MultiExtractor returns an Extractor that tries each of the given extractors
// in order and returns the first successful result.
//
// The argument slice is copied, so a caller that passes an existing slice with
// MultiExtractor(list...) can modify that slice afterwards without affecting
// the returned Extractor.
func MultiExtractor(e ...Extractor) Extractor {
	extractors := make([]Extractor, len(e))
	copy(extractors, e)
	return multiExtractor{extractors: extractors}
}
// CacheExtractor is an Extractor that looks an article up in Cache before
// delegating to Extractor, and stores each freshly extracted Article back into
// Cache.
type CacheExtractor struct {
// Cache is read with GetJSON and written with SetJSON.
Cache cache.Cache
// Tag prefixes every cache key. When empty, "defaultextractor:" is used.
Tag string
// Extractor performs the extraction when the cache lookup does not succeed.
Extractor Extractor
}
// Compile-time check that CacheExtractor implements Extractor.
var _ Extractor = CacheExtractor{}
// Extract returns the cached Article for url when the cache lookup succeeds;
// otherwise it runs the wrapped Extractor and stores the result before
// returning it.
//
// Any error from the cache lookup is treated as a miss. An error from the
// wrapped Extractor, or from writing the result to the cache, is returned with
// a zero Article.
func (c CacheExtractor) Extract(ctx context.Context, url string) (Article, error) {
	prefix := c.Tag
	if prefix == "" {
		// The default already ends in ':', so default keys contain "::".
		prefix = "defaultextractor:"
	}
	cacheKey := prefix + ":" + url

	var cached Article
	if err := c.Cache.GetJSON(cacheKey, &cached); err == nil {
		return cached, nil
	}

	fresh, err := c.Extractor.Extract(ctx, url)
	if err != nil {
		return Article{}, err
	}

	if err := c.Cache.SetJSON(cacheKey, fresh); err != nil {
		return Article{}, err
	}
	return fresh, nil
}