feature: add recipe extractor with JSON-LD and DOM parsing
Add sites/recipe package with ExtractRecipe() that works on any recipe URL. Parses JSON-LD structured data (@type: Recipe) first, with DOM fallback. Handles @graph containers, arrays, HowToStep objects, ISO 8601 durations, and various author/yield/image formats. Closes #29 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
374
sites/recipe/recipe.go
Normal file
374
sites/recipe/recipe.go
Normal file
@@ -0,0 +1,374 @@
|
||||
package recipe
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
)
|
||||
|
||||
// Recipe is the structured result of extracting a recipe from a web page.
type Recipe struct {
	// Name is the recipe title.
	Name string
	// Description is a short summary of the recipe.
	Description string
	// Author is the name of the recipe's author.
	Author string
	// PrepTime, CookTime and TotalTime are durations in display form
	// (JSON-LD values are passed through formatDuration).
	PrepTime, CookTime, TotalTime string
	// Yield is the number of servings, as text.
	Yield string
	// Ingredients lists one ingredient per element.
	Ingredients []string
	// Instructions lists the preparation steps in order.
	Instructions []string
	// ImageURL is the URL of an image of the dish.
	ImageURL string
	// Rating is the aggregate rating value; it stays 0 when none was found.
	Rating float64
	// Calories is the calorie information as published by the page.
	Calories string
	// SourceURL is the URL the recipe was extracted from.
	SourceURL string
}
|
||||
|
||||
// Config carries the settings of the recipe extractor. It currently has no
// fields.
type Config struct{}

// DefaultConfig is the configuration used by the package-level ExtractRecipe.
var DefaultConfig Config

// validate returns the configuration to use for an extraction. With no
// fields to check or default, it hands back the receiver unchanged.
func (c Config) validate() Config {
	validated := c
	return validated
}
|
||||
|
||||
// ExtractRecipe extracts structured recipe data from any URL.
|
||||
// Uses JSON-LD structured data when available, falls back to DOM parsing.
|
||||
func (c Config) ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
|
||||
c = c.validate()
|
||||
|
||||
slog.Info("fetching recipe", "url", url)
|
||||
doc, err := b.Open(ctx, url, extractor.OpenPageOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open recipe page: %w", err)
|
||||
}
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
timeout := 10 * time.Second
|
||||
if err := doc.WaitForNetworkIdle(&timeout); err != nil {
|
||||
slog.Warn("WaitForNetworkIdle failed", "err", err)
|
||||
}
|
||||
|
||||
r, err := extractRecipeFromJSONLD(doc)
|
||||
if err == nil && r.Name != "" {
|
||||
r.SourceURL = url
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// Fall back to DOM parsing
|
||||
r = extractRecipeFromDOM(doc)
|
||||
r.SourceURL = url
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ExtractRecipe is a convenience function using DefaultConfig.
|
||||
func ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
|
||||
return DefaultConfig.ExtractRecipe(ctx, b, url)
|
||||
}
|
||||
|
||||
// jsonLDGraph is a JSON-LD document whose nodes are wrapped in a top-level
// "@graph" array. The nodes are kept raw so each can be decoded on its own.
type jsonLDGraph struct {
	Graph []json.RawMessage `json:"@graph"`
}

// jsonLDNutrition holds the fields read from a recipe's "nutrition" object.
type jsonLDNutrition struct {
	Calories string `json:"calories"`
}

// jsonLDRating holds the fields read from a recipe's "aggregateRating" object.
type jsonLDRating struct {
	// RatingValue is left untyped because it is decoded from either a JSON
	// number or a JSON string.
	RatingValue interface{} `json:"ratingValue"`
}

// jsonLDRecipe lists the JSON-LD Recipe properties this package reads.
//
// Properties that appear in more than one JSON shape are decoded into
// interface{} and normalised later by the extract* helpers.
type jsonLDRecipe struct {
	Type         interface{}      `json:"@type"` // string or array of strings
	Name         string           `json:"name"`
	Description  string           `json:"description"`
	Author       interface{}      `json:"author"` // string, object or array
	PrepTime     string           `json:"prepTime"`
	CookTime     string           `json:"cookTime"`
	TotalTime    string           `json:"totalTime"`
	Yield        interface{}      `json:"recipeYield"` // string, number or array
	Ingredients  []string         `json:"recipeIngredient"`
	Instructions interface{}      `json:"recipeInstructions"` // string or array
	Image        interface{}      `json:"image"`              // string, object or array
	Nutrition    *jsonLDNutrition `json:"nutrition"`
	Rating       *jsonLDRating    `json:"aggregateRating"`
}
|
||||
|
||||
func extractRecipeFromJSONLD(doc extractor.Node) (*Recipe, error) {
|
||||
var recipe Recipe
|
||||
|
||||
scripts := doc.Select("script[type='application/ld+json']")
|
||||
if len(scripts) == 0 {
|
||||
return nil, fmt.Errorf("no JSON-LD scripts found")
|
||||
}
|
||||
|
||||
for _, script := range scripts {
|
||||
txt, err := script.Text()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
txt = strings.TrimSpace(txt)
|
||||
|
||||
r, err := parseJSONLDRecipe(txt)
|
||||
if err == nil && r.Name != "" {
|
||||
return r, nil
|
||||
}
|
||||
}
|
||||
|
||||
return &recipe, fmt.Errorf("no Recipe JSON-LD found")
|
||||
}
|
||||
|
||||
func parseJSONLDRecipe(raw string) (*Recipe, error) {
|
||||
// Try direct Recipe object
|
||||
var jr jsonLDRecipe
|
||||
if err := json.Unmarshal([]byte(raw), &jr); err == nil {
|
||||
if isRecipeType(jr.Type) {
|
||||
return convertJSONLDRecipe(&jr), nil
|
||||
}
|
||||
}
|
||||
|
||||
// Try @graph array
|
||||
var graph jsonLDGraph
|
||||
if err := json.Unmarshal([]byte(raw), &graph); err == nil && len(graph.Graph) > 0 {
|
||||
for _, item := range graph.Graph {
|
||||
var jr jsonLDRecipe
|
||||
if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) {
|
||||
return convertJSONLDRecipe(&jr), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try array of objects
|
||||
var arr []json.RawMessage
|
||||
if err := json.Unmarshal([]byte(raw), &arr); err == nil {
|
||||
for _, item := range arr {
|
||||
var jr jsonLDRecipe
|
||||
if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) {
|
||||
return convertJSONLDRecipe(&jr), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("no Recipe type in JSON-LD")
|
||||
}
|
||||
|
||||
// isRecipeType reports whether a decoded JSON-LD "@type" value names the
// schema.org Recipe type. t may be a single string or an array of strings;
// any other shape yields false.
func isRecipeType(t interface{}) bool {
	switch v := t.(type) {
	case string:
		return isRecipeTypeName(v)
	case []interface{}:
		for _, item := range v {
			if s, ok := item.(string); ok && isRecipeTypeName(s) {
				return true
			}
		}
	}
	return false
}

// isRecipeTypeName reports whether name is the Recipe type, written either as
// the bare term or as one of the schema.org IRI / compact-IRI spellings.
func isRecipeTypeName(name string) bool {
	switch strings.TrimSpace(name) {
	case "Recipe",
		"schema:Recipe",
		"http://schema.org/Recipe",
		"https://schema.org/Recipe":
		return true
	}
	return false
}
|
||||
|
||||
func convertJSONLDRecipe(jr *jsonLDRecipe) *Recipe {
|
||||
r := &Recipe{
|
||||
Name: jr.Name,
|
||||
Description: jr.Description,
|
||||
PrepTime: formatDuration(jr.PrepTime),
|
||||
CookTime: formatDuration(jr.CookTime),
|
||||
TotalTime: formatDuration(jr.TotalTime),
|
||||
Ingredients: jr.Ingredients,
|
||||
}
|
||||
|
||||
// Author can be string or object
|
||||
r.Author = extractAuthor(jr.Author)
|
||||
|
||||
// Yield can be string or array
|
||||
r.Yield = extractYield(jr.Yield)
|
||||
|
||||
// Instructions can be string, array of strings, or array of HowToStep objects
|
||||
r.Instructions = extractInstructions(jr.Instructions)
|
||||
|
||||
// Image can be string or object or array
|
||||
r.ImageURL = extractImage(jr.Image)
|
||||
|
||||
// Nutrition
|
||||
if jr.Nutrition != nil {
|
||||
r.Calories = jr.Nutrition.Calories
|
||||
}
|
||||
|
||||
// Rating
|
||||
if jr.Rating != nil {
|
||||
r.Rating = extractFloat(jr.Rating.RatingValue)
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
// extractAuthor returns the author name from a decoded JSON-LD "author"
// value. It accepts a plain string, an object with a string "name", or an
// array, in which case only the first element is considered. Anything else
// yields "".
func extractAuthor(v interface{}) string {
	if name, ok := v.(string); ok {
		return name
	}
	if obj, ok := v.(map[string]interface{}); ok {
		// A missing or non-string "name" leaves name empty.
		name, _ := obj["name"].(string)
		return name
	}
	if list, ok := v.([]interface{}); ok && len(list) > 0 {
		return extractAuthor(list[0])
	}
	return ""
}
|
||||
|
||||
// extractYield returns the servings text from a decoded JSON-LD
// "recipeYield" value.
//
// Accepted shapes:
//   - string: returned with surrounding whitespace removed
//   - number: formatted without trailing zeros (4 -> "4", 1.5 -> "1.5")
//   - array: the first element that produces a non-empty result
//   - object: its "value", followed by "unitText" when present
//     (the schema.org QuantitativeValue form)
//
// Any other shape yields "".
func extractYield(v interface{}) string {
	switch y := v.(type) {
	case string:
		return strings.TrimSpace(y)
	case float64:
		// Two decimals are plenty for a serving count; strip the padding so
		// whole numbers print as "4" rather than "4.00".
		s := fmt.Sprintf("%.2f", y)
		return strings.TrimRight(strings.TrimRight(s, "0"), ".")
	case []interface{}:
		// Sites often publish e.g. [4, "4 servings"]; skip unusable entries
		// instead of giving up on the first one.
		for _, item := range y {
			if s := extractYield(item); s != "" {
				return s
			}
		}
	case map[string]interface{}:
		value := extractYield(y["value"])
		if value == "" {
			return ""
		}
		if unit, ok := y["unitText"].(string); ok {
			if unit = strings.TrimSpace(unit); unit != "" {
				return value + " " + unit
			}
		}
		return value
	}
	return ""
}
|
||||
|
||||
// extractInstructions flattens a decoded JSON-LD "recipeInstructions" value
// into an ordered list of step texts.
//
// Accepted shapes are a string, a step object with a "text" property, a
// section object whose steps sit under "itemListElement", and arrays mixing
// any of these. Step text is trimmed and blank steps are dropped. The result
// is nil when no step is found.
func extractInstructions(v interface{}) []string {
	var steps []string
	collectInstructionSteps(v, &steps)
	return steps
}

// collectInstructionSteps appends the steps found in v to *steps, walking
// into arrays and "itemListElement" lists in document order.
func collectInstructionSteps(v interface{}, steps *[]string) {
	switch inst := v.(type) {
	case string:
		if s := strings.TrimSpace(inst); s != "" {
			*steps = append(*steps, s)
		}
	case []interface{}:
		for _, item := range inst {
			collectInstructionSteps(item, steps)
		}
	case map[string]interface{}:
		// HowToStep: the step itself.
		if text, ok := inst["text"].(string); ok {
			if text = strings.TrimSpace(text); text != "" {
				*steps = append(*steps, text)
				return
			}
		}
		// HowToSection: a named group of steps.
		if children, ok := inst["itemListElement"]; ok {
			collectInstructionSteps(children, steps)
		}
	}
}
|
||||
|
||||
// extractImage returns the image URL from a decoded JSON-LD "image" value.
// A string is returned as is, an object contributes its string "url", and an
// array is resolved through its first element. Anything else yields "".
func extractImage(v interface{}) string {
	// Arrays are unwrapped by looping on their first element until a string,
	// an object or an unsupported value is reached.
	for {
		switch cur := v.(type) {
		case string:
			return cur
		case map[string]interface{}:
			link, _ := cur["url"].(string)
			return link
		case []interface{}:
			if len(cur) == 0 {
				return ""
			}
			v = cur[0]
		default:
			return ""
		}
	}
}
|
||||
|
||||
// extractFloat converts a decoded JSON value to a float64. A float64 is
// returned unchanged and a string is scanned for a leading number. Anything
// else, including a string that does not start with a number, yields 0.
func extractFloat(v interface{}) float64 {
	if num, ok := v.(float64); ok {
		return num
	}

	text, ok := v.(string)
	if !ok {
		return 0
	}

	var parsed float64
	// The scan error is deliberately ignored: parsed stays 0 when the text
	// is not numeric.
	_, _ = fmt.Sscanf(text, "%f", &parsed)
	return parsed
}
|
||||
|
||||
// formatDuration converts an ISO 8601 duration such as "PT1H30M" or
// "P1DT2H" into a short readable form ("1 hr 30 min", "1 day 2 hr").
//
// Parsing is case-insensitive and tolerates a missing leading "P". Both the
// date part (years, months, weeks, days) and the time part (hours, minutes,
// seconds) are understood; "M" means months before the "T" separator and
// minutes after it. Components whose value is zero are omitted unless every
// component is zero, in which case the last one is kept (e.g. "0 min").
//
// An empty input, or one holding only the "P"/"T" designators, yields "".
// Input that is not a duration (for example "30 minutes") is returned
// unchanged apart from surrounding whitespace.
func formatDuration(iso string) string {
	s := strings.TrimSpace(iso)
	if s == "" {
		return ""
	}

	// Without the "P" designator assume a bare time part such as "1H30M".
	rest, inTime := s, true
	if rest[0] == 'P' || rest[0] == 'p' {
		rest, inTime = rest[1:], false
	}

	var parts, nonZero []string
	var num strings.Builder
	for _, c := range rest {
		switch {
		case c >= '0' && c <= '9', c == '.', c == ',':
			// ISO 8601 allows a decimal fraction with either separator.
			num.WriteRune(c)
			continue
		case (c == 'T' || c == 't') && num.Len() == 0:
			inTime = true
			continue
		}

		unit := durationUnit(c, inTime)
		if unit == "" || num.Len() == 0 {
			// Not a duration after all; hand the text back untouched.
			return s
		}
		value := num.String()
		num.Reset()

		if unit == "day" && value != "1" {
			unit = "days"
		}
		part := value + " " + unit
		parts = append(parts, part)
		if strings.Trim(value, "0.,") != "" {
			nonZero = append(nonZero, part)
		}
	}
	if num.Len() > 0 {
		// Trailing number without a unit designator.
		return s
	}

	switch {
	case len(nonZero) > 0:
		return strings.Join(nonZero, " ")
	case len(parts) > 0:
		return parts[len(parts)-1]
	}
	return ""
}

// durationUnit maps an ISO 8601 designator to its display unit, or "" when
// the designator is unknown or not valid in the current part. Hours and
// seconds are accepted in either part because they are unambiguous.
func durationUnit(c rune, inTime bool) string {
	switch c {
	case 'H', 'h':
		return "hr"
	case 'S', 's':
		return "sec"
	case 'M', 'm':
		if inTime {
			return "min"
		}
		return "mo"
	case 'Y', 'y':
		if !inTime {
			return "yr"
		}
	case 'W', 'w':
		if !inTime {
			return "wk"
		}
	case 'D', 'd':
		if !inTime {
			return "day"
		}
	}
	return ""
}
|
||||
|
||||
func extractRecipeFromDOM(doc extractor.Node) *Recipe {
|
||||
var r Recipe
|
||||
|
||||
// Name — typically in h1 or h2
|
||||
names := doc.Select("h1.recipe-title")
|
||||
if len(names) == 0 {
|
||||
names = doc.Select("h1")
|
||||
}
|
||||
if len(names) > 0 {
|
||||
r.Name, _ = names[0].Text()
|
||||
r.Name = strings.TrimSpace(r.Name)
|
||||
}
|
||||
|
||||
// Description
|
||||
descs := doc.Select("div.recipe-summary p")
|
||||
if len(descs) > 0 {
|
||||
r.Description, _ = descs[0].Text()
|
||||
}
|
||||
|
||||
// Ingredients
|
||||
_ = doc.ForEach("li.ingredient", func(n extractor.Node) error {
|
||||
txt, _ := n.Text()
|
||||
txt = strings.TrimSpace(txt)
|
||||
if txt != "" {
|
||||
r.Ingredients = append(r.Ingredients, txt)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Instructions
|
||||
_ = doc.ForEach("li.instruction", func(n extractor.Node) error {
|
||||
txt, _ := n.Text()
|
||||
txt = strings.TrimSpace(txt)
|
||||
if txt != "" {
|
||||
r.Instructions = append(r.Instructions, txt)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
return &r
|
||||
}
|
||||
Reference in New Issue
Block a user