Add sites/recipe package with ExtractRecipe() that works on any recipe URL. Parses JSON-LD structured data (@type: Recipe) first, with DOM fallback. Handles @graph containers, arrays, HowToStep objects, ISO 8601 durations, and various author/yield/image formats. Closes #29 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
375 lines
8.0 KiB
Go
375 lines
8.0 KiB
Go
package recipe
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"strings"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
)
|
|
|
|
// Recipe is the normalized result of extracting a recipe from a web page.
// Fields the page does not provide are left at their zero value.
type Recipe struct {
	Name         string   // recipe title
	Description  string   // short summary of the dish
	Author       string   // author name
	PrepTime     string   // preparation time, human-readable when parsed from JSON-LD
	CookTime     string   // cooking time, human-readable when parsed from JSON-LD
	TotalTime    string   // total time, human-readable when parsed from JSON-LD
	Yield        string   // servings
	Ingredients  []string // one entry per ingredient line
	Instructions []string // one entry per step, in page order
	ImageURL     string   // URL of the recipe image
	Rating       float64  // aggregate rating value; 0 when absent
	Calories     string   // calories as stated in the page's nutrition data
	SourceURL    string   // URL the recipe was extracted from
}
|
|
|
|
// Config carries the settings of the recipe extractor. It currently has no
// fields.
type Config struct{}

// DefaultConfig is the configuration used by the package-level ExtractRecipe.
var DefaultConfig Config

// validate returns the configuration to use for an extraction. There is
// nothing to check yet, so c is returned unchanged.
func (c Config) validate() Config { return c }
|
|
|
|
// ExtractRecipe extracts structured recipe data from any URL.
|
|
// Uses JSON-LD structured data when available, falls back to DOM parsing.
|
|
func (c Config) ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
|
|
c = c.validate()
|
|
|
|
slog.Info("fetching recipe", "url", url)
|
|
doc, err := b.Open(ctx, url, extractor.OpenPageOptions{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to open recipe page: %w", err)
|
|
}
|
|
defer extractor.DeferClose(doc)
|
|
|
|
timeout := 10 * time.Second
|
|
if err := doc.WaitForNetworkIdle(&timeout); err != nil {
|
|
slog.Warn("WaitForNetworkIdle failed", "err", err)
|
|
}
|
|
|
|
r, err := extractRecipeFromJSONLD(doc)
|
|
if err == nil && r.Name != "" {
|
|
r.SourceURL = url
|
|
return r, nil
|
|
}
|
|
|
|
// Fall back to DOM parsing
|
|
r = extractRecipeFromDOM(doc)
|
|
r.SourceURL = url
|
|
return r, nil
|
|
}
|
|
|
|
// ExtractRecipe is a convenience function using DefaultConfig.
|
|
func ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
|
|
return DefaultConfig.ExtractRecipe(ctx, b, url)
|
|
}
|
|
|
|
// jsonLDGraph decodes a JSON-LD document whose nodes are wrapped in a
// top-level "@graph" array. Each node is kept as raw JSON so it can be
// decoded on its own.
type jsonLDGraph struct {
	Graph []json.RawMessage `json:"@graph"`
}
|
|
|
|
// jsonLDRecipe mirrors the subset of the schema.org Recipe JSON-LD fields
// this package reads. Properties whose JSON shape varies between pages are
// decoded as interface{} and normalized later.
type jsonLDRecipe struct {
	// Type is the "@type" value: a string or an array of strings.
	Type interface{} `json:"@type"`

	Name string `json:"name"`

	Description string `json:"description"`

	// Author may be a string, an object with a "name", or an array of those.
	Author interface{} `json:"author"`

	// PrepTime, CookTime and TotalTime hold the raw duration strings.
	PrepTime string `json:"prepTime"`

	CookTime string `json:"cookTime"`

	TotalTime string `json:"totalTime"`

	// Yield may be a string, a number, or an array.
	Yield interface{} `json:"recipeYield"`

	Ingredients []string `json:"recipeIngredient"`

	// Instructions may be a string, an array of strings, or an array of
	// step objects.
	Instructions interface{} `json:"recipeInstructions"`

	// Image may be a string, an object with a "url", or an array of those.
	Image interface{} `json:"image"`

	// Nutrition is nil when the page has no "nutrition" object.
	Nutrition *struct {
		Calories string `json:"calories"`
	} `json:"nutrition"`

	// Rating is nil when the page has no "aggregateRating" object.
	Rating *struct {
		// RatingValue may be a number or a numeric string.
		RatingValue interface{} `json:"ratingValue"`
	} `json:"aggregateRating"`
}
|
|
|
|
func extractRecipeFromJSONLD(doc extractor.Node) (*Recipe, error) {
|
|
var recipe Recipe
|
|
|
|
scripts := doc.Select("script[type='application/ld+json']")
|
|
if len(scripts) == 0 {
|
|
return nil, fmt.Errorf("no JSON-LD scripts found")
|
|
}
|
|
|
|
for _, script := range scripts {
|
|
txt, err := script.Text()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
txt = strings.TrimSpace(txt)
|
|
|
|
r, err := parseJSONLDRecipe(txt)
|
|
if err == nil && r.Name != "" {
|
|
return r, nil
|
|
}
|
|
}
|
|
|
|
return &recipe, fmt.Errorf("no Recipe JSON-LD found")
|
|
}
|
|
|
|
func parseJSONLDRecipe(raw string) (*Recipe, error) {
|
|
// Try direct Recipe object
|
|
var jr jsonLDRecipe
|
|
if err := json.Unmarshal([]byte(raw), &jr); err == nil {
|
|
if isRecipeType(jr.Type) {
|
|
return convertJSONLDRecipe(&jr), nil
|
|
}
|
|
}
|
|
|
|
// Try @graph array
|
|
var graph jsonLDGraph
|
|
if err := json.Unmarshal([]byte(raw), &graph); err == nil && len(graph.Graph) > 0 {
|
|
for _, item := range graph.Graph {
|
|
var jr jsonLDRecipe
|
|
if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) {
|
|
return convertJSONLDRecipe(&jr), nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try array of objects
|
|
var arr []json.RawMessage
|
|
if err := json.Unmarshal([]byte(raw), &arr); err == nil {
|
|
for _, item := range arr {
|
|
var jr jsonLDRecipe
|
|
if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) {
|
|
return convertJSONLDRecipe(&jr), nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil, fmt.Errorf("no Recipe type in JSON-LD")
|
|
}
|
|
|
|
// isRecipeType reports whether a decoded JSON-LD "@type" value names the
// Recipe type. The value may be the string "Recipe" or an array containing
// that string; any other shape yields false.
func isRecipeType(t interface{}) bool {
	if name, isString := t.(string); isString {
		return name == "Recipe"
	}

	list, isList := t.([]interface{})
	if !isList {
		return false
	}
	for i := range list {
		name, isString := list[i].(string)
		if isString && name == "Recipe" {
			return true
		}
	}
	return false
}
|
|
|
|
func convertJSONLDRecipe(jr *jsonLDRecipe) *Recipe {
|
|
r := &Recipe{
|
|
Name: jr.Name,
|
|
Description: jr.Description,
|
|
PrepTime: formatDuration(jr.PrepTime),
|
|
CookTime: formatDuration(jr.CookTime),
|
|
TotalTime: formatDuration(jr.TotalTime),
|
|
Ingredients: jr.Ingredients,
|
|
}
|
|
|
|
// Author can be string or object
|
|
r.Author = extractAuthor(jr.Author)
|
|
|
|
// Yield can be string or array
|
|
r.Yield = extractYield(jr.Yield)
|
|
|
|
// Instructions can be string, array of strings, or array of HowToStep objects
|
|
r.Instructions = extractInstructions(jr.Instructions)
|
|
|
|
// Image can be string or object or array
|
|
r.ImageURL = extractImage(jr.Image)
|
|
|
|
// Nutrition
|
|
if jr.Nutrition != nil {
|
|
r.Calories = jr.Nutrition.Calories
|
|
}
|
|
|
|
// Rating
|
|
if jr.Rating != nil {
|
|
r.Rating = extractFloat(jr.Rating.RatingValue)
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
// extractAuthor returns the author name from a decoded JSON-LD "author"
// value.
//
// Accepted shapes are a plain string, an object with a string "name", or an
// array of those. For an array the first entry that yields a non-empty name
// is used, so a leading entry without a name does not hide later ones. Any
// other shape yields "".
func extractAuthor(v interface{}) string {
	switch a := v.(type) {
	case string:
		return a
	case map[string]interface{}:
		if name, ok := a["name"].(string); ok {
			return name
		}
	case []interface{}:
		for _, item := range a {
			if name := extractAuthor(item); name != "" {
				return name
			}
		}
	}
	return ""
}
|
|
|
|
// extractYield returns the recipe yield from a decoded JSON-LD "recipeYield"
// value.
//
// A string is returned as is and a number is formatted without decimals. For
// an array the first entry that produces a non-empty result is used, so
// numeric entries and entries after the first are honored as well. Any other
// shape yields "".
func extractYield(v interface{}) string {
	switch y := v.(type) {
	case string:
		return y
	case float64:
		return fmt.Sprintf("%.0f", y)
	case []interface{}:
		for _, item := range y {
			if s := extractYield(item); s != "" {
				return s
			}
		}
	}
	return ""
}
|
|
|
|
// extractInstructions flattens a decoded JSON-LD "recipeInstructions" value
// into an ordered list of step texts.
//
// Accepted shapes, which may be nested in one another:
//   - a string: a single step;
//   - an object with a string "text" (HowToStep): that text;
//   - an object without "text" but with "itemListElement" (HowToSection):
//     the steps it contains;
//   - an array of any of the above.
//
// Anything else contributes no steps; nil is returned when nothing is found.
func extractInstructions(v interface{}) []string {
	switch inst := v.(type) {
	case string:
		return []string{inst}
	case map[string]interface{}:
		if text, ok := inst["text"].(string); ok {
			return []string{text}
		}
		// Sections group their steps under itemListElement; a missing key
		// is nil here and yields no steps.
		return extractInstructions(inst["itemListElement"])
	case []interface{}:
		var steps []string
		for _, item := range inst {
			steps = append(steps, extractInstructions(item)...)
		}
		return steps
	}
	return nil
}
|
|
|
|
// extractImage returns the image URL from a decoded JSON-LD "image" value.
//
// Accepted shapes are a plain string, an object with a string "url", or an
// array of those. For an array the first entry that yields a non-empty URL is
// used, so a leading entry without a URL does not hide later ones. Any other
// shape yields "".
func extractImage(v interface{}) string {
	switch img := v.(type) {
	case string:
		return img
	case map[string]interface{}:
		if url, ok := img["url"].(string); ok {
			return url
		}
	case []interface{}:
		for _, item := range img {
			if url := extractImage(item); url != "" {
				return url
			}
		}
	}
	return ""
}
|
|
|
|
// extractFloat converts a decoded JSON value to a float64. Numbers are
// returned as is; strings are scanned for a leading floating-point number.
// Any other type, or a string that does not scan, yields 0.
func extractFloat(v interface{}) float64 {
	if number, isNumber := v.(float64); isNumber {
		return number
	}

	text, isText := v.(string)
	if !isText {
		return 0
	}

	var parsed float64
	// Best effort: the scan result is deliberately ignored and whatever was
	// stored in parsed is returned.
	_, _ = fmt.Sscanf(text, "%f", &parsed)
	return parsed
}
|
|
|
|
// formatDuration converts an ISO 8601 duration into a short human-readable
// form, e.g. "PT1H30M" -> "1 hr 30 min" and "P1DT2H" -> "1 day 2 hr".
//
// Designators are matched case-insensitively. In the date part (between the
// leading "P" and the "T" separator) years, months, weeks and days are
// rendered as "yr", "mo", "wk" and "day"/"days", and zero-valued date
// components are omitted. Hours, minutes and seconds are rendered as "hr",
// "min" and "sec" exactly as written, zeros included. Decimal fractions
// ("PT1.5H") keep their separator.
//
// Input that does not start like an ISO 8601 duration is scanned for
// number+h/m/s pairs only. When no component is found, the input is returned
// with any leading "PT"/"pt" removed. An empty input yields "".
func formatDuration(iso string) string {
	if iso == "" {
		return ""
	}

	// rest is the input without its "PT"/"pt" marker: it tells whether there
	// is anything to parse and is the fallback result.
	rest := strings.TrimPrefix(iso, "PT")
	rest = strings.TrimPrefix(rest, "pt")
	if rest == "" {
		return ""
	}

	isDigit := func(b byte) bool { return b >= '0' && b <= '9' }

	// Only a "P" followed by a digit or "T" starts a real duration. Free text
	// that merely begins with "P" must not have its "m" read as months.
	inDate := false
	if len(iso) > 1 && (iso[0] == 'P' || iso[0] == 'p') {
		next := iso[1]
		inDate = isDigit(next) || next == 'T' || next == 't'
	}

	var (
		parts []string
		num   []byte
	)

	// flush emits the pending number with unit and clears it. With skipZero
	// set, a number made only of zeros is discarded instead.
	flush := func(unit string, skipZero bool) {
		if len(num) == 0 {
			return
		}
		value := string(num)
		num = num[:0]
		if skipZero && strings.Trim(value, "0.,") == "" {
			return
		}
		parts = append(parts, value+" "+unit)
	}

	// Byte-wise scan: every character of interest is ASCII, and bytes of
	// multi-byte runes never match any case below.
	for i := 0; i < len(iso); i++ {
		c := iso[i]
		if c >= 'a' && c <= 'z' {
			c -= 'a' - 'A'
		}

		switch {
		case isDigit(c):
			num = append(num, c)
		case (c == '.' || c == ',') && len(num) > 0 && i+1 < len(iso) && isDigit(iso[i+1]):
			// Decimal separator inside a number.
			num = append(num, c)
		case inDate && c == 'T':
			// End of the date part; a number without designator is dropped.
			inDate = false
			num = num[:0]
		case inDate && c == 'Y':
			flush("yr", true)
		case inDate && c == 'M':
			flush("mo", true)
		case inDate && c == 'W':
			flush("wk", true)
		case inDate && c == 'D':
			if string(num) == "1" {
				flush("day", true)
			} else {
				flush("days", true)
			}
		case c == 'H':
			flush("hr", false)
		case c == 'M':
			flush("min", false)
		case c == 'S':
			flush("sec", false)
		}
	}

	if len(parts) == 0 {
		return rest
	}
	return strings.Join(parts, " ")
}
|
|
|
|
func extractRecipeFromDOM(doc extractor.Node) *Recipe {
|
|
var r Recipe
|
|
|
|
// Name — typically in h1 or h2
|
|
names := doc.Select("h1.recipe-title")
|
|
if len(names) == 0 {
|
|
names = doc.Select("h1")
|
|
}
|
|
if len(names) > 0 {
|
|
r.Name, _ = names[0].Text()
|
|
r.Name = strings.TrimSpace(r.Name)
|
|
}
|
|
|
|
// Description
|
|
descs := doc.Select("div.recipe-summary p")
|
|
if len(descs) > 0 {
|
|
r.Description, _ = descs[0].Text()
|
|
}
|
|
|
|
// Ingredients
|
|
_ = doc.ForEach("li.ingredient", func(n extractor.Node) error {
|
|
txt, _ := n.Text()
|
|
txt = strings.TrimSpace(txt)
|
|
if txt != "" {
|
|
r.Ingredients = append(r.Ingredients, txt)
|
|
}
|
|
return nil
|
|
})
|
|
|
|
// Instructions
|
|
_ = doc.ForEach("li.instruction", func(n extractor.Node) error {
|
|
txt, _ := n.Text()
|
|
txt = strings.TrimSpace(txt)
|
|
if txt != "" {
|
|
r.Instructions = append(r.Instructions, txt)
|
|
}
|
|
return nil
|
|
})
|
|
|
|
return &r
|
|
}
|