Files
go-extractor/sites/recipe/recipe.go
Steve Dudenhoeffer de0a065923
All checks were successful
CI / build (pull_request) Successful in 57s
CI / vet (pull_request) Successful in 1m2s
CI / test (pull_request) Successful in 1m5s
feature: add recipe extractor with JSON-LD and DOM parsing
Add sites/recipe package with ExtractRecipe() that works on any recipe
URL. Parses JSON-LD structured data (@type: Recipe) first, with DOM
fallback. Handles @graph containers, arrays, HowToStep objects, ISO
8601 durations, and various author/yield/image formats.

Closes #29

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 16:52:28 +00:00

375 lines
8.0 KiB
Go

package recipe
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Recipe holds structured recipe data extracted from a web page.
//
// Fields are filled on a best-effort basis; anything the page does not provide
// is left at its zero value. The DOM fallback only fills Name, Description,
// Ingredients and Instructions.
type Recipe struct {
// Name is the recipe title.
Name string
// Description is the recipe summary text.
Description string
// Author is the author's name taken from the JSON-LD "author" value.
Author string
// PrepTime is the JSON-LD "prepTime" after passing through formatDuration.
PrepTime string
// CookTime is the JSON-LD "cookTime" after passing through formatDuration.
CookTime string
// TotalTime is the JSON-LD "totalTime" after passing through formatDuration.
TotalTime string
// Yield is the JSON-LD "recipeYield" rendered as text.
Yield string // servings
// Ingredients lists one ingredient line per element.
Ingredients []string
// Instructions lists one preparation step per element.
Instructions []string
// ImageURL is the image location taken from the JSON-LD "image" value.
ImageURL string
// Rating is the JSON-LD aggregateRating.ratingValue; 0 when not available.
Rating float64
// Calories is the JSON-LD nutrition.calories text.
Calories string
// SourceURL is the URL that was passed to ExtractRecipe.
SourceURL string
}
// Config holds configuration for the recipe extractor.
// It currently declares no fields, so every Config value behaves the same.
type Config struct{}
// DefaultConfig is the default recipe configuration.
// The package-level ExtractRecipe function delegates to it.
var DefaultConfig = Config{}
// validate returns the configuration to use for an extraction.
// Config has no fields at present, so c is returned unchanged.
func (c Config) validate() Config {
return c
}
// ExtractRecipe opens url in the supplied browser and returns the recipe found
// on the page.
//
// JSON-LD structured data is preferred; when it is missing, unparsable, or has
// no name, the recipe is built from the DOM instead. An error is returned only
// when the page cannot be opened. A failed wait for network idle is logged and
// extraction continues with whatever has loaded.
func (c Config) ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
	c = c.validate()

	slog.Info("fetching recipe", "url", url)

	page, openErr := b.Open(ctx, url, extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, fmt.Errorf("failed to open recipe page: %w", openErr)
	}
	defer extractor.DeferClose(page)

	idleTimeout := 10 * time.Second
	if idleErr := page.WaitForNetworkIdle(&idleTimeout); idleErr != nil {
		slog.Warn("WaitForNetworkIdle failed", "err", idleErr)
	}

	// Structured data first; anything short of a named recipe triggers the
	// DOM fallback.
	found, ldErr := extractRecipeFromJSONLD(page)
	if ldErr != nil || found.Name == "" {
		found = extractRecipeFromDOM(page)
	}

	found.SourceURL = url
	return found, nil
}
// ExtractRecipe is a convenience function using DefaultConfig.
// It forwards its arguments unchanged to DefaultConfig.ExtractRecipe and
// returns that call's results.
func ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
return DefaultConfig.ExtractRecipe(ctx, b, url)
}
// jsonLDGraph represents a JSON-LD @graph container.
// Each member is kept as raw JSON so it can be decoded on its own while the
// graph is searched for a Recipe node.
type jsonLDGraph struct {
// Graph holds the undecoded members of the "@graph" array.
Graph []json.RawMessage `json:"@graph"`
}
// jsonLDRecipe represents the JSON-LD Recipe schema fields we extract.
//
// Properties whose JSON shape varies between publishers (@type, author,
// recipeYield, recipeInstructions, image, ratingValue) are kept as decoded
// generic values and interpreted by the caller. The plain text fields are
// filled by UnmarshalJSON, which tolerates common shape variations instead of
// rejecting the whole object.
type jsonLDRecipe struct {
	Type         interface{} `json:"@type"`
	Name         string      `json:"name"`
	Description  string      `json:"description"`
	Author       interface{} `json:"author"`
	PrepTime     string      `json:"prepTime"`
	CookTime     string      `json:"cookTime"`
	TotalTime    string      `json:"totalTime"`
	Yield        interface{} `json:"recipeYield"`
	Ingredients  []string    `json:"recipeIngredient"`
	Instructions interface{} `json:"recipeInstructions"`
	Image        interface{} `json:"image"`
	Nutrition    *struct {
		Calories string `json:"calories"`
	} `json:"nutrition"`
	Rating *struct {
		RatingValue interface{} `json:"ratingValue"`
	} `json:"aggregateRating"`
}

// UnmarshalJSON decodes a JSON-LD object leniently.
//
// With plain struct decoding a single unexpected shape (a numeric "calories",
// a single-string "recipeIngredient", an array-wrapped "aggregateRating")
// makes json.Unmarshal report an error and the recipe is discarded. Here every
// property is first decoded generically and then coerced:
//
//   - text fields accept a string, a number, or an array (first usable entry);
//   - recipeIngredient accepts a single string or an array; empty entries are
//     dropped;
//   - nutrition and aggregateRating accept an object or an array of objects.
//
// It returns an error only when data is not a JSON object (or null, which
// leaves the receiver untouched).
func (jr *jsonLDRecipe) UnmarshalJSON(data []byte) error {
	// By convention a JSON null leaves the receiver as it is.
	if strings.TrimSpace(string(data)) == "null" {
		return nil
	}

	var raw struct {
		Type         interface{} `json:"@type"`
		Name         interface{} `json:"name"`
		Description  interface{} `json:"description"`
		Author       interface{} `json:"author"`
		PrepTime     interface{} `json:"prepTime"`
		CookTime     interface{} `json:"cookTime"`
		TotalTime    interface{} `json:"totalTime"`
		Yield        interface{} `json:"recipeYield"`
		Ingredients  interface{} `json:"recipeIngredient"`
		Instructions interface{} `json:"recipeInstructions"`
		Image        interface{} `json:"image"`
		Nutrition    interface{} `json:"nutrition"`
		Rating       interface{} `json:"aggregateRating"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	jr.Type = raw.Type
	jr.Name = jsonLDText(raw.Name)
	jr.Description = jsonLDText(raw.Description)
	jr.Author = raw.Author
	jr.PrepTime = jsonLDText(raw.PrepTime)
	jr.CookTime = jsonLDText(raw.CookTime)
	jr.TotalTime = jsonLDText(raw.TotalTime)
	jr.Yield = raw.Yield
	jr.Ingredients = jsonLDTextList(raw.Ingredients)
	jr.Instructions = raw.Instructions
	jr.Image = raw.Image

	jr.Nutrition = nil
	if obj := jsonLDObject(raw.Nutrition); obj != nil {
		jr.Nutrition = &struct {
			Calories string `json:"calories"`
		}{Calories: jsonLDText(obj["calories"])}
	}

	jr.Rating = nil
	if obj := jsonLDObject(raw.Rating); obj != nil {
		jr.Rating = &struct {
			RatingValue interface{} `json:"ratingValue"`
		}{RatingValue: obj["ratingValue"]}
	}

	return nil
}

// jsonLDText coerces a decoded JSON value to text. Strings are returned as
// they are, numbers are rendered in plain decimal form, arrays yield their
// first non-empty entry, and JSON-LD value objects yield their "@value".
// Anything else becomes "".
func jsonLDText(v interface{}) string {
	switch t := v.(type) {
	case string:
		return t
	case float64:
		// json.Marshal renders 250 as "250"; fmt's %v would switch to
		// exponent notation for values from one million upwards.
		if b, err := json.Marshal(t); err == nil {
			return string(b)
		}
	case []interface{}:
		for _, item := range t {
			if s := jsonLDText(item); s != "" {
				return s
			}
		}
	case map[string]interface{}:
		return jsonLDText(t["@value"])
	}
	return ""
}

// jsonLDTextList coerces a decoded JSON value to a list of texts. An array
// contributes one entry per non-empty element; any other value contributes at
// most one entry. A missing value yields nil.
func jsonLDTextList(v interface{}) []string {
	if v == nil {
		return nil
	}
	if list, ok := v.([]interface{}); ok {
		out := make([]string, 0, len(list))
		for _, item := range list {
			if s := jsonLDText(item); s != "" {
				out = append(out, s)
			}
		}
		return out
	}
	if s := jsonLDText(v); s != "" {
		return []string{s}
	}
	return nil
}

// jsonLDObject returns v as a JSON object, unwrapping the first object of an
// array when necessary. It returns nil when no object is present.
func jsonLDObject(v interface{}) map[string]interface{} {
	switch t := v.(type) {
	case map[string]interface{}:
		return t
	case []interface{}:
		for _, item := range t {
			if obj, ok := item.(map[string]interface{}); ok {
				return obj
			}
		}
	}
	return nil
}
// extractRecipeFromJSONLD scans every application/ld+json script in doc and
// returns the first one that parses to a Recipe with a non-empty name.
//
// On failure it returns a nil recipe together with an error, whether the page
// has no JSON-LD scripts at all or none of them holds a usable Recipe. Scripts
// whose text cannot be read, is blank, or does not parse are skipped so that
// one bad script does not hide a valid one further down the page.
func extractRecipeFromJSONLD(doc extractor.Node) (*Recipe, error) {
	scripts := doc.Select("script[type='application/ld+json']")
	if len(scripts) == 0 {
		return nil, fmt.Errorf("no JSON-LD scripts found")
	}

	for _, script := range scripts {
		txt, err := script.Text()
		if err != nil {
			continue
		}
		txt = strings.TrimSpace(txt)
		if txt == "" {
			continue
		}

		r, err := parseJSONLDRecipe(txt)
		if err == nil && r.Name != "" {
			return r, nil
		}
	}

	return nil, fmt.Errorf("no Recipe JSON-LD found")
}
// parseJSONLDRecipe looks for a Recipe node in a JSON-LD document and converts
// the first one found.
//
// The document may be a Recipe object, an object with an @graph container, an
// array of either, or any nesting of those. An error is returned when raw is
// not valid JSON or contains no Recipe node.
func parseJSONLDRecipe(raw string) (*Recipe, error) {
	if r := findJSONLDRecipe(json.RawMessage(raw), 0); r != nil {
		return r, nil
	}
	return nil, fmt.Errorf("no Recipe type in JSON-LD")
}

// findJSONLDRecipe walks data depth-first and returns the first Recipe node,
// or nil when there is none. Arrays and @graph containers are descended into;
// depth bounds the recursion so pathological nesting cannot run away.
func findJSONLDRecipe(data json.RawMessage, depth int) *Recipe {
	const maxDepth = 8
	if depth > maxDepth {
		return nil
	}

	// Array of nodes (also matches JSON null, which yields no items).
	var items []json.RawMessage
	if err := json.Unmarshal(data, &items); err == nil {
		for _, item := range items {
			if r := findJSONLDRecipe(item, depth+1); r != nil {
				return r
			}
		}
		return nil
	}

	// A node that is itself a Recipe.
	var jr jsonLDRecipe
	if err := json.Unmarshal(data, &jr); err == nil && isRecipeType(jr.Type) {
		return convertJSONLDRecipe(&jr)
	}

	// A node that wraps other nodes in @graph.
	var graph jsonLDGraph
	if err := json.Unmarshal(data, &graph); err == nil {
		for _, item := range graph.Graph {
			if r := findJSONLDRecipe(item, depth+1); r != nil {
				return r
			}
		}
	}

	return nil
}
// isRecipeType reports whether a decoded JSON-LD @type value names the
// schema.org Recipe type.
//
// t may be a single string or an array of strings. Besides the bare term
// "Recipe" the compact form "schema:Recipe" and the full schema.org IRIs
// (http and https) are accepted. Matching is case-sensitive, as JSON-LD terms
// are.
func isRecipeType(t interface{}) bool {
	switch v := t.(type) {
	case string:
		switch strings.TrimSpace(v) {
		case "Recipe", "schema:Recipe", "http://schema.org/Recipe", "https://schema.org/Recipe":
			return true
		}
	case []interface{}:
		for _, item := range v {
			if isRecipeType(item) {
				return true
			}
		}
	}
	return false
}
// convertJSONLDRecipe maps a decoded JSON-LD Recipe node onto a Recipe.
//
// Text fields are copied as they are, the three durations go through
// formatDuration, and the properties with a variable JSON shape (author,
// yield, instructions, image, rating value) are normalised by their extract*
// helpers. Calories and Rating stay at their zero values when the node has no
// nutrition or aggregateRating object.
func convertJSONLDRecipe(jr *jsonLDRecipe) *Recipe {
	out := new(Recipe)

	out.Name = jr.Name
	out.Description = jr.Description
	out.Ingredients = jr.Ingredients

	out.PrepTime = formatDuration(jr.PrepTime)
	out.CookTime = formatDuration(jr.CookTime)
	out.TotalTime = formatDuration(jr.TotalTime)

	// These properties may arrive as a string, an object, or an array.
	out.Author = extractAuthor(jr.Author)
	out.Yield = extractYield(jr.Yield)
	out.Instructions = extractInstructions(jr.Instructions)
	out.ImageURL = extractImage(jr.Image)

	if nutrition := jr.Nutrition; nutrition != nil {
		out.Calories = nutrition.Calories
	}
	if rating := jr.Rating; rating != nil {
		out.Rating = extractFloat(rating.RatingValue)
	}

	return out
}
// extractAuthor returns the author name from a decoded JSON-LD "author" value.
//
// A string is returned as it is, an object contributes its string "name"
// property, and an array is resolved through its first element. Every other
// shape yields "".
func extractAuthor(v interface{}) string {
	if name, ok := v.(string); ok {
		return name
	}

	if obj, ok := v.(map[string]interface{}); ok {
		// A missing or non-string name leaves the result empty.
		name, _ := obj["name"].(string)
		return name
	}

	if list, ok := v.([]interface{}); ok && len(list) > 0 {
		return extractAuthor(list[0])
	}

	return ""
}
// extractYield renders a decoded JSON-LD "recipeYield" value as text.
//
// Accepted shapes:
//   - string: returned as it is;
//   - number: printed without decimals;
//   - array: the first element that produces a non-empty result, so a numeric
//     or empty leading entry no longer hides a usable one;
//   - object (schema.org QuantitativeValue): its "value", followed by
//     "unitText" when both are present.
//
// Every other shape yields "".
func extractYield(v interface{}) string {
	switch y := v.(type) {
	case string:
		return y
	case float64:
		return fmt.Sprintf("%.0f", y)
	case []interface{}:
		for _, item := range y {
			if s := extractYield(item); s != "" {
				return s
			}
		}
	case map[string]interface{}:
		value := extractYield(y["value"])
		unit, _ := y["unitText"].(string)
		unit = strings.TrimSpace(unit)
		if value != "" && unit != "" {
			return value + " " + unit
		}
		return value
	}
	return ""
}
// extractInstructions flattens a decoded JSON-LD "recipeInstructions" value
// into an ordered list of step texts.
//
// Accepted shapes:
//   - string: a single step;
//   - object with "itemListElement" (HowToSection, ItemList): the steps found
//     in that list, recursively;
//   - object with "text" (HowToStep): that text;
//   - array: the concatenation of whatever each element produces.
//
// Step texts are whitespace-trimmed and blank steps are dropped. Shapes that
// are not recognised contribute nothing; nil is returned when no step is
// found.
func extractInstructions(v interface{}) []string {
	switch inst := v.(type) {
	case string:
		if s := strings.TrimSpace(inst); s != "" {
			return []string{s}
		}
	case []interface{}:
		var steps []string
		for _, item := range inst {
			steps = append(steps, extractInstructions(item)...)
		}
		return steps
	case map[string]interface{}:
		// Sections group their steps under itemListElement; prefer those
		// over the section's own descriptive text.
		if nested, ok := inst["itemListElement"]; ok {
			if steps := extractInstructions(nested); len(steps) > 0 {
				return steps
			}
		}
		if text, ok := inst["text"].(string); ok {
			return extractInstructions(text)
		}
	}
	return nil
}
// extractImage returns the image URL from a decoded JSON-LD "image" value.
//
// A string is taken to be the URL itself, an object contributes its string
// "url" property, and an array is resolved through its first element. Every
// other shape yields "".
func extractImage(v interface{}) string {
	if direct, ok := v.(string); ok {
		return direct
	}

	if obj, ok := v.(map[string]interface{}); ok {
		// A missing or non-string url leaves the result empty.
		link, _ := obj["url"].(string)
		return link
	}

	if list, ok := v.([]interface{}); ok && len(list) > 0 {
		return extractImage(list[0])
	}

	return ""
}
// extractFloat converts a decoded JSON value to a float64.
//
// Numbers are returned as they are. Strings are parsed from their leading
// numeric part; a decimal comma ("4,8") is accepted when the text contains no
// decimal point. Text that does not start with a number, and every other
// shape, yields 0.
func extractFloat(v interface{}) float64 {
	switch f := v.(type) {
	case float64:
		return f
	case string:
		s := strings.TrimSpace(f)
		// Some locales write ratings with a decimal comma; without this the
		// scan would stop at the comma and drop the fraction.
		if !strings.Contains(s, ".") {
			s = strings.Replace(s, ",", ".", 1)
		}
		var val float64
		if _, err := fmt.Sscanf(s, "%f", &val); err != nil {
			return 0
		}
		return val
	}
	return 0
}
// formatDuration converts an ISO 8601 duration (PT1H30M, P0DT45M, P1DT2H) to
// a short human-readable form such as "1 hr 30 min".
//
// Behaviour:
//   - an empty string or a bare "PT" yields "";
//   - the date section (before "T") understands years, months, weeks and days,
//     the time section understands hours, minutes and seconds;
//   - fractional amounts written with "." or "," are kept ("PT1.5H" gives
//     "1.5 hr");
//   - zero-valued components are omitted unless the whole duration is zero, in
//     which case the last one is reported ("PT0M" gives "0 min");
//   - text without the "P" designator is scanned leniently for hour, minute
//     and second markers, so "30 minutes" gives "30 min";
//   - when nothing can be recognised the input is returned, minus a leading
//     "PT".
func formatDuration(iso string) string {
	if iso == "" {
		return ""
	}
	iso = strings.TrimPrefix(iso, "PT")
	iso = strings.TrimPrefix(iso, "pt")
	if iso == "" {
		return ""
	}

	// A remaining "P<digit>" prefix marks a duration with a date section.
	// Requiring the digit keeps ordinary words starting with P ("Prep: 10 min")
	// on the lenient path.
	body := iso
	inTime := true
	if len(iso) > 1 && (iso[0] == 'P' || iso[0] == 'p') && iso[1] >= '0' && iso[1] <= '9' {
		body = iso[1:]
		inTime = false
	}

	var (
		parts []string
		num   strings.Builder
		zero  string // last zero-valued component, used when all are zero
	)

	// emit closes the number collected so far with the given unit.
	emit := func(singular, plural string) {
		if num.Len() == 0 {
			return
		}
		n := strings.TrimRight(num.String(), ".")
		num.Reset()

		unit := plural
		if n == "1" {
			unit = singular
		}
		part := n + " " + unit

		if strings.Trim(n, "0.") == "" {
			zero = part
			return
		}
		parts = append(parts, part)
	}

	for _, c := range body {
		switch {
		case c >= '0' && c <= '9':
			num.WriteRune(c)
		case c == '.' || c == ',':
			// Decimal separator; only meaningful inside a number.
			if num.Len() > 0 {
				num.WriteByte('.')
			}
		case c == 'T' || c == 't':
			inTime = true
		case !inTime && (c == 'Y' || c == 'y'):
			emit("yr", "yr")
		case !inTime && (c == 'W' || c == 'w'):
			emit("wk", "wk")
		case !inTime && (c == 'D' || c == 'd'):
			emit("day", "days")
		case c == 'H' || c == 'h':
			emit("hr", "hr")
		case c == 'M' || c == 'm':
			// M means months before the T separator and minutes after it.
			if inTime {
				emit("min", "min")
			} else {
				emit("mo", "mo")
			}
		case c == 'S' || c == 's':
			emit("sec", "sec")
		}
	}

	if len(parts) == 0 {
		if zero != "" {
			return zero
		}
		return iso
	}
	return strings.Join(parts, " ")
}
// extractRecipeFromDOM builds a Recipe from page markup when no usable JSON-LD
// is available.
//
// It fills only Name (first "h1.recipe-title", else first "h1"), Description
// (first "div.recipe-summary p"), Ingredients ("li.ingredient") and
// Instructions ("li.instruction"). All texts are whitespace-trimmed and empty
// list items are skipped. Extraction is best-effort: a node whose text cannot
// be read is treated as empty, and the function always returns a non-nil
// Recipe, possibly with no fields set.
func extractRecipeFromDOM(doc extractor.Node) *Recipe {
	var r Recipe

	// Name: prefer the dedicated title class, fall back to any h1.
	names := doc.Select("h1.recipe-title")
	if len(names) == 0 {
		names = doc.Select("h1")
	}
	if len(names) > 0 {
		name, _ := names[0].Text() // best-effort: unreadable text counts as empty
		r.Name = strings.TrimSpace(name)
	}

	// Description
	if descs := doc.Select("div.recipe-summary p"); len(descs) > 0 {
		desc, _ := descs[0].Text() // best-effort, as above
		r.Description = strings.TrimSpace(desc)
	}

	// collect gathers the trimmed, non-empty texts of all nodes matching
	// selector, in the order ForEach visits them.
	collect := func(selector string) []string {
		var items []string
		// The callback never fails; an error reported by ForEach itself is
		// ignored so that a partial result is still returned.
		_ = doc.ForEach(selector, func(n extractor.Node) error {
			txt, _ := n.Text()
			if txt = strings.TrimSpace(txt); txt != "" {
				items = append(items, txt)
			}
			return nil
		})
		return items
	}

	r.Ingredients = collect("li.ingredient")
	r.Instructions = collect("li.instruction")

	return &r
}