From de0a065923aa7f71ebefa93debaa118cb1f8a53c Mon Sep 17 00:00:00 2001 From: Steve Dudenhoeffer Date: Sun, 15 Feb 2026 16:52:28 +0000 Subject: [PATCH] feature: add recipe extractor with JSON-LD and DOM parsing Add sites/recipe package with ExtractRecipe() that works on any recipe URL. Parses JSON-LD structured data (@type: Recipe) first, with DOM fallback. Handles @graph containers, arrays, HowToStep objects, ISO 8601 durations, and various author/yield/image formats. Closes #29 Co-Authored-By: Claude Opus 4.6 --- sites/recipe/recipe.go | 374 ++++++++++++++++++++++++++++++++++++ sites/recipe/recipe_test.go | 306 +++++++++++++++++++++++++++++ 2 files changed, 680 insertions(+) create mode 100644 sites/recipe/recipe.go create mode 100644 sites/recipe/recipe_test.go diff --git a/sites/recipe/recipe.go b/sites/recipe/recipe.go new file mode 100644 index 0000000..818cf8b --- /dev/null +++ b/sites/recipe/recipe.go @@ -0,0 +1,374 @@ +package recipe + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "strings" + "time" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" +) + +// Recipe holds structured recipe data extracted from a web page. +type Recipe struct { + Name string + Description string + Author string + PrepTime string + CookTime string + TotalTime string + Yield string // servings + Ingredients []string + Instructions []string + ImageURL string + Rating float64 + Calories string + SourceURL string +} + +// Config holds configuration for the recipe extractor. +type Config struct{} + +// DefaultConfig is the default recipe configuration. +var DefaultConfig = Config{} + +func (c Config) validate() Config { + return c +} + +// ExtractRecipe extracts structured recipe data from any URL. +// Uses JSON-LD structured data when available, falls back to DOM parsing. +func (c Config) ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) { + c = c.validate() + + slog.Info("fetching recipe", "url", url) + doc, err := b.Open(ctx, url, extractor.OpenPageOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to open recipe page: %w", err) + } + defer extractor.DeferClose(doc) + + timeout := 10 * time.Second + if err := doc.WaitForNetworkIdle(&timeout); err != nil { + slog.Warn("WaitForNetworkIdle failed", "err", err) + } + + r, err := extractRecipeFromJSONLD(doc) + if err == nil && r.Name != "" { + r.SourceURL = url + return r, nil + } + + // Fall back to DOM parsing + r = extractRecipeFromDOM(doc) + r.SourceURL = url + return r, nil +} + +// ExtractRecipe is a convenience function using DefaultConfig. +func ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) { + return DefaultConfig.ExtractRecipe(ctx, b, url) +} + +// jsonLDGraph represents a JSON-LD @graph container. +type jsonLDGraph struct { + Graph []json.RawMessage `json:"@graph"` +} + +// jsonLDRecipe represents the JSON-LD Recipe schema fields we extract. +type jsonLDRecipe struct { + Type interface{} `json:"@type"` + Name string `json:"name"` + Description string `json:"description"` + Author interface{} `json:"author"` + PrepTime string `json:"prepTime"` + CookTime string `json:"cookTime"` + TotalTime string `json:"totalTime"` + Yield interface{} `json:"recipeYield"` + Ingredients []string `json:"recipeIngredient"` + Instructions interface{} `json:"recipeInstructions"` + Image interface{} `json:"image"` + Nutrition *struct { + Calories string `json:"calories"` + } `json:"nutrition"` + Rating *struct { + RatingValue interface{} `json:"ratingValue"` + } `json:"aggregateRating"` +} + +func extractRecipeFromJSONLD(doc extractor.Node) (*Recipe, error) { + var recipe Recipe + + scripts := doc.Select("script[type='application/ld+json']") + if len(scripts) == 0 { + return nil, fmt.Errorf("no JSON-LD scripts found") + } + + for _, script := range scripts { + txt, err := script.Text() + if err != nil { + continue + } + txt = strings.TrimSpace(txt) + + r, err := parseJSONLDRecipe(txt) + if err == nil && r.Name != "" { + return r, nil + } + } + + return &recipe, fmt.Errorf("no Recipe JSON-LD found") +} + +func parseJSONLDRecipe(raw string) (*Recipe, error) { + // Try direct Recipe object + var jr jsonLDRecipe + if err := json.Unmarshal([]byte(raw), &jr); err == nil { + if isRecipeType(jr.Type) { + return convertJSONLDRecipe(&jr), nil + } + } + + // Try @graph array + var graph jsonLDGraph + if err := json.Unmarshal([]byte(raw), &graph); err == nil && len(graph.Graph) > 0 { + for _, item := range graph.Graph { + var jr jsonLDRecipe + if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) { + return convertJSONLDRecipe(&jr), nil + } + } + } + + // Try array of objects + var arr []json.RawMessage + if err := json.Unmarshal([]byte(raw), &arr); err == nil { + for _, item := range arr { + var jr jsonLDRecipe + if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) { + return convertJSONLDRecipe(&jr), nil + } + } + } + + return nil, fmt.Errorf("no Recipe type in JSON-LD") +} + +func isRecipeType(t interface{}) bool { + switch v := t.(type) { + case string: + return v == "Recipe" + case []interface{}: + for _, item := range v { + if s, ok := item.(string); ok && s == "Recipe" { + return true + } + } + } + return false +} + +func convertJSONLDRecipe(jr *jsonLDRecipe) *Recipe { + r := &Recipe{ + Name: jr.Name, + Description: jr.Description, + PrepTime: formatDuration(jr.PrepTime), + CookTime: formatDuration(jr.CookTime), + TotalTime: formatDuration(jr.TotalTime), + Ingredients: jr.Ingredients, + } + + // Author can be string or object + r.Author = extractAuthor(jr.Author) + + // Yield can be string or array + r.Yield = extractYield(jr.Yield) + + // Instructions can be string, array of strings, or array of HowToStep objects + r.Instructions = extractInstructions(jr.Instructions) + + // Image can be string or object or array + r.ImageURL = extractImage(jr.Image) + + // Nutrition + if jr.Nutrition != nil { + r.Calories = jr.Nutrition.Calories + } + + // Rating + if jr.Rating != nil { + r.Rating = extractFloat(jr.Rating.RatingValue) + } + + return r +} + +func extractAuthor(v interface{}) string { + switch a := v.(type) { + case string: + return a + case map[string]interface{}: + if name, ok := a["name"].(string); ok { + return name + } + case []interface{}: + if len(a) > 0 { + return extractAuthor(a[0]) + } + } + return "" +} + +func extractYield(v interface{}) string { + switch y := v.(type) { + case string: + return y + case []interface{}: + if len(y) > 0 { + if s, ok := y[0].(string); ok { + return s + } + } + case float64: + return fmt.Sprintf("%.0f", y) + } + return "" +} + +func extractInstructions(v interface{}) []string { + switch inst := v.(type) { + case string: + return []string{inst} + case []interface{}: + var steps []string + for _, item := range inst { + switch step := item.(type) { + case string: + steps = append(steps, step) + case map[string]interface{}: + if text, ok := step["text"].(string); ok { + steps = append(steps, text) + } + } + } + return steps + } + return nil +} + +func extractImage(v interface{}) string { + switch img := v.(type) { + case string: + return img + case map[string]interface{}: + if url, ok := img["url"].(string); ok { + return url + } + case []interface{}: + if len(img) > 0 { + return extractImage(img[0]) + } + } + return "" +} + +func extractFloat(v interface{}) float64 { + switch f := v.(type) { + case float64: + return f + case string: + var val float64 + fmt.Sscanf(f, "%f", &val) + return val + } + return 0 +} + +// formatDuration converts ISO 8601 duration (PT1H30M) to human-readable form. +func formatDuration(iso string) string { + if iso == "" { + return "" + } + + iso = strings.TrimPrefix(iso, "PT") + iso = strings.TrimPrefix(iso, "pt") + + if iso == "" { + return "" + } + + var parts []string + var num string + + for _, c := range iso { + switch { + case c >= '0' && c <= '9': + num += string(c) + case c == 'H' || c == 'h': + if num != "" { + parts = append(parts, num+" hr") + num = "" + } + case c == 'M' || c == 'm': + if num != "" { + parts = append(parts, num+" min") + num = "" + } + case c == 'S' || c == 's': + if num != "" { + parts = append(parts, num+" sec") + num = "" + } + } + } + + if len(parts) == 0 { + return iso + } + + return strings.Join(parts, " ") +} + +func extractRecipeFromDOM(doc extractor.Node) *Recipe { + var r Recipe + + // Name — typically in h1 or h2 + names := doc.Select("h1.recipe-title") + if len(names) == 0 { + names = doc.Select("h1") + } + if len(names) > 0 { + r.Name, _ = names[0].Text() + r.Name = strings.TrimSpace(r.Name) + } + + // Description + descs := doc.Select("div.recipe-summary p") + if len(descs) > 0 { + r.Description, _ = descs[0].Text() + } + + // Ingredients + _ = doc.ForEach("li.ingredient", func(n extractor.Node) error { + txt, _ := n.Text() + txt = strings.TrimSpace(txt) + if txt != "" { + r.Ingredients = append(r.Ingredients, txt) + } + return nil + }) + + // Instructions + _ = doc.ForEach("li.instruction", func(n extractor.Node) error { + txt, _ := n.Text() + txt = strings.TrimSpace(txt) + if txt != "" { + r.Instructions = append(r.Instructions, txt) + } + return nil + }) + + return &r +} diff --git a/sites/recipe/recipe_test.go b/sites/recipe/recipe_test.go new file mode 100644 index 0000000..9bea371 --- /dev/null +++ b/sites/recipe/recipe_test.go @@ -0,0 +1,306 @@ +package recipe + +import ( + "context" + "testing" + + "gitea.stevedudenhoeffer.com/steve/go-extractor" + "gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest" +) + +const sampleJSONLD = `{ + "@type": "Recipe", + "name": "Chocolate Chip Cookies", + "description": "The best chocolate chip cookies ever.", + "author": {"@type": "Person", "name": "Jane Smith"}, + "prepTime": "PT15M", + "cookTime": "PT10M", + "totalTime": "PT25M", + "recipeYield": "24 cookies", + "recipeIngredient": [ + "2 cups flour", + "1 cup sugar", + "1 cup chocolate chips" + ], + "recipeInstructions": [ + {"@type": "HowToStep", "text": "Preheat oven to 350F."}, + {"@type": "HowToStep", "text": "Mix dry ingredients."}, + {"@type": "HowToStep", "text": "Bake for 10 minutes."} + ], + "image": "https://example.com/cookies.jpg", + "nutrition": {"calories": "250 calories"}, + "aggregateRating": {"ratingValue": "4.8"} +}` + +const sampleGraphJSONLD = `{ + "@graph": [ + {"@type": "WebPage", "name": "Recipe Page"}, + { + "@type": "Recipe", + "name": "Banana Bread", + "author": "Bob Baker", + "recipeIngredient": ["3 bananas", "2 cups flour"], + "recipeInstructions": "Mix and bake at 350F for 60 minutes." + } + ] +}` + +const sampleArrayJSONLD = `[ + {"@type": "WebSite", "name": "Cooking Blog"}, + { + "@type": "Recipe", + "name": "Pancakes", + "recipeYield": ["4 servings"], + "recipeIngredient": ["1 cup flour", "1 egg", "1 cup milk"], + "image": ["https://example.com/pancakes.jpg"] + } +]` + +func makeRecipeDoc(jsonLD string) *extractortest.MockDocument { + return &extractortest.MockDocument{ + URLValue: "https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/", + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + "script[type='application/ld+json']": { + &extractortest.MockNode{TextValue: jsonLD}, + }, + }, + }, + } +} + +func TestExtractRecipeFromJSONLD(t *testing.T) { + doc := makeRecipeDoc(sampleJSONLD) + + r, err := extractRecipeFromJSONLD(doc) + if err != nil { + t.Fatalf("extractRecipeFromJSONLD() error: %v", err) + } + + if r.Name != "Chocolate Chip Cookies" { + t.Errorf("Name = %q, want %q", r.Name, "Chocolate Chip Cookies") + } + if r.Author != "Jane Smith" { + t.Errorf("Author = %q, want %q", r.Author, "Jane Smith") + } + if r.PrepTime != "15 min" { + t.Errorf("PrepTime = %q, want %q", r.PrepTime, "15 min") + } + if r.CookTime != "10 min" { + t.Errorf("CookTime = %q, want %q", r.CookTime, "10 min") + } + if r.TotalTime != "25 min" { + t.Errorf("TotalTime = %q, want %q", r.TotalTime, "25 min") + } + if r.Yield != "24 cookies" { + t.Errorf("Yield = %q, want %q", r.Yield, "24 cookies") + } + if len(r.Ingredients) != 3 { + t.Fatalf("len(Ingredients) = %d, want 3", len(r.Ingredients)) + } + if r.Ingredients[0] != "2 cups flour" { + t.Errorf("Ingredients[0] = %q, want %q", r.Ingredients[0], "2 cups flour") + } + if len(r.Instructions) != 3 { + t.Fatalf("len(Instructions) = %d, want 3", len(r.Instructions)) + } + if r.Instructions[0] != "Preheat oven to 350F." { + t.Errorf("Instructions[0] = %q, want %q", r.Instructions[0], "Preheat oven to 350F.") + } + if r.ImageURL != "https://example.com/cookies.jpg" { + t.Errorf("ImageURL = %q, want %q", r.ImageURL, "https://example.com/cookies.jpg") + } + if r.Calories != "250 calories" { + t.Errorf("Calories = %q, want %q", r.Calories, "250 calories") + } + if r.Rating != 4.8 { + t.Errorf("Rating = %v, want 4.8", r.Rating) + } +} + +func TestExtractRecipeFromJSONLD_Graph(t *testing.T) { + doc := makeRecipeDoc(sampleGraphJSONLD) + + r, err := extractRecipeFromJSONLD(doc) + if err != nil { + t.Fatalf("extractRecipeFromJSONLD() error: %v", err) + } + + if r.Name != "Banana Bread" { + t.Errorf("Name = %q, want %q", r.Name, "Banana Bread") + } + if r.Author != "Bob Baker" { + t.Errorf("Author = %q, want %q", r.Author, "Bob Baker") + } + if len(r.Ingredients) != 2 { + t.Fatalf("len(Ingredients) = %d, want 2", len(r.Ingredients)) + } + if len(r.Instructions) != 1 { + t.Fatalf("len(Instructions) = %d, want 1", len(r.Instructions)) + } + if r.Instructions[0] != "Mix and bake at 350F for 60 minutes." { + t.Errorf("Instructions[0] = %q, want %q", r.Instructions[0], "Mix and bake at 350F for 60 minutes.") + } +} + +func TestExtractRecipeFromJSONLD_Array(t *testing.T) { + doc := makeRecipeDoc(sampleArrayJSONLD) + + r, err := extractRecipeFromJSONLD(doc) + if err != nil { + t.Fatalf("extractRecipeFromJSONLD() error: %v", err) + } + + if r.Name != "Pancakes" { + t.Errorf("Name = %q, want %q", r.Name, "Pancakes") + } + if r.Yield != "4 servings" { + t.Errorf("Yield = %q, want %q", r.Yield, "4 servings") + } + if r.ImageURL != "https://example.com/pancakes.jpg" { + t.Errorf("ImageURL = %q, want %q", r.ImageURL, "https://example.com/pancakes.jpg") + } +} + +func TestExtractRecipeFromJSONLD_NoRecipe(t *testing.T) { + doc := makeRecipeDoc(`{"@type": "WebPage", "name": "Not a recipe"}`) + + _, err := extractRecipeFromJSONLD(doc) + if err == nil { + t.Error("expected error for non-Recipe JSON-LD") + } +} + +func TestExtractRecipeFromDOM(t *testing.T) { + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + "h1": { + &extractortest.MockNode{TextValue: "Grandma's Apple Pie"}, + }, + "div.recipe-summary p": { + &extractortest.MockNode{TextValue: "A classic apple pie recipe."}, + }, + "li.ingredient": { + &extractortest.MockNode{TextValue: "6 apples"}, + &extractortest.MockNode{TextValue: "1 cup sugar"}, + &extractortest.MockNode{TextValue: "2 pie crusts"}, + }, + "li.instruction": { + &extractortest.MockNode{TextValue: "Peel and slice apples."}, + &extractortest.MockNode{TextValue: "Fill pie crust and bake."}, + }, + }, + }, + } + + r := extractRecipeFromDOM(doc) + + if r.Name != "Grandma's Apple Pie" { + t.Errorf("Name = %q, want %q", r.Name, "Grandma's Apple Pie") + } + if r.Description != "A classic apple pie recipe." { + t.Errorf("Description = %q, want %q", r.Description, "A classic apple pie recipe.") + } + if len(r.Ingredients) != 3 { + t.Fatalf("len(Ingredients) = %d, want 3", len(r.Ingredients)) + } + if len(r.Instructions) != 2 { + t.Fatalf("len(Instructions) = %d, want 2", len(r.Instructions)) + } +} + +func TestExtractRecipe_MockBrowser(t *testing.T) { + doc := makeRecipeDoc(sampleJSONLD) + + browser := &extractortest.MockBrowser{ + Documents: map[string]*extractortest.MockDocument{ + "https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/": doc, + }, + } + + r, err := DefaultConfig.ExtractRecipe( + context.Background(), + browser, + "https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/", + ) + if err != nil { + t.Fatalf("ExtractRecipe() error: %v", err) + } + + if r.Name != "Chocolate Chip Cookies" { + t.Errorf("Name = %q, want %q", r.Name, "Chocolate Chip Cookies") + } + if r.SourceURL != "https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/" { + t.Errorf("SourceURL = %q, want recipe URL", r.SourceURL) + } +} + +func TestExtractRecipe_FallbackToDOM(t *testing.T) { + doc := &extractortest.MockDocument{ + URLValue: "https://example.com/recipe", + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{ + "h1": { + &extractortest.MockNode{TextValue: "Simple Recipe"}, + }, + "li.ingredient": { + &extractortest.MockNode{TextValue: "1 cup flour"}, + }, + }, + }, + } + + browser := &extractortest.MockBrowser{ + Documents: map[string]*extractortest.MockDocument{ + "https://example.com/recipe": doc, + }, + } + + r, err := DefaultConfig.ExtractRecipe(context.Background(), browser, "https://example.com/recipe") + if err != nil { + t.Fatalf("ExtractRecipe() error: %v", err) + } + + if r.Name != "Simple Recipe" { + t.Errorf("Name = %q, want %q", r.Name, "Simple Recipe") + } + if len(r.Ingredients) != 1 { + t.Fatalf("len(Ingredients) = %d, want 1", len(r.Ingredients)) + } +} + +func TestFormatDuration(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"PT15M", "15 min"}, + {"PT1H30M", "1 hr 30 min"}, + {"PT10M", "10 min"}, + {"PT2H", "2 hr"}, + {"PT45S", "45 sec"}, + {"PT1H15M30S", "1 hr 15 min 30 sec"}, + {"", ""}, + } + + for _, tt := range tests { + got := formatDuration(tt.input) + if got != tt.want { + t.Errorf("formatDuration(%q) = %q, want %q", tt.input, got, tt.want) + } + } +} + +func TestExtractRecipeFromDOM_Empty(t *testing.T) { + doc := &extractortest.MockDocument{ + MockNode: extractortest.MockNode{ + Children: map[string]extractor.Nodes{}, + }, + } + + r := extractRecipeFromDOM(doc) + if r.Name != "" { + t.Errorf("expected empty name, got %q", r.Name) + } +} -- 2.49.1