feature: add recipe extractor with JSON-LD and DOM parsing #48

Merged
Claude merged 1 commit from feature/allrecipes-extractor into main 2026-02-15 16:52:47 +00:00
2 changed files with 680 additions and 0 deletions
Showing only changes of commit de0a065923 - Show all commits

374
sites/recipe/recipe.go Normal file
View File

@@ -0,0 +1,374 @@
package recipe
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
// Recipe holds structured recipe data extracted from a web page.
//
// Every field is optional and is left at its zero value when the page does
// not provide it. The DOM fallback only fills Name, Description, Ingredients
// and Instructions; the remaining fields come from JSON-LD.
type Recipe struct {
// Name is the recipe title.
Name string
// Description is the summary text published with the recipe.
Description string
// Author is the author's name.
Author string
// PrepTime, CookTime and TotalTime are display strings. Values taken from
// JSON-LD are passed through formatDuration first (e.g. "PT15M" becomes
// "15 min").
PrepTime string
CookTime string
TotalTime string
// Yield is the published yield, e.g. "24 cookies" or "4 servings".
Yield string // servings
// Ingredients lists one ingredient line per element, in page order.
Ingredients []string
// Instructions lists one preparation step per element, in page order.
Instructions []string
// ImageURL is the URL of the recipe image.
ImageURL string
// Rating is the aggregate rating value; 0 when absent or unparseable.
Rating float64
// Calories is the calorie text as published, e.g. "250 calories".
Calories string
// SourceURL is the URL that was passed to ExtractRecipe.
SourceURL string
}
// Config carries the settings for recipe extraction. It currently has no
// fields, so the zero value is ready to use.
type Config struct{}

// DefaultConfig is the configuration used by the package-level ExtractRecipe.
var DefaultConfig Config

// validate returns the configuration to use for an extraction. Config has no
// fields to normalise, so c is returned unchanged.
func (c Config) validate() Config { return c }
// ExtractRecipe opens url in b and extracts structured recipe data from it.
//
// JSON-LD structured data is preferred; when the page has none, or none of
// it yields a named Recipe, the result is built from DOM selectors instead.
// A 10-second timeout is passed to WaitForNetworkIdle before extraction, and
// a failure there is only logged.
//
// An error is returned only when the page cannot be opened. A page without
// any recognisable recipe still yields a non-nil Recipe (with SourceURL set)
// and a nil error.
func (c Config) ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
	c = c.validate()

	slog.Info("fetching recipe", "url", url)

	page, openErr := b.Open(ctx, url, extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, fmt.Errorf("failed to open recipe page: %w", openErr)
	}
	defer extractor.DeferClose(page)

	idleTimeout := 10 * time.Second
	if waitErr := page.WaitForNetworkIdle(&idleTimeout); waitErr != nil {
		// Not fatal: whatever has loaded so far may already hold the recipe.
		slog.Warn("WaitForNetworkIdle failed", "err", waitErr)
	}

	rec, ldErr := extractRecipeFromJSONLD(page)
	if ldErr != nil || rec.Name == "" {
		// No usable structured data: fall back to DOM parsing.
		rec = extractRecipeFromDOM(page)
	}

	rec.SourceURL = url
	return rec, nil
}
// ExtractRecipe extracts structured recipe data from url using DefaultConfig.
// It is shorthand for DefaultConfig.ExtractRecipe(ctx, b, url) and returns
// exactly what that call returns.
func ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
return DefaultConfig.ExtractRecipe(ctx, b, url)
}
// jsonLDGraph represents a JSON-LD @graph container.
//
// Each node is kept as raw JSON so it can be decoded on its own and checked
// for the Recipe type.
type jsonLDGraph struct {
// Graph holds the undecoded nodes listed under "@graph".
Graph []json.RawMessage `json:"@graph"`
}
// jsonLDRecipe represents the JSON-LD Recipe schema fields we extract.
//
// Fields typed interface{} hold properties whose JSON shape varies between
// publishers; the extract* helpers normalise them.
type jsonLDRecipe struct {
// Type is a single type name or an array of type names (see isRecipeType).
Type interface{} `json:"@type"`
Name string `json:"name"`
Description string `json:"description"`
// Author is a string, an object with a "name", or an array of either.
Author interface{} `json:"author"`
// PrepTime, CookTime and TotalTime are ISO 8601 durations such as "PT15M".
PrepTime string `json:"prepTime"`
CookTime string `json:"cookTime"`
TotalTime string `json:"totalTime"`
// Yield is a string, a number, or an array.
Yield interface{} `json:"recipeYield"`
Ingredients []string `json:"recipeIngredient"`
// Instructions is a string, an array of strings, or an array of step objects.
Instructions interface{} `json:"recipeInstructions"`
// Image is a URL string, an object with a "url", or an array of either.
Image interface{} `json:"image"`
// Nutrition is nil when the recipe has no "nutrition" object.
Nutrition *struct {
Calories string `json:"calories"`
} `json:"nutrition"`
// Rating is nil when the recipe has no "aggregateRating" object.
Rating *struct {
// RatingValue is a number or a numeric string.
RatingValue interface{} `json:"ratingValue"`
} `json:"aggregateRating"`
}
// extractRecipeFromJSONLD returns the first named Recipe found in the
// document's JSON-LD script tags.
//
// Every <script type="application/ld+json"> element is tried in document
// order. A script that cannot be read, does not parse, or holds no Recipe
// with a non-empty name is skipped so that it cannot hide a later valid one.
//
// On failure the returned *Recipe is always nil, so callers must check the
// error before using the result.
func extractRecipeFromJSONLD(doc extractor.Node) (*Recipe, error) {
	scripts := doc.Select("script[type='application/ld+json']")
	if len(scripts) == 0 {
		return nil, fmt.Errorf("no JSON-LD scripts found")
	}

	for _, script := range scripts {
		txt, err := script.Text()
		if err != nil {
			continue
		}

		r, err := parseJSONLDRecipe(strings.TrimSpace(txt))
		if err != nil || r.Name == "" {
			continue
		}
		return r, nil
	}

	return nil, fmt.Errorf("no Recipe JSON-LD found")
}
// parseJSONLDRecipe finds a Recipe node in one JSON-LD document.
//
// Three layouts are tried in order: the document itself is a Recipe object,
// the document is an object whose "@graph" array contains a Recipe, or the
// document is a top-level array containing a Recipe. The first match wins.
// An error is returned when no layout yields a Recipe.
func parseJSONLDRecipe(raw string) (*Recipe, error) {
	data := []byte(raw)

	// asRecipe decodes one JSON value and reports whether it is a Recipe node.
	asRecipe := func(b []byte) (*Recipe, bool) {
		var node jsonLDRecipe
		if json.Unmarshal(b, &node) != nil || !isRecipeType(node.Type) {
			return nil, false
		}
		return convertJSONLDRecipe(&node), true
	}

	// Layout 1: the document is the Recipe.
	if r, ok := asRecipe(data); ok {
		return r, nil
	}

	// Layout 2: the Recipe is one node of an @graph array.
	var graph jsonLDGraph
	if json.Unmarshal(data, &graph) == nil {
		for _, node := range graph.Graph {
			if r, ok := asRecipe(node); ok {
				return r, nil
			}
		}
	}

	// Layout 3: the document is an array of nodes.
	var nodes []json.RawMessage
	if json.Unmarshal(data, &nodes) == nil {
		for _, node := range nodes {
			if r, ok := asRecipe(node); ok {
				return r, nil
			}
		}
	}

	return nil, fmt.Errorf("no Recipe type in JSON-LD")
}
// isRecipeType reports whether a decoded JSON-LD "@type" value names the
// schema.org Recipe type.
//
// t may be a single string or an array of strings (a node can carry several
// types); any other shape, including nil, is not a Recipe. Besides the bare
// term "Recipe", the full schema.org IRI (http or https) and the "schema:"
// compact form are accepted, since they denote the same type. Matching is
// case-sensitive.
func isRecipeType(t interface{}) bool {
	isRecipe := func(name string) bool {
		switch name {
		case "Recipe", "schema:Recipe", "http://schema.org/Recipe", "https://schema.org/Recipe":
			return true
		}
		return false
	}

	switch v := t.(type) {
	case string:
		return isRecipe(v)
	case []interface{}:
		for _, item := range v {
			if s, ok := item.(string); ok && isRecipe(s) {
				return true
			}
		}
	}
	return false
}
// convertJSONLDRecipe maps a decoded JSON-LD Recipe node onto a Recipe.
//
// Plain string fields are copied as-is, the three durations go through
// formatDuration, and the properties whose JSON shape varies (author, yield,
// instructions, image, rating) are normalised by their extract* helpers.
// SourceURL is not set here.
func convertJSONLDRecipe(jr *jsonLDRecipe) *Recipe {
	out := new(Recipe)

	out.Name = jr.Name
	out.Description = jr.Description
	out.Ingredients = jr.Ingredients

	out.PrepTime = formatDuration(jr.PrepTime)
	out.CookTime = formatDuration(jr.CookTime)
	out.TotalTime = formatDuration(jr.TotalTime)

	// These properties arrive as strings, objects or arrays.
	out.Author = extractAuthor(jr.Author)
	out.Yield = extractYield(jr.Yield)
	out.Instructions = extractInstructions(jr.Instructions)
	out.ImageURL = extractImage(jr.Image)

	// Both nested objects are optional.
	if nutrition := jr.Nutrition; nutrition != nil {
		out.Calories = nutrition.Calories
	}
	if rating := jr.Rating; rating != nil {
		out.Rating = extractFloat(rating.RatingValue)
	}

	return out
}
// extractAuthor returns the author name from a decoded JSON-LD "author"
// value.
//
// A string is returned as-is, an object contributes its string "name"
// property, and an array is resolved through its first element only.
// Anything else yields "".
func extractAuthor(v interface{}) string {
	if name, ok := v.(string); ok {
		return name
	}
	if obj, ok := v.(map[string]interface{}); ok {
		// A missing or non-string "name" leaves the result empty.
		name, _ := obj["name"].(string)
		return name
	}
	if list, ok := v.([]interface{}); ok && len(list) > 0 {
		return extractAuthor(list[0])
	}
	return ""
}
// extractYield returns the yield text from a decoded JSON-LD "recipeYield"
// value.
//
// A string is returned as-is and a number is formatted without decimals.
// An array is resolved through its first element, whatever that element's
// shape, which matches how extractAuthor and extractImage treat arrays;
// so [8, "8 servings"] yields "8" rather than "". Anything else yields "".
func extractYield(v interface{}) string {
	switch y := v.(type) {
	case string:
		return y
	case float64:
		return fmt.Sprintf("%.0f", y)
	case []interface{}:
		if len(y) > 0 {
			return extractYield(y[0])
		}
	}
	return ""
}
// extractInstructions flattens a decoded JSON-LD "recipeInstructions" value
// into an ordered list of step texts.
//
// Supported shapes:
//   - a string: returned as a single step;
//   - a step object (HowToStep): contributes its string "text" property;
//   - a section object (HowToSection): contributes the steps nested under
//     its "itemListElement" property;
//   - an array of any of the above, flattened in order.
//
// Elements of any other shape are skipped. The result is nil when no step is
// found.
func extractInstructions(v interface{}) []string {
	switch inst := v.(type) {
	case string:
		return []string{inst}
	case []interface{}:
		var steps []string
		for _, item := range inst {
			steps = append(steps, extractInstructions(item)...)
		}
		return steps
	case map[string]interface{}:
		if text, ok := inst["text"].(string); ok {
			return []string{text}
		}
		// No text of its own: treat it as a section grouping further steps.
		if nested, ok := inst["itemListElement"]; ok {
			return extractInstructions(nested)
		}
	}
	return nil
}
// extractImage returns the image URL from a decoded JSON-LD "image" value.
//
// A string is returned as-is, an object contributes its string "url"
// property, and an array is resolved through its first element only.
// Anything else yields "".
func extractImage(v interface{}) string {
	// Each pass either produces an answer or descends into the first element
	// of an array.
	for {
		switch img := v.(type) {
		case string:
			return img
		case map[string]interface{}:
			// A missing or non-string "url" leaves the result empty.
			url, _ := img["url"].(string)
			return url
		case []interface{}:
			if len(img) == 0 {
				return ""
			}
			v = img[0]
		default:
			return ""
		}
	}
}
// extractFloat converts a decoded JSON value to a float64.
//
// A JSON number is returned as-is. A string is scanned for a leading number,
// so "4.8" yields 4.8; a string that does not start with a number yields 0.
// Any other shape, including nil, yields 0.
func extractFloat(v interface{}) float64 {
	switch f := v.(type) {
	case float64:
		return f
	case string:
		var val float64
		if _, err := fmt.Sscanf(f, "%f", &val); err != nil {
			// Not numeric: report "no value" rather than a partial parse.
			return 0
		}
		return val
	}
	return 0
}
// formatDuration converts an ISO 8601 duration such as "PT1H30M" into a
// human-readable form such as "1 hr 30 min".
//
// Both the date part (years, months, weeks, days) and the time part (hours,
// minutes, seconds) are understood, so "P1DT2H" becomes "1 day 2 hr". The
// letter M means months before the "T" separator and minutes after it.
// Decimal fractions are kept ("PT1.5H" becomes "1.5 hr"), with "," accepted
// as the decimal separator. Designators are matched case-insensitively.
//
// Components whose value is zero are omitted when any non-zero component is
// present ("P0DT0H30M" becomes "30 min"); an all-zero duration keeps its
// first component ("PT0M" becomes "0 min").
//
// Input that does not start like an ISO 8601 duration is scanned leniently
// for number+h/m/s pairs, so "30 minutes" becomes "30 min". When nothing can
// be parsed, the input is returned with a leading "PT"/"pt" removed. An
// empty input yields "".
func formatDuration(iso string) string {
	if iso == "" {
		return ""
	}

	// Result for unparseable input: the text minus its time marker.
	fallback := strings.TrimPrefix(strings.TrimPrefix(iso, "PT"), "pt")
	if fallback == "" {
		return ""
	}

	// Only treat the input as ISO 8601 when "P" is followed by a digit or by
	// the time separator; free text such as "Prep 10 min" stays lenient.
	body := iso
	isISO := len(iso) > 1 &&
		(iso[0] == 'P' || iso[0] == 'p') &&
		(iso[1] == 'T' || iso[1] == 't' || (iso[1] >= '0' && iso[1] <= '9'))
	if isISO {
		body = iso[1:]
	}
	inDate := isISO // an ISO duration starts in its date part

	var (
		parts []string        // non-zero components, in input order
		zero  string          // first zero-valued component, for all-zero input
		num   strings.Builder // digits collected for the current component
	)

	// emit closes the current component with the given unit label. Resetting
	// num here is what keeps digits from leaking into the next component.
	emit := func(unit string) {
		n := strings.TrimSuffix(num.String(), ".")
		num.Reset()
		if n == "" {
			return
		}
		part := n + " " + unit
		if strings.Trim(n, "0.") != "" {
			parts = append(parts, part)
		} else if zero == "" {
			zero = part
		}
	}

	for _, c := range body {
		switch {
		case c >= '0' && c <= '9':
			num.WriteRune(c)
		case c == '.' || c == ',':
			// At most one decimal separator, and only after a digit.
			if num.Len() > 0 && !strings.Contains(num.String(), ".") {
				num.WriteByte('.')
			}
		case c == 'H' || c == 'h':
			emit("hr")
		case c == 'M' || c == 'm':
			if inDate {
				emit("mo")
			} else {
				emit("min")
			}
		case c == 'S' || c == 's':
			emit("sec")
		case !isISO:
			// Lenient mode: every other character is ignored.
		case c == 'T' || c == 't':
			inDate = false
			num.Reset()
		case c == 'Y' || c == 'y':
			emit("yr")
		case c == 'W' || c == 'w':
			emit("wk")
		case c == 'D' || c == 'd':
			if num.String() == "1" {
				emit("day")
			} else {
				emit("days")
			}
		}
	}

	if len(parts) == 0 {
		if zero != "" {
			return zero
		}
		return fallback
	}
	return strings.Join(parts, " ")
}
// extractRecipeFromDOM builds a Recipe from common recipe markup when no
// structured data is available.
//
// It reads the title from "h1.recipe-title" (or, failing that, the first
// "h1"), the description from the first "div.recipe-summary p", ingredients
// from "li.ingredient" and instructions from "li.instruction". All text is
// trimmed of surrounding whitespace and empty list items are dropped.
//
// Extraction is best effort: a selector that matches nothing, or a node
// whose text cannot be read, simply leaves the corresponding field empty.
// The result is never nil.
func extractRecipeFromDOM(doc extractor.Node) *Recipe {
	var r Recipe

	// text reads a node's trimmed text. A read error is deliberately treated
	// like an empty node so one bad element cannot abort the whole fallback.
	text := func(n extractor.Node) string {
		txt, _ := n.Text()
		return strings.TrimSpace(txt)
	}

	// collect gathers the non-empty texts of every node matching selector.
	collect := func(selector string) []string {
		var items []string
		// The callback never returns an error; any error raised by ForEach
		// itself is ignored and whatever was gathered so far is kept.
		_ = doc.ForEach(selector, func(n extractor.Node) error {
			if txt := text(n); txt != "" {
				items = append(items, txt)
			}
			return nil
		})
		return items
	}

	// Name: prefer the dedicated title element over a generic h1.
	names := doc.Select("h1.recipe-title")
	if len(names) == 0 {
		names = doc.Select("h1")
	}
	if len(names) > 0 {
		r.Name = text(names[0])
	}

	// Description
	if descs := doc.Select("div.recipe-summary p"); len(descs) > 0 {
		r.Description = text(descs[0])
	}

	r.Ingredients = collect("li.ingredient")
	r.Instructions = collect("li.instruction")

	return &r
}

306
sites/recipe/recipe_test.go Normal file
View File

@@ -0,0 +1,306 @@
package recipe
import (
"context"
"testing"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
)
// sampleJSONLD is a JSON-LD document that is itself a Recipe object. It
// exercises an object-valued author, HowToStep instructions, a string image
// and a string-valued rating.
const sampleJSONLD = `{
"@type": "Recipe",
"name": "Chocolate Chip Cookies",
"description": "The best chocolate chip cookies ever.",
"author": {"@type": "Person", "name": "Jane Smith"},
"prepTime": "PT15M",
"cookTime": "PT10M",
"totalTime": "PT25M",
"recipeYield": "24 cookies",
"recipeIngredient": [
"2 cups flour",
"1 cup sugar",
"1 cup chocolate chips"
],
"recipeInstructions": [
{"@type": "HowToStep", "text": "Preheat oven to 350F."},
{"@type": "HowToStep", "text": "Mix dry ingredients."},
{"@type": "HowToStep", "text": "Bake for 10 minutes."}
],
"image": "https://example.com/cookies.jpg",
"nutrition": {"calories": "250 calories"},
"aggregateRating": {"ratingValue": "4.8"}
}`
// sampleGraphJSONLD is a JSON-LD document whose Recipe is the second node of
// an "@graph" array. It exercises a string author and instructions given as
// a single string.
const sampleGraphJSONLD = `{
"@graph": [
{"@type": "WebPage", "name": "Recipe Page"},
{
"@type": "Recipe",
"name": "Banana Bread",
"author": "Bob Baker",
"recipeIngredient": ["3 bananas", "2 cups flour"],
"recipeInstructions": "Mix and bake at 350F for 60 minutes."
}
]
}`
// sampleArrayJSONLD is a JSON-LD document that is a top-level array with the
// Recipe as its second element. It exercises array-valued yield and image.
const sampleArrayJSONLD = `[
{"@type": "WebSite", "name": "Cooking Blog"},
{
"@type": "Recipe",
"name": "Pancakes",
"recipeYield": ["4 servings"],
"recipeIngredient": ["1 cup flour", "1 egg", "1 cup milk"],
"image": ["https://example.com/pancakes.jpg"]
}
]`
// makeRecipeDoc returns a mock document whose only content is a single
// JSON-LD script node containing jsonLD. The document URL is a fixed
// allrecipes.com recipe address.
func makeRecipeDoc(jsonLD string) *extractortest.MockDocument {
	doc := &extractortest.MockDocument{
		URLValue: "https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/",
	}
	doc.MockNode.Children = map[string]extractor.Nodes{
		"script[type='application/ld+json']": {
			&extractortest.MockNode{TextValue: jsonLD},
		},
	}
	return doc
}
func TestExtractRecipeFromJSONLD(t *testing.T) {
doc := makeRecipeDoc(sampleJSONLD)
r, err := extractRecipeFromJSONLD(doc)
if err != nil {
t.Fatalf("extractRecipeFromJSONLD() error: %v", err)
}
if r.Name != "Chocolate Chip Cookies" {
t.Errorf("Name = %q, want %q", r.Name, "Chocolate Chip Cookies")
}
if r.Author != "Jane Smith" {
t.Errorf("Author = %q, want %q", r.Author, "Jane Smith")
}
if r.PrepTime != "15 min" {
t.Errorf("PrepTime = %q, want %q", r.PrepTime, "15 min")
}
if r.CookTime != "10 min" {
t.Errorf("CookTime = %q, want %q", r.CookTime, "10 min")
}
if r.TotalTime != "25 min" {
t.Errorf("TotalTime = %q, want %q", r.TotalTime, "25 min")
}
if r.Yield != "24 cookies" {
t.Errorf("Yield = %q, want %q", r.Yield, "24 cookies")
}
if len(r.Ingredients) != 3 {
t.Fatalf("len(Ingredients) = %d, want 3", len(r.Ingredients))
}
if r.Ingredients[0] != "2 cups flour" {
t.Errorf("Ingredients[0] = %q, want %q", r.Ingredients[0], "2 cups flour")
}
if len(r.Instructions) != 3 {
t.Fatalf("len(Instructions) = %d, want 3", len(r.Instructions))
}
if r.Instructions[0] != "Preheat oven to 350F." {
t.Errorf("Instructions[0] = %q, want %q", r.Instructions[0], "Preheat oven to 350F.")
}
if r.ImageURL != "https://example.com/cookies.jpg" {
t.Errorf("ImageURL = %q, want %q", r.ImageURL, "https://example.com/cookies.jpg")
}
if r.Calories != "250 calories" {
t.Errorf("Calories = %q, want %q", r.Calories, "250 calories")
}
if r.Rating != 4.8 {
t.Errorf("Rating = %v, want 4.8", r.Rating)
}
}
func TestExtractRecipeFromJSONLD_Graph(t *testing.T) {
doc := makeRecipeDoc(sampleGraphJSONLD)
r, err := extractRecipeFromJSONLD(doc)
if err != nil {
t.Fatalf("extractRecipeFromJSONLD() error: %v", err)
}
if r.Name != "Banana Bread" {
t.Errorf("Name = %q, want %q", r.Name, "Banana Bread")
}
if r.Author != "Bob Baker" {
t.Errorf("Author = %q, want %q", r.Author, "Bob Baker")
}
if len(r.Ingredients) != 2 {
t.Fatalf("len(Ingredients) = %d, want 2", len(r.Ingredients))
}
if len(r.Instructions) != 1 {
t.Fatalf("len(Instructions) = %d, want 1", len(r.Instructions))
}
if r.Instructions[0] != "Mix and bake at 350F for 60 minutes." {
t.Errorf("Instructions[0] = %q, want %q", r.Instructions[0], "Mix and bake at 350F for 60 minutes.")
}
}
func TestExtractRecipeFromJSONLD_Array(t *testing.T) {
doc := makeRecipeDoc(sampleArrayJSONLD)
r, err := extractRecipeFromJSONLD(doc)
if err != nil {
t.Fatalf("extractRecipeFromJSONLD() error: %v", err)
}
if r.Name != "Pancakes" {
t.Errorf("Name = %q, want %q", r.Name, "Pancakes")
}
if r.Yield != "4 servings" {
t.Errorf("Yield = %q, want %q", r.Yield, "4 servings")
}
if r.ImageURL != "https://example.com/pancakes.jpg" {
t.Errorf("ImageURL = %q, want %q", r.ImageURL, "https://example.com/pancakes.jpg")
}
}
// TestExtractRecipeFromJSONLD_NoRecipe checks that JSON-LD without a Recipe
// node is reported as an error.
func TestExtractRecipeFromJSONLD_NoRecipe(t *testing.T) {
	notRecipe := makeRecipeDoc(`{"@type": "WebPage", "name": "Not a recipe"}`)

	if _, err := extractRecipeFromJSONLD(notRecipe); err == nil {
		t.Error("expected error for non-Recipe JSON-LD")
	}
}
func TestExtractRecipeFromDOM(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{
"h1": {
&extractortest.MockNode{TextValue: "Grandma's Apple Pie"},
},
"div.recipe-summary p": {
&extractortest.MockNode{TextValue: "A classic apple pie recipe."},
},
"li.ingredient": {
&extractortest.MockNode{TextValue: "6 apples"},
&extractortest.MockNode{TextValue: "1 cup sugar"},
&extractortest.MockNode{TextValue: "2 pie crusts"},
},
"li.instruction": {
&extractortest.MockNode{TextValue: "Peel and slice apples."},
&extractortest.MockNode{TextValue: "Fill pie crust and bake."},
},
},
},
}
r := extractRecipeFromDOM(doc)
if r.Name != "Grandma's Apple Pie" {
t.Errorf("Name = %q, want %q", r.Name, "Grandma's Apple Pie")
}
if r.Description != "A classic apple pie recipe." {
t.Errorf("Description = %q, want %q", r.Description, "A classic apple pie recipe.")
}
if len(r.Ingredients) != 3 {
t.Fatalf("len(Ingredients) = %d, want 3", len(r.Ingredients))
}
if len(r.Instructions) != 2 {
t.Fatalf("len(Instructions) = %d, want 2", len(r.Instructions))
}
}
func TestExtractRecipe_MockBrowser(t *testing.T) {
doc := makeRecipeDoc(sampleJSONLD)
browser := &extractortest.MockBrowser{
Documents: map[string]*extractortest.MockDocument{
"https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/": doc,
},
}
r, err := DefaultConfig.ExtractRecipe(
context.Background(),
browser,
"https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/",
)
if err != nil {
t.Fatalf("ExtractRecipe() error: %v", err)
}
if r.Name != "Chocolate Chip Cookies" {
t.Errorf("Name = %q, want %q", r.Name, "Chocolate Chip Cookies")
}
if r.SourceURL != "https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/" {
t.Errorf("SourceURL = %q, want recipe URL", r.SourceURL)
}
}
// TestExtractRecipe_FallbackToDOM checks that ExtractRecipe falls back to
// DOM selectors when the page carries no JSON-LD script.
func TestExtractRecipe_FallbackToDOM(t *testing.T) {
	const pageURL = "https://example.com/recipe"

	page := &extractortest.MockDocument{
		URLValue: pageURL,
		MockNode: extractortest.MockNode{
			Children: map[string]extractor.Nodes{
				"h1":            {&extractortest.MockNode{TextValue: "Simple Recipe"}},
				"li.ingredient": {&extractortest.MockNode{TextValue: "1 cup flour"}},
			},
		},
	}
	browser := &extractortest.MockBrowser{
		Documents: map[string]*extractortest.MockDocument{pageURL: page},
	}

	r, err := DefaultConfig.ExtractRecipe(context.Background(), browser, pageURL)
	if err != nil {
		t.Fatalf("ExtractRecipe() error: %v", err)
	}

	if got, want := r.Name, "Simple Recipe"; got != want {
		t.Errorf("Name = %q, want %q", got, want)
	}
	if n := len(r.Ingredients); n != 1 {
		t.Fatalf("len(Ingredients) = %d, want 1", n)
	}
}
// TestFormatDuration checks the conversion of ISO 8601 time durations into
// their human-readable form, including the empty input.
func TestFormatDuration(t *testing.T) {
	type durationCase struct {
		input string
		want  string
	}

	cases := []durationCase{
		{input: "PT15M", want: "15 min"},
		{input: "PT1H30M", want: "1 hr 30 min"},
		{input: "PT10M", want: "10 min"},
		{input: "PT2H", want: "2 hr"},
		{input: "PT45S", want: "45 sec"},
		{input: "PT1H15M30S", want: "1 hr 15 min 30 sec"},
		{input: "", want: ""},
	}

	for _, tc := range cases {
		if got := formatDuration(tc.input); got != tc.want {
			t.Errorf("formatDuration(%q) = %q, want %q", tc.input, got, tc.want)
		}
	}
}
func TestExtractRecipeFromDOM_Empty(t *testing.T) {
doc := &extractortest.MockDocument{
MockNode: extractortest.MockNode{
Children: map[string]extractor.Nodes{},
},
}
r := extractRecipeFromDOM(doc)
if r.Name != "" {
t.Errorf("expected empty name, got %q", r.Name)
}
}