package recipe

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)

// Recipe is the structured result of a recipe extraction. Every field is
// optional; a field the page did not provide is left at its zero value.
type Recipe struct {
	Name         string
	Description  string
	Author       string
	PrepTime     string
	CookTime     string
	TotalTime    string
	Yield        string // servings
	Ingredients  []string
	Instructions []string
	ImageURL     string
	Rating       float64
	Calories     string
	SourceURL    string
}

// Config carries the settings for recipe extraction. It currently has no
// fields.
type Config struct{}

// DefaultConfig is the Config used by the package-level ExtractRecipe.
var DefaultConfig = Config{}

// validate returns the Config unchanged; there is nothing to normalise yet.
func (c Config) validate() Config {
	return c
}

// ExtractRecipe opens url in the given browser and returns the recipe found
// on the page.
//
// JSON-LD structured data is preferred. When it is missing, unreadable, or
// yields a recipe without a name, the page's DOM is parsed instead. The
// returned Recipe always has SourceURL set to url. An error is returned only
// when the page cannot be opened.
func (c Config) ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
	c = c.validate()

	slog.Info("fetching recipe", "url", url)

	doc, openErr := b.Open(ctx, url, extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, fmt.Errorf("failed to open recipe page: %w", openErr)
	}
	defer extractor.DeferClose(doc)

	// A page that never goes idle is only logged; extraction is still
	// attempted on whatever has loaded.
	idleTimeout := 10 * time.Second
	if idleErr := doc.WaitForNetworkIdle(&idleTimeout); idleErr != nil {
		slog.Warn("WaitForNetworkIdle failed", "err", idleErr)
	}

	found, jsonErr := extractRecipeFromJSONLD(doc)
	if jsonErr != nil || found.Name == "" {
		// No usable structured data: fall back to DOM parsing.
		found = extractRecipeFromDOM(doc)
	}

	found.SourceURL = url
	return found, nil
}

// ExtractRecipe extracts a recipe from url using DefaultConfig.
func ExtractRecipe(ctx context.Context, b extractor.Browser, url string) (*Recipe, error) {
	return DefaultConfig.ExtractRecipe(ctx, b, url)
}

// jsonLDGraph is a JSON-LD document that keeps its nodes under "@graph".
type jsonLDGraph struct {
	Graph []json.RawMessage `json:"@graph"`
}

// jsonLDRecipe represents the JSON-LD Recipe schema fields we extract.
type jsonLDRecipe struct { Type interface{} `json:"@type"` Name string `json:"name"` Description string `json:"description"` Author interface{} `json:"author"` PrepTime string `json:"prepTime"` CookTime string `json:"cookTime"` TotalTime string `json:"totalTime"` Yield interface{} `json:"recipeYield"` Ingredients []string `json:"recipeIngredient"` Instructions interface{} `json:"recipeInstructions"` Image interface{} `json:"image"` Nutrition *struct { Calories string `json:"calories"` } `json:"nutrition"` Rating *struct { RatingValue interface{} `json:"ratingValue"` } `json:"aggregateRating"` } func extractRecipeFromJSONLD(doc extractor.Node) (*Recipe, error) { var recipe Recipe scripts := doc.Select("script[type='application/ld+json']") if len(scripts) == 0 { return nil, fmt.Errorf("no JSON-LD scripts found") } for _, script := range scripts { txt, err := script.Text() if err != nil { continue } txt = strings.TrimSpace(txt) r, err := parseJSONLDRecipe(txt) if err == nil && r.Name != "" { return r, nil } } return &recipe, fmt.Errorf("no Recipe JSON-LD found") } func parseJSONLDRecipe(raw string) (*Recipe, error) { // Try direct Recipe object var jr jsonLDRecipe if err := json.Unmarshal([]byte(raw), &jr); err == nil { if isRecipeType(jr.Type) { return convertJSONLDRecipe(&jr), nil } } // Try @graph array var graph jsonLDGraph if err := json.Unmarshal([]byte(raw), &graph); err == nil && len(graph.Graph) > 0 { for _, item := range graph.Graph { var jr jsonLDRecipe if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) { return convertJSONLDRecipe(&jr), nil } } } // Try array of objects var arr []json.RawMessage if err := json.Unmarshal([]byte(raw), &arr); err == nil { for _, item := range arr { var jr jsonLDRecipe if err := json.Unmarshal(item, &jr); err == nil && isRecipeType(jr.Type) { return convertJSONLDRecipe(&jr), nil } } } return nil, fmt.Errorf("no Recipe type in JSON-LD") } func isRecipeType(t interface{}) bool { switch v := t.(type) { case 
string: return v == "Recipe" case []interface{}: for _, item := range v { if s, ok := item.(string); ok && s == "Recipe" { return true } } } return false } func convertJSONLDRecipe(jr *jsonLDRecipe) *Recipe { r := &Recipe{ Name: jr.Name, Description: jr.Description, PrepTime: formatDuration(jr.PrepTime), CookTime: formatDuration(jr.CookTime), TotalTime: formatDuration(jr.TotalTime), Ingredients: jr.Ingredients, } // Author can be string or object r.Author = extractAuthor(jr.Author) // Yield can be string or array r.Yield = extractYield(jr.Yield) // Instructions can be string, array of strings, or array of HowToStep objects r.Instructions = extractInstructions(jr.Instructions) // Image can be string or object or array r.ImageURL = extractImage(jr.Image) // Nutrition if jr.Nutrition != nil { r.Calories = jr.Nutrition.Calories } // Rating if jr.Rating != nil { r.Rating = extractFloat(jr.Rating.RatingValue) } return r } func extractAuthor(v interface{}) string { switch a := v.(type) { case string: return a case map[string]interface{}: if name, ok := a["name"].(string); ok { return name } case []interface{}: if len(a) > 0 { return extractAuthor(a[0]) } } return "" } func extractYield(v interface{}) string { switch y := v.(type) { case string: return y case []interface{}: if len(y) > 0 { if s, ok := y[0].(string); ok { return s } } case float64: return fmt.Sprintf("%.0f", y) } return "" } func extractInstructions(v interface{}) []string { switch inst := v.(type) { case string: return []string{inst} case []interface{}: var steps []string for _, item := range inst { switch step := item.(type) { case string: steps = append(steps, step) case map[string]interface{}: if text, ok := step["text"].(string); ok { steps = append(steps, text) } } } return steps } return nil } func extractImage(v interface{}) string { switch img := v.(type) { case string: return img case map[string]interface{}: if url, ok := img["url"].(string); ok { return url } case []interface{}: if len(img) > 0 { 
return extractImage(img[0]) } } return "" } func extractFloat(v interface{}) float64 { switch f := v.(type) { case float64: return f case string: var val float64 fmt.Sscanf(f, "%f", &val) return val } return 0 } // formatDuration converts ISO 8601 duration (PT1H30M) to human-readable form. func formatDuration(iso string) string { if iso == "" { return "" } iso = strings.TrimPrefix(iso, "PT") iso = strings.TrimPrefix(iso, "pt") if iso == "" { return "" } var parts []string var num string for _, c := range iso { switch { case c >= '0' && c <= '9': num += string(c) case c == 'H' || c == 'h': if num != "" { parts = append(parts, num+" hr") num = "" } case c == 'M' || c == 'm': if num != "" { parts = append(parts, num+" min") num = "" } case c == 'S' || c == 's': if num != "" { parts = append(parts, num+" sec") num = "" } } } if len(parts) == 0 { return iso } return strings.Join(parts, " ") } func extractRecipeFromDOM(doc extractor.Node) *Recipe { var r Recipe // Name — typically in h1 or h2 names := doc.Select("h1.recipe-title") if len(names) == 0 { names = doc.Select("h1") } if len(names) > 0 { r.Name, _ = names[0].Text() r.Name = strings.TrimSpace(r.Name) } // Description descs := doc.Select("div.recipe-summary p") if len(descs) > 0 { r.Description, _ = descs[0].Text() } // Ingredients _ = doc.ForEach("li.ingredient", func(n extractor.Node) error { txt, _ := n.Text() txt = strings.TrimSpace(txt) if txt != "" { r.Ingredients = append(r.Ingredients, txt) } return nil }) // Instructions _ = doc.ForEach("li.instruction", func(n extractor.Node) error { txt, _ := n.Text() txt = strings.TrimSpace(txt) if txt != "" { r.Instructions = append(r.Instructions, txt) } return nil }) return &r }