fix: surface parsing errors instead of silently discarding them
Return errors for required fields (ID, price) and log warnings for optional fields (title, description, unit price) across all site extractors, instead of silently discarding them with `_ =`.

Closes #24

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -48,7 +49,11 @@ func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.
|
|||||||
return res, ErrInvalidURL
|
return res, ErrInvalidURL
|
||||||
}
|
}
|
||||||
|
|
||||||
res.ID, _ = strconv.Atoi(a[3])
|
var err error
|
||||||
|
res.ID, err = strconv.Atoi(a[3])
|
||||||
|
if err != nil {
|
||||||
|
return res, fmt.Errorf("failed to parse product ID %q: %w", a[3], err)
|
||||||
|
}
|
||||||
|
|
||||||
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
doc, err := b.Open(ctx, u.String(), extractor.OpenPageOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -59,16 +64,25 @@ func (c Config) GetItemFromURL(ctx context.Context, b extractor.Browser, u *url.
|
|||||||
names := doc.Select(".h4")
|
names := doc.Select(".h4")
|
||||||
|
|
||||||
if len(names) > 0 {
|
if len(names) > 0 {
|
||||||
res.Name, _ = names[0].Text()
|
res.Name, err = names[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get product name", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
prices := doc.Select(".h2")
|
prices := doc.Select(".h2")
|
||||||
|
|
||||||
if len(prices) > 0 {
|
if len(prices) > 0 {
|
||||||
priceStr, _ := prices[0].Text()
|
priceStr, err := prices[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
return res, fmt.Errorf("failed to get price text: %w", err)
|
||||||
|
}
|
||||||
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
||||||
priceStr = strings.TrimSpace(priceStr)
|
priceStr = strings.TrimSpace(priceStr)
|
||||||
res.Price, _ = strconv.ParseFloat(priceStr, 64)
|
res.Price, err = strconv.ParseFloat(priceStr, 64)
|
||||||
|
if err != nil {
|
||||||
|
return res, fmt.Errorf("failed to parse price %q: %w", priceStr, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return res, nil
|
return res, nil
|
||||||
|
|||||||
@@ -2,9 +2,10 @@ package duckduckgo
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
|
||||||
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SearchPage interface {
|
type SearchPage interface {
|
||||||
@@ -44,13 +45,19 @@ func extractResults(doc extractor.Node) ([]Result, error) {
|
|||||||
titles := n.Select("h2")
|
titles := n.Select("h2")
|
||||||
|
|
||||||
if len(titles) != 0 {
|
if len(titles) != 0 {
|
||||||
r.Title, _ = titles[0].Text()
|
r.Title, err = titles[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get result title", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
descriptions := n.Select("span > span")
|
descriptions := n.Select("span > span")
|
||||||
|
|
||||||
if len(descriptions) != 0 {
|
if len(descriptions) != 0 {
|
||||||
r.Description, _ = descriptions[0].Text()
|
r.Description, err = descriptions[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get result description", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
res = append(res, r)
|
res = append(res, r)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package google
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
|
||||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||||
@@ -117,13 +118,19 @@ func (c Config) Search(ctx context.Context, b extractor.Browser, query string) (
|
|||||||
titles := s.Select("div > div > div a > h3")
|
titles := s.Select("div > div > div a > h3")
|
||||||
|
|
||||||
if len(titles) != 0 {
|
if len(titles) != 0 {
|
||||||
title, _ = titles[0].Text()
|
title, err = titles[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get result title", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
descs := s.Select("div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:not([class])")
|
descs := s.Select("div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:not([class])")
|
||||||
|
|
||||||
if len(descs) != 0 {
|
if len(descs) != 0 {
|
||||||
desc, _ = descs[0].Text()
|
desc, err = descs[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get result description", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
res = append(res, Result{
|
res = append(res, Result{
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package wegmans
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -54,7 +55,10 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
return Item{}, ErrInvalidURL
|
return Item{}, ErrInvalidURL
|
||||||
}
|
}
|
||||||
|
|
||||||
id, _ := strconv.Atoi(a[2])
|
id, err := strconv.Atoi(a[2])
|
||||||
|
if err != nil {
|
||||||
|
return Item{}, fmt.Errorf("failed to parse product ID %q: %w", a[2], err)
|
||||||
|
}
|
||||||
|
|
||||||
if id == 0 {
|
if id == 0 {
|
||||||
return Item{}, ErrInvalidURL
|
return Item{}, ErrInvalidURL
|
||||||
@@ -67,7 +71,9 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
defer extractor.DeferClose(doc)
|
defer extractor.DeferClose(doc)
|
||||||
|
|
||||||
timeout := 15 * time.Second
|
timeout := 15 * time.Second
|
||||||
_ = doc.WaitForNetworkIdle(&timeout)
|
if err := doc.WaitForNetworkIdle(&timeout); err != nil {
|
||||||
|
slog.Warn("WaitForNetworkIdle failed", "err", err)
|
||||||
|
}
|
||||||
|
|
||||||
res := Item{
|
res := Item{
|
||||||
ID: id,
|
ID: id,
|
||||||
@@ -76,20 +82,29 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
titles := doc.Select("h1[data-testid]")
|
titles := doc.Select("h1[data-testid]")
|
||||||
|
|
||||||
if len(titles) != 0 {
|
if len(titles) != 0 {
|
||||||
res.Name, _ = titles[0].Text()
|
res.Name, err = titles[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get product name", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
prices := doc.Select("div.component--product-price:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(2)")
|
prices := doc.Select("div.component--product-price:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(2)")
|
||||||
|
|
||||||
slog.Info("prices", "len", len(prices))
|
slog.Info("prices", "len", len(prices))
|
||||||
if len(prices) != 0 {
|
if len(prices) != 0 {
|
||||||
priceStr, _ := prices[0].Text()
|
priceStr, err := prices[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
return res, fmt.Errorf("failed to get price text: %w", err)
|
||||||
|
}
|
||||||
slog.Info("price", "0", prices[0], "text", priceStr)
|
slog.Info("price", "0", prices[0], "text", priceStr)
|
||||||
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
priceStr = strings.ReplaceAll(priceStr, "$", "")
|
||||||
priceStr = strings.ReplaceAll(priceStr, ",", "")
|
priceStr = strings.ReplaceAll(priceStr, ",", "")
|
||||||
// if there's a "/" in the price, then it's in the format of like "1.99/ea", so split it off
|
// if there's a "/" in the price, then it's in the format of like "1.99/ea", so split it off
|
||||||
priceStr = strings.Split(priceStr, "/")[0]
|
priceStr = strings.Split(priceStr, "/")[0]
|
||||||
price, _ := strconv.ParseFloat(priceStr, 64)
|
price, err := strconv.ParseFloat(priceStr, 64)
|
||||||
|
if err != nil {
|
||||||
|
return res, fmt.Errorf("failed to parse price %q: %w", priceStr, err)
|
||||||
|
}
|
||||||
slog.Info("price", "0", prices[0], "text", priceStr, "price", price)
|
slog.Info("price", "0", prices[0], "text", priceStr, "price", price)
|
||||||
res.Price = price
|
res.Price = price
|
||||||
}
|
}
|
||||||
@@ -97,7 +112,10 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
unitPrices := doc.Select(`div.component--product-price:nth-child(1) span.price-per-unit`)
|
unitPrices := doc.Select(`div.component--product-price:nth-child(1) span.price-per-unit`)
|
||||||
|
|
||||||
if len(unitPrices) != 0 {
|
if len(unitPrices) != 0 {
|
||||||
unitPriceStr, _ := unitPrices[0].Text()
|
unitPriceStr, err := unitPrices[0].Text()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to get unit price text", "err", err)
|
||||||
|
} else {
|
||||||
unitPriceStr = strings.TrimSpace(unitPriceStr)
|
unitPriceStr = strings.TrimSpace(unitPriceStr)
|
||||||
unitPriceStr = strings.ReplaceAll(unitPriceStr, "(", "")
|
unitPriceStr = strings.ReplaceAll(unitPriceStr, "(", "")
|
||||||
unitPriceStr = strings.ReplaceAll(unitPriceStr, ")", "")
|
unitPriceStr = strings.ReplaceAll(unitPriceStr, ")", "")
|
||||||
@@ -108,7 +126,10 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
|
|
||||||
if len(units) > 1 {
|
if len(units) > 1 {
|
||||||
res.Unit = strings.TrimSpace(units[1])
|
res.Unit = strings.TrimSpace(units[1])
|
||||||
res.UnitPrice, _ = strconv.ParseFloat(units[0], 64)
|
res.UnitPrice, err = strconv.ParseFloat(units[0], 64)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("failed to parse unit price", "text", units[0], "err", err)
|
||||||
|
}
|
||||||
|
|
||||||
// the unit might be like "lb.", so if it ends in a period, then just strip it off
|
// the unit might be like "lb.", so if it ends in a period, then just strip it off
|
||||||
if strings.HasSuffix(res.Unit, ".") {
|
if strings.HasSuffix(res.Unit, ".") {
|
||||||
@@ -116,6 +137,7 @@ func (c Config) GetItemPrice(ctx context.Context, b extractor.Browser, u *url.UR
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
slog.Info("res", "res", res)
|
slog.Info("res", "res", res)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user