Files
go-extractor/sites/duckduckgo/weather.go
Steve Dudenhoeffer 8c2848246b
All checks were successful
CI / build (pull_request) Successful in 1m11s
CI / vet (pull_request) Successful in 1m12s
CI / test (pull_request) Successful in 1m17s
fix: use structural selectors for DDG weather to handle advisory banners
The weather extractor used positional CSS selectors (div:first-child,
div:nth-child(2)) to locate the header and hourly container within the
widget section. When DuckDuckGo inserts advisory banners (e.g. wind
advisory), the extra div shifts positions and breaks extraction of
current temp, hourly data, humidity, and wind.

Replace with structural selectors:
- div:not(:has(ul)) for the header (first div without a list)
- div:has(> ul) for the hourly container (div with direct ul child)

These match elements by their content structure rather than position,
so advisory banners no longer break extraction.

Fixes #64

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 18:22:53 +00:00

218 lines
6.4 KiB
Go

package duckduckgo
import (
"context"
"fmt"
"log/slog"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/internal/parse"
)
// WeatherData is the structured result of reading DuckDuckGo's weather
// widget. Fields the extractor could not find are left at their zero value.
type WeatherData struct {
	// Location is the place name shown in the widget header.
	Location string
	// CurrentTemp is copied from the first hourly forecast entry.
	CurrentTemp float64
	// Condition is the condition text shown in the widget header.
	Condition string
	// HighTemp and LowTemp are copied from the first daily forecast entry.
	HighTemp, LowTemp float64
	// Humidity and Wind hold the emphasized text of the matching detail lines.
	Humidity, Wind string
	// Forecast lists the daily forecast entries in page order.
	Forecast []DayForecast
	// Hourly lists the hourly forecast entries in page order.
	Hourly []HourlyForecast
}
// DayForecast describes one entry of the multi-day forecast.
type DayForecast struct {
	// Day is the label of the forecast day.
	Day string
	// HighTemp and LowTemp are the day's forecast temperature bounds.
	HighTemp, LowTemp float64
	// Condition is the weather condition for the day.
	Condition string
	// Precipitation is a percentage in 0-100, or -1 if unavailable.
	Precipitation int
	// IconHint is the icon type from element attributes
	// (e.g. "PartlyCloudy", "Snow").
	IconHint string
}
// HourlyForecast describes one entry of the hour-by-hour forecast.
type HourlyForecast struct {
	// Time is the hour label of the entry.
	Time string
	// Temp is the forecast temperature for that hour.
	Temp float64
	// Condition is the weather condition for that hour.
	Condition string
	// Precipitation is a percentage in 0-100, or -1 if unavailable.
	Precipitation int
	// IconHint is the icon type from element attributes
	// (e.g. "MostlyCloudy", "Rain").
	IconHint string
}
// GetWeather extracts weather data from DuckDuckGo's weather widget.
//
// It opens the search URL for the query "weather <city>" in b, waits for
// the page's network activity to settle, and hands the page to
// extractWeather. An error is returned only when the page cannot be opened;
// a failed network-idle wait is logged and extraction proceeds anyway.
func (c Config) GetWeather(ctx context.Context, b extractor.Browser, city string) (*WeatherData, error) {
	c = c.validate()

	searchURL := c.ToSearchURL("weather " + city)
	slog.Info("fetching weather", "url", searchURL, "city", city)

	page, openErr := b.Open(ctx, searchURL.String(), extractor.OpenPageOptions{})
	if openErr != nil {
		return nil, fmt.Errorf("failed to open weather page: %w", openErr)
	}
	defer extractor.DeferClose(page)

	// The wait is best-effort: whatever has rendered so far is still parsed.
	idleTimeout := 10 * time.Second
	idleErr := page.WaitForNetworkIdle(&idleTimeout)
	if idleErr != nil {
		slog.Warn("WaitForNetworkIdle failed", "err", idleErr)
	}

	return extractWeather(page)
}
// GetWeather is a convenience function using DefaultConfig.
// It forwards ctx, b, and city unchanged to DefaultConfig.GetWeather and
// returns that call's results as-is.
func GetWeather(ctx context.Context, b extractor.Browser, city string) (*WeatherData, error) {
return DefaultConfig.GetWeather(ctx, b, city)
}
// extractWeather reads DuckDuckGo's weather widget out of doc and returns
// whatever fields it could find.
//
// DuckDuckGo's weather widget uses randomized CSS class names (CSS modules),
// so elements are identified by structural selectors and image src
// attributes rather than by class. Extraction is best-effort: errors from
// individual node reads are ignored, a missing widget or section yields an
// empty WeatherData, and the returned error is always nil.
func extractWeather(doc extractor.Node) (*WeatherData, error) {
	var data WeatherData

	// The widget is an article element containing weatherkit icon images.
	widget := doc.SelectFirst("article:has(img[src*='weatherkit'])")
	if widget == nil {
		return &data, nil
	}

	section := widget.SelectFirst("section")
	if section == nil {
		return &data, nil
	}

	fillWeatherHeader(section, &data)
	fillWeatherHourly(section, &data)
	fillWeatherDaily(section, &data)

	return &data, nil
}

// fillWeatherHeader copies the condition and location text from the widget
// header into data.
func fillWeatherHeader(section extractor.Node, data *WeatherData) {
	// Structure: section > div > [div(toggle), p(condition), p(location)]
	// :not(:has(ul)) skips the hourly container div and avoids breaking
	// when advisory banners (e.g. wind advisory) insert extra divs.
	header := section.SelectFirst("div:not(:has(ul))")
	if header == nil {
		return
	}

	ps := header.Select("p")
	if len(ps) >= 1 {
		data.Condition, _ = ps[0].Text()
	}
	if len(ps) >= 2 {
		data.Location, _ = ps[1].Text()
	}
}

// fillWeatherHourly appends the hourly forecast entries to data and then
// derives the current temperature, humidity, and wind from the same
// container.
func fillWeatherHourly(section extractor.Node, data *WeatherData) {
	// Structure: section > div > [ul(hourly items), div(humidity/wind)]
	// :has(> ul) finds the div containing the hourly list regardless of
	// position, so advisory banners that insert extra divs do not matter.
	container := section.SelectFirst("div:has(> ul)")
	if container == nil {
		return
	}

	_ = container.ForEach("ul > li", func(n extractor.Node) error {
		var hour HourlyForecast

		// Each li contains: p(time), img(icon), [span(precip)], p(temp)
		ps := n.Select("p")
		if len(ps) >= 2 {
			hour.Time, _ = ps[0].Text()
			txt, _ := ps[len(ps)-1].Text()
			hour.Temp = parse.NumericOnly(txt)
		}

		// The condition is not shown as text; it mirrors the icon hint.
		hour.IconHint = weatherIconHint(n)
		hour.Condition = hour.IconHint
		hour.Precipitation = weatherPrecipitation(n)

		data.Hourly = append(data.Hourly, hour)
		return nil
	})

	// Use first hourly temperature as current temp (no standalone current
	// temp element).
	if len(data.Hourly) > 0 {
		data.CurrentTemp = data.Hourly[0].Temp
	}

	// Humidity and wind from the details div (after the hourly ul). The
	// label text picks the field; the value is the <strong> child's text.
	for _, p := range container.Select("div > p") {
		txt, _ := p.Text()
		switch {
		case strings.Contains(txt, "Humidity"):
			if strong := p.SelectFirst("strong"); strong != nil {
				data.Humidity, _ = strong.Text()
			}
		case strings.Contains(txt, "Wind"):
			if strong := p.SelectFirst("strong"); strong != nil {
				data.Wind, _ = strong.Text()
			}
		}
	}
}

// fillWeatherDaily appends the daily forecast entries to data and copies
// the first entry's high/low into data.HighTemp and data.LowTemp.
func fillWeatherDaily(section extractor.Node, data *WeatherData) {
	// Structure: section > ul > div (each day)
	// The daily ul has div children; the hourly ul has li children, so
	// ul > div is unambiguous.
	_ = section.ForEach("ul > div", func(n extractor.Node) error {
		var day DayForecast

		// Day name from first p
		if d := n.SelectFirst("p:first-child"); d != nil {
			day.Day, _ = d.Text()
		}

		// The condition is not shown as text; it mirrors the icon hint.
		day.IconHint = weatherIconHint(n)
		day.Condition = day.IconHint

		// High/low temps from last p's spans
		if temps := n.SelectFirst("p:last-of-type"); temps != nil {
			spans := temps.Select("span")
			if len(spans) >= 2 {
				highTxt, _ := spans[0].Text()
				day.HighTemp = parse.NumericOnly(highTxt)
				lowTxt, _ := spans[1].Text()
				day.LowTemp = parse.NumericOnly(lowTxt)
			}
		}

		day.Precipitation = weatherPrecipitation(n)

		data.Forecast = append(data.Forecast, day)
		return nil
	})

	// Today's high/low from first daily forecast entry
	if len(data.Forecast) > 0 {
		data.HighTemp = data.Forecast[0].HighTemp
		data.LowTemp = data.Forecast[0].LowTemp
	}
}

// weatherIconHint returns the icon hint of the first weatherkit image under
// n that is not a precipitation icon. extractIconHint reads it from the
// image's aria-label, title, or alt attribute.
func weatherIconHint(n extractor.Node) string {
	return extractIconHint(n.Select("img[src*='weatherkit']:not([src*='Precipitation'])"))
}

// weatherPrecipitation returns the precipitation percentage found in the
// first span > span element under n, or -1 when n has no such element.
func weatherPrecipitation(n extractor.Node) int {
	precip := n.SelectFirst("span > span")
	if precip == nil {
		return -1
	}
	txt, _ := precip.Text()
	return int(parse.NumericOnly(txt))
}
// extractIconHint returns the icon type advertised by the first node in
// nodes. It tries the aria-label, title, and alt attributes in that order
// and returns the first non-empty value; it returns "" when nodes is empty
// or none of the attributes has a value.
func extractIconHint(nodes extractor.Nodes) string {
	if len(nodes) < 1 {
		return ""
	}

	first := nodes[0]
	candidates := [...]string{"aria-label", "title", "alt"}
	for i := range candidates {
		// Lookup errors are ignored; an unreadable attribute counts as empty.
		if value, _ := first.Attr(candidates[i]); value != "" {
			return value
		}
	}
	return ""
}