package imdb import ( "context" "encoding/json" "fmt" "log/slog" "strconv" "strings" "time" "gitea.stevedudenhoeffer.com/steve/go-extractor" ) // Movie holds structured movie/TV information from IMDB. type Movie struct { ID string // e.g. "tt1234567" Title string Year int Rating float64 // IMDB score (0-10) Votes int Runtime string Genres []string Director string Cast []string Plot string PosterURL string BoxOffice string // e.g. "$200,000,000" } // SearchResult holds a search result entry from IMDB. type SearchResult struct { ID string Title string Year int } // Config holds configuration for the IMDB extractor. type Config struct{} // DefaultConfig is the default IMDB configuration. var DefaultConfig = Config{} func (c Config) validate() Config { return c } // titleURL returns the IMDB URL for a given title ID. func titleURL(id string) string { return fmt.Sprintf("https://www.imdb.com/title/%s/", id) } // findURL returns the IMDB search URL for a given query. func findURL(query string) string { return fmt.Sprintf("https://www.imdb.com/find/?q=%s&s=tt&ttype=ft", strings.ReplaceAll(query, " ", "+")) } // GetMovie extracts structured movie data from an IMDB title page. func (c Config) GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) { c = c.validate() u := titleURL(id) slog.Info("fetching imdb title", "url", u, "id", id) doc, err := b.Open(ctx, u, extractor.OpenPageOptions{}) if err != nil { return nil, fmt.Errorf("failed to open imdb page: %w", err) } defer extractor.DeferClose(doc) timeout := 10 * time.Second if err := doc.WaitForNetworkIdle(&timeout); err != nil { slog.Warn("WaitForNetworkIdle failed", "err", err) } return extractMovie(doc, id) } // GetMovie is a convenience function using DefaultConfig. func GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) { return DefaultConfig.GetMovie(ctx, b, id) } // Search searches IMDB for titles matching the query. 
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) { c = c.validate() u := findURL(query) slog.Info("searching imdb", "url", u, "query", query) doc, err := b.Open(ctx, u, extractor.OpenPageOptions{}) if err != nil { return nil, fmt.Errorf("failed to open imdb search: %w", err) } defer extractor.DeferClose(doc) timeout := 10 * time.Second if err := doc.WaitForNetworkIdle(&timeout); err != nil { slog.Warn("WaitForNetworkIdle failed", "err", err) } return extractSearchResults(doc) } // Search is a convenience function using DefaultConfig. func Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) { return DefaultConfig.Search(ctx, b, query) } // jsonLDMovie represents the JSON-LD Movie schema from IMDB. type jsonLDMovie struct { Type string `json:"@type"` Name string `json:"name"` Description string `json:"description"` DatePublished string `json:"datePublished"` Image interface{} `json:"image"` Duration string `json:"duration"` Genre interface{} `json:"genre"` Director interface{} `json:"director"` Actor interface{} `json:"actor"` Rating *struct { RatingValue interface{} `json:"ratingValue"` RatingCount interface{} `json:"ratingCount"` } `json:"aggregateRating"` } func extractMovie(doc extractor.Node, id string) (*Movie, error) { m := &Movie{ID: id} // Try JSON-LD first (most reliable on IMDB) scripts := doc.Select("script[type='application/ld+json']") for _, script := range scripts { txt, err := script.Text() if err != nil { continue } var jm jsonLDMovie if err := json.Unmarshal([]byte(txt), &jm); err != nil { continue } if jm.Type == "Movie" || jm.Type == "TVSeries" || jm.Type == "TVMiniSeries" { m.Title = jm.Name m.Plot = jm.Description m.Runtime = formatDuration(jm.Duration) m.PosterURL = extractImage(jm.Image) m.Genres = extractGenres(jm.Genre) m.Director = extractPerson(jm.Director) m.Cast = extractPersonList(jm.Actor) if jm.DatePublished != "" { m.Year = 
extractYear(jm.DatePublished) } if jm.Rating != nil { m.Rating = toFloat(jm.Rating.RatingValue) m.Votes = toInt(jm.Rating.RatingCount) } break } } // Box office from DOM (not in JSON-LD) _ = doc.ForEach("li[data-testid='title-boxoffice-section'] span", func(n extractor.Node) error { txt, _ := n.Text() if strings.HasPrefix(txt, "$") { m.BoxOffice = txt } return nil }) // Fallback: title from DOM if JSON-LD didn't work if m.Title == "" { titles := doc.Select("h1[data-testid='hero__pageTitle'] span") if len(titles) == 0 { titles = doc.Select("h1") } if len(titles) > 0 { m.Title, _ = titles[0].Text() m.Title = strings.TrimSpace(m.Title) } } return m, nil } func extractSearchResults(doc extractor.Node) ([]SearchResult, error) { var results []SearchResult _ = doc.ForEach("li.find-result-item", func(n extractor.Node) error { var sr SearchResult // Title link contains the ID in href links := n.Select("a.ipc-metadata-list-summary-item__t") if len(links) > 0 { sr.Title, _ = links[0].Text() sr.Title = strings.TrimSpace(sr.Title) href, _ := links[0].Attr("href") sr.ID = extractTitleID(href) } // Year years := n.Select("span.ipc-metadata-list-summary-item__li") if len(years) > 0 { txt, _ := years[0].Text() sr.Year = extractYear(txt) } if sr.Title != "" { results = append(results, sr) } return nil }) return results, nil } // extractTitleID extracts "tt1234567" from an IMDB URL path like "/title/tt1234567/". 
//
// Any query string or fragment is ignored, and only a path segment made of
// "tt" followed by one or more digits is accepted. It returns "" when href
// contains no such segment.
func extractTitleID(href string) string {
	// Cut the query/fragment first so IDs are never looked up inside it and
	// "tt123?ref_=x" does not leak the query into the ID.
	if i := strings.IndexAny(href, "?#"); i >= 0 {
		href = href[:i]
	}

	for _, seg := range strings.Split(href, "/") {
		if len(seg) <= 2 || !strings.HasPrefix(seg, "tt") {
			continue
		}
		// Trimming digits from both ends leaves "" only when every
		// remaining character is a digit.
		if strings.Trim(seg[2:], "0123456789") == "" {
			return seg
		}
	}
	return ""
}

// extractImage returns the image URL from a JSON-LD "image" value, which may
// be a plain string, an object with a string "url" member, or an array of
// either (the first element is used). Any other shape yields "".
func extractImage(v any) string {
	switch img := v.(type) {
	case string:
		return img
	case map[string]any:
		if src, ok := img["url"].(string); ok {
			return src
		}
	case []any:
		if len(img) > 0 {
			return extractImage(img[0])
		}
	}
	return ""
}

// extractGenres returns the genres from a JSON-LD "genre" value, which may be
// a single string or an array. Non-string array elements are skipped; any
// other shape yields nil.
func extractGenres(v any) []string {
	switch g := v.(type) {
	case string:
		return []string{g}
	case []any:
		var genres []string
		for _, item := range g {
			if s, ok := item.(string); ok {
				genres = append(genres, s)
			}
		}
		return genres
	}
	return nil
}

// extractPerson returns the "name" of a JSON-LD person value. For an array,
// the first element is used. It returns "" when no string name is present.
func extractPerson(v any) string {
	switch p := v.(type) {
	case map[string]any:
		if name, ok := p["name"].(string); ok {
			return name
		}
	case []any:
		if len(p) > 0 {
			return extractPerson(p[0])
		}
	}
	return ""
}

// extractPersonList returns the names of all people in a JSON-LD value that
// is either an array of person values or a single person object. Entries
// without a name are skipped; it returns nil when nothing is found.
func extractPersonList(v any) []string {
	switch a := v.(type) {
	case []any:
		var people []string
		for _, item := range a {
			if name := extractPerson(item); name != "" {
				people = append(people, name)
			}
		}
		return people
	case map[string]any:
		if name := extractPerson(a); name != "" {
			return []string{name}
		}
	}
	return nil
}

// extractYear returns the first plausible year (1888-2100) found in s, or 0
// if there is none.
//
// A candidate is four characters that start a run of digits; positions in
// the middle of a longer number are skipped so that e.g. "120000" does not
// yield 2000, while compact dates such as "20100716" still yield 2010.
func extractYear(s string) int {
	const (
		minYear = 1888
		maxYear = 2100
	)

	for i := 0; i+4 <= len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			continue
		}
		if i > 0 && s[i-1] >= '0' && s[i-1] <= '9' {
			continue // inside a longer digit run, not the start of a number
		}
		// Atoi rejects chunks that contain non-digits, e.g. "19)a".
		if y, err := strconv.Atoi(s[i : i+4]); err == nil && y >= minYear && y <= maxYear {
			return y
		}
	}
	return 0
}

// formatDuration converts ISO 8601 duration (PT2H30M) to human-readable form.
//
// Only the hour (H) and minute (M) components are rendered, e.g. "2h 30m".
// Any other designator (days, seconds, the P/T markers) discards the number
// before it, so its digits never run into the next component. When no hour
// or minute component is found, the input is returned with a leading "PT" or
// "pt" removed. An empty input, or one that is only the prefix, yields "".
func formatDuration(iso string) string {
	iso = strings.TrimPrefix(iso, "PT")
	iso = strings.TrimPrefix(iso, "pt")
	if iso == "" {
		return ""
	}

	var (
		parts []string
		num   strings.Builder // digits of the component currently being read
	)
	for _, c := range iso {
		switch {
		case c >= '0' && c <= '9':
			num.WriteRune(c)
		case (c == '.' || c == ',') && num.Len() > 0:
			// Decimal separator inside a number: keep it so "1.5H" stays
			// "1.5h" rather than collapsing to "15h".
			num.WriteRune(c)
		case c == 'H' || c == 'h':
			if num.Len() > 0 {
				parts = append(parts, num.String()+"h")
			}
			num.Reset()
		case c == 'M' || c == 'm':
			if num.Len() > 0 {
				parts = append(parts, num.String()+"m")
			}
			num.Reset()
		default:
			// Unrendered designator: drop its number instead of letting
			// the digits prefix the next component ("P1DT2H" is not "12h").
			num.Reset()
		}
	}

	if len(parts) == 0 {
		return iso
	}
	return strings.Join(parts, " ")
}

// toFloat converts a loosely-typed JSON value to float64. It accepts a JSON
// number or a string beginning with a number; anything else, including an
// unparsable string, yields 0.
func toFloat(v any) float64 {
	switch f := v.(type) {
	case float64:
		return f
	case string:
		var val float64
		if _, err := fmt.Sscanf(f, "%f", &val); err != nil {
			return 0
		}
		return val
	}
	return 0
}

// toInt converts a loosely-typed JSON value to int. It accepts a JSON number
// (truncated toward zero) or a string of digits that may contain thousands
// separators and surrounding whitespace; anything else yields 0.
func toInt(v any) int {
	switch i := v.(type) {
	case float64:
		return int(i)
	case string:
		val, err := strconv.Atoi(strings.ReplaceAll(strings.TrimSpace(i), ",", ""))
		if err != nil {
			return 0
		}
		return val
	}
	return 0
}