feature: add IMDB movie/TV extractor
Add sites/imdb package with GetMovie() and Search() methods. Extracts title, year, rating, votes, runtime, genres, director, cast, plot, poster, and box office data. Uses JSON-LD parsing with DOM fallback. Supports Movie, TVSeries, and TVMiniSeries types.

Closes #30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
366
sites/imdb/imdb.go
Normal file
366
sites/imdb/imdb.go
Normal file
@@ -0,0 +1,366 @@
|
||||
package imdb
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/url"
	"strconv"
	"strings"
	"time"

	"gitea.stevedudenhoeffer.com/steve/go-extractor"
)
|
||||
|
||||
// Movie is the structured record extracted from a single IMDB title page.
// Fields the page does not provide are left at their zero value.
type Movie struct {
	// ID is the IMDB title identifier, e.g. "tt1234567".
	ID string
	// Title is the title's display name.
	Title string
	// Year is the four-digit year found in the publication date; 0 if none.
	Year int
	// Rating is the IMDB score (0-10).
	Rating float64
	// Votes is the number of ratings behind Rating.
	Votes int
	// Runtime is the running time in a short readable form such as "2h 30m".
	Runtime string
	// Genres lists the genre names attached to the title.
	Genres []string
	// Director is the name of the first listed director.
	Director string
	// Cast lists the names of the listed actors.
	Cast []string
	// Plot is the title's description text.
	Plot string
	// PosterURL is the address of the poster image.
	PosterURL string
	// BoxOffice is a dollar amount as displayed on the page, e.g. "$200,000,000".
	BoxOffice string
}
|
||||
|
||||
// SearchResult is one entry from an IMDB title search.
type SearchResult struct {
	// ID is the IMDB title identifier taken from the result's link.
	ID string
	// Title is the trimmed link text of the result.
	Title string
	// Year is the four-digit year shown for the result; 0 if none was found.
	Year int
}
|
||||
|
||||
// Config carries the settings of the IMDB extractor. It has no fields at
// present, so its zero value is ready to use.
type Config struct{}

// DefaultConfig is the configuration used by the package-level GetMovie and
// Search helpers.
var DefaultConfig Config

// validate returns the configuration to use for a request. Config has no
// fields to normalise, so the receiver is handed back unchanged.
func (c Config) validate() Config {
	validated := c
	return validated
}
|
||||
|
||||
// titleURL returns the IMDB title page URL for the given title ID
// (e.g. "tt1234567").
//
// The ID is escaped as a single path segment, so an ID containing "/", "?"
// or similar characters cannot alter the path or add a query string to the
// requested URL. Well-formed IDs are emitted unchanged.
func titleURL(id string) string {
	return "https://www.imdb.com/title/" + url.PathEscape(id) + "/"
}
|
||||
|
||||
// findURL returns the IMDB search URL for the given free-text query.
//
// The query is percent-encoded as a URL query value (spaces become "+"), so
// characters such as "&", "+", "#" and "%" are sent as part of the search
// text instead of corrupting the query string. The fixed "s=tt&ttype=ft"
// parameters are appended unchanged.
func findURL(query string) string {
	return "https://www.imdb.com/find/?q=" + url.QueryEscape(query) + "&s=tt&ttype=ft"
}
|
||||
|
||||
// GetMovie extracts structured movie data from an IMDB title page.
|
||||
func (c Config) GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) {
|
||||
c = c.validate()
|
||||
|
||||
u := titleURL(id)
|
||||
|
||||
slog.Info("fetching imdb title", "url", u, "id", id)
|
||||
doc, err := b.Open(ctx, u, extractor.OpenPageOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open imdb page: %w", err)
|
||||
}
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
timeout := 10 * time.Second
|
||||
if err := doc.WaitForNetworkIdle(&timeout); err != nil {
|
||||
slog.Warn("WaitForNetworkIdle failed", "err", err)
|
||||
}
|
||||
|
||||
return extractMovie(doc, id)
|
||||
}
|
||||
|
||||
// GetMovie is a convenience function using DefaultConfig.
|
||||
func GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) {
|
||||
return DefaultConfig.GetMovie(ctx, b, id)
|
||||
}
|
||||
|
||||
// Search searches IMDB for titles matching the query.
|
||||
func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) {
|
||||
c = c.validate()
|
||||
|
||||
u := findURL(query)
|
||||
|
||||
slog.Info("searching imdb", "url", u, "query", query)
|
||||
doc, err := b.Open(ctx, u, extractor.OpenPageOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open imdb search: %w", err)
|
||||
}
|
||||
defer extractor.DeferClose(doc)
|
||||
|
||||
timeout := 10 * time.Second
|
||||
if err := doc.WaitForNetworkIdle(&timeout); err != nil {
|
||||
slog.Warn("WaitForNetworkIdle failed", "err", err)
|
||||
}
|
||||
|
||||
return extractSearchResults(doc)
|
||||
}
|
||||
|
||||
// Search is a convenience function using DefaultConfig.
|
||||
func Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) {
|
||||
return DefaultConfig.Search(ctx, b, query)
|
||||
}
|
||||
|
||||
// jsonLDMovie mirrors the subset of the JSON-LD document embedded in an IMDB
// title page that this package reads.
//
// Properties typed as any are decoded loosely because the helpers that
// consume them accept more than one JSON shape (a bare string, an object, or
// an array).
type jsonLDMovie struct {
	// Type is the JSON-LD "@type" of the document.
	Type string `json:"@type"`
	// Name is the title's display name.
	Name string `json:"name"`
	// Description is the plot summary.
	Description string `json:"description"`
	// DatePublished is the release date string; only its year is used.
	DatePublished string `json:"datePublished"`
	// Image is read as either a URL string or an object with a "url" key.
	Image any `json:"image"`
	// Duration is the running time, expected as an ISO 8601 duration such as "PT2H30M".
	Duration string `json:"duration"`
	// Genre is read as either a single string or an array of strings.
	Genre any `json:"genre"`
	// Director is read as an object with a "name" key, or an array of them.
	Director any `json:"director"`
	// Actor is read as an object with a "name" key, or an array of them.
	Actor any `json:"actor"`
	// Rating is the "aggregateRating" object; nil when the property is absent.
	Rating *struct {
		// RatingValue is read as either a JSON number or a numeric string.
		RatingValue any `json:"ratingValue"`
		// RatingCount is read as either a JSON number or a numeric string.
		RatingCount any `json:"ratingCount"`
	} `json:"aggregateRating"`
}
|
||||
|
||||
func extractMovie(doc extractor.Node, id string) (*Movie, error) {
|
||||
m := &Movie{ID: id}
|
||||
|
||||
// Try JSON-LD first (most reliable on IMDB)
|
||||
scripts := doc.Select("script[type='application/ld+json']")
|
||||
for _, script := range scripts {
|
||||
txt, err := script.Text()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var jm jsonLDMovie
|
||||
if err := json.Unmarshal([]byte(txt), &jm); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if jm.Type == "Movie" || jm.Type == "TVSeries" || jm.Type == "TVMiniSeries" {
|
||||
m.Title = jm.Name
|
||||
m.Plot = jm.Description
|
||||
m.Runtime = formatDuration(jm.Duration)
|
||||
m.PosterURL = extractImage(jm.Image)
|
||||
m.Genres = extractGenres(jm.Genre)
|
||||
m.Director = extractPerson(jm.Director)
|
||||
m.Cast = extractPersonList(jm.Actor)
|
||||
|
||||
if jm.DatePublished != "" {
|
||||
m.Year = extractYear(jm.DatePublished)
|
||||
}
|
||||
|
||||
if jm.Rating != nil {
|
||||
m.Rating = toFloat(jm.Rating.RatingValue)
|
||||
m.Votes = toInt(jm.Rating.RatingCount)
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Box office from DOM (not in JSON-LD)
|
||||
_ = doc.ForEach("li[data-testid='title-boxoffice-section'] span", func(n extractor.Node) error {
|
||||
txt, _ := n.Text()
|
||||
if strings.HasPrefix(txt, "$") {
|
||||
m.BoxOffice = txt
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Fallback: title from DOM if JSON-LD didn't work
|
||||
if m.Title == "" {
|
||||
titles := doc.Select("h1[data-testid='hero__pageTitle'] span")
|
||||
if len(titles) == 0 {
|
||||
titles = doc.Select("h1")
|
||||
}
|
||||
if len(titles) > 0 {
|
||||
m.Title, _ = titles[0].Text()
|
||||
m.Title = strings.TrimSpace(m.Title)
|
||||
}
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func extractSearchResults(doc extractor.Node) ([]SearchResult, error) {
|
||||
var results []SearchResult
|
||||
|
||||
_ = doc.ForEach("li.find-result-item", func(n extractor.Node) error {
|
||||
var sr SearchResult
|
||||
|
||||
// Title link contains the ID in href
|
||||
links := n.Select("a.ipc-metadata-list-summary-item__t")
|
||||
if len(links) > 0 {
|
||||
sr.Title, _ = links[0].Text()
|
||||
sr.Title = strings.TrimSpace(sr.Title)
|
||||
|
||||
href, _ := links[0].Attr("href")
|
||||
sr.ID = extractTitleID(href)
|
||||
}
|
||||
|
||||
// Year
|
||||
years := n.Select("span.ipc-metadata-list-summary-item__li")
|
||||
if len(years) > 0 {
|
||||
txt, _ := years[0].Text()
|
||||
sr.Year = extractYear(txt)
|
||||
}
|
||||
|
||||
if sr.Title != "" {
|
||||
results = append(results, sr)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// extractTitleID extracts an IMDB title ID such as "tt1234567" from a URL or
// URL path like "/title/tt1234567/".
//
// Any query string or fragment is ignored, so "/title/tt1234567?ref_=fn"
// also yields "tt1234567". A path segment only counts as an ID when it is
// "tt" followed by one or more ASCII digits; the first such segment is
// returned. If none is found the result is "".
func extractTitleID(href string) string {
	// Drop "?query" and "#fragment" so they cannot end up glued to the ID
	// when the path has no trailing slash.
	if cut := strings.IndexAny(href, "?#"); cut >= 0 {
		href = href[:cut]
	}

	for _, seg := range strings.Split(href, "/") {
		digits, ok := strings.CutPrefix(seg, "tt")
		if !ok || digits == "" {
			continue
		}
		nonDigit := strings.IndexFunc(digits, func(r rune) bool { return r < '0' || r > '9' })
		if nonDigit < 0 {
			return seg
		}
	}
	return ""
}
|
||||
|
||||
// extractImage returns the image URL held in a decoded JSON-LD "image"
// value. A string is returned as is; for an object, its string-valued "url"
// entry is returned. Any other shape yields "".
func extractImage(v interface{}) string {
	if direct, ok := v.(string); ok {
		return direct
	}
	if obj, ok := v.(map[string]interface{}); ok {
		if link, isString := obj["url"].(string); isString {
			return link
		}
	}
	return ""
}
|
||||
|
||||
// extractGenres returns the genre names held in a decoded JSON-LD "genre"
// value. A single string becomes a one-element slice; for an array, the
// string elements are kept in order and everything else is skipped. Any
// other shape, or an array with no strings, yields nil.
func extractGenres(v interface{}) []string {
	if single, ok := v.(string); ok {
		return []string{single}
	}

	list, ok := v.([]interface{})
	if !ok {
		return nil
	}

	var names []string
	for i := range list {
		name, isString := list[i].(string)
		if !isString {
			continue
		}
		names = append(names, name)
	}
	return names
}
|
||||
|
||||
// extractPerson returns a person's name from a decoded JSON-LD value. For an
// object it is the string-valued "name" entry. For an array, only the first
// element is considered, and it is resolved by the same rules. Anything
// else, including an empty array or an object without a string name, yields
// "".
func extractPerson(v interface{}) string {
	for {
		switch cur := v.(type) {
		case map[string]interface{}:
			// A missing or non-string name leaves the zero value "".
			name, _ := cur["name"].(string)
			return name
		case []interface{}:
			if len(cur) == 0 {
				return ""
			}
			// Descend into the first element and evaluate it next.
			v = cur[0]
		default:
			return ""
		}
	}
}
|
||||
|
||||
func extractPersonList(v interface{}) []string {
|
||||
switch a := v.(type) {
|
||||
case []interface{}:
|
||||
var people []string
|
||||
for _, item := range a {
|
||||
if name := extractPerson(item); name != "" {
|
||||
people = append(people, name)
|
||||
}
|
||||
}
|
||||
return people
|
||||
case map[string]interface{}:
|
||||
if name := extractPerson(a); name != "" {
|
||||
return []string{name}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// extractYear returns the first plausible year found in s. It scans left to
// right for a four-byte window that starts with a digit, parses as an
// integer, and lies between 1888 and 2100 inclusive. If no window qualifies
// the result is 0.
func extractYear(s string) int {
	const (
		width   = 4
		minYear = 1888
		maxYear = 2100
	)

	for start := 0; start+width <= len(s); start++ {
		if first := s[start]; first < '0' || first > '9' {
			continue
		}
		year, err := strconv.Atoi(s[start : start+width])
		if err != nil || year < minYear || year > maxYear {
			continue
		}
		return year
	}
	return 0
}
|
||||
|
||||
// formatDuration converts the time part of an ISO 8601 duration (e.g.
// "PT2H30M") to a short human-readable form ("2h 30m").
//
// A leading "PT" is stripped case-insensitively. Hour and minute components
// are emitted in the order they appear; a seconds component is accepted but
// not shown. A decimal fraction inside a number is kept ("PT1.5H" gives
// "1.5h").
//
// Input this function does not understand is returned as is (minus the "PT"
// prefix) instead of being guessed at: anything with other characters, such
// as the date part in "P1DT2H", or with no hour or minute component, such as
// "PT45S". An empty duration yields "".
func formatDuration(iso string) string {
	if len(iso) >= 2 && strings.EqualFold(iso[:2], "PT") {
		iso = iso[2:]
	}
	if iso == "" {
		return ""
	}

	var parts []string
	numStart := -1 // index of the pending number's first digit; -1 when none

	for i := 0; i < len(iso); i++ {
		c := iso[i]
		switch {
		case c >= '0' && c <= '9':
			if numStart < 0 {
				numStart = i
			}
		case (c == '.' || c == ',') && numStart >= 0:
			// Decimal separator inside a number: keep it as part of the number.
		case c == 'H' || c == 'h':
			if numStart >= 0 {
				parts = append(parts, iso[numStart:i]+"h")
			}
			numStart = -1
		case c == 'M' || c == 'm':
			if numStart >= 0 {
				parts = append(parts, iso[numStart:i]+"m")
			}
			numStart = -1
		case c == 'S' || c == 's':
			// Seconds are not shown, but the number must still be consumed so
			// its digits cannot leak into a later component.
			numStart = -1
		default:
			// Unsupported designator (date part, sign, whitespace, ...):
			// report the raw value; a wrong runtime would be worse.
			return iso
		}
	}

	if len(parts) == 0 {
		return iso
	}
	return strings.Join(parts, " ")
}
|
||||
|
||||
// toFloat converts a decoded JSON value to a float64. A float64 is returned
// unchanged and a string is scanned as a floating-point number. Any other
// type, or a string that does not scan, yields 0.
func toFloat(v interface{}) float64 {
	if num, ok := v.(float64); ok {
		return num
	}

	text, ok := v.(string)
	if !ok {
		return 0
	}

	var parsed float64
	// The scan error is deliberately dropped: on failure parsed stays 0,
	// which is the documented fallback.
	_, _ = fmt.Sscanf(text, "%f", &parsed)
	return parsed
}
|
||||
|
||||
// toInt converts a decoded JSON value to an int. A float64 is truncated
// toward zero; a string has its commas removed and is then parsed as a
// base-10 integer. Any other type, or a string that does not parse, yields 0.
func toInt(v interface{}) int {
	if num, ok := v.(float64); ok {
		return int(num)
	}

	text, ok := v.(string)
	if !ok {
		return 0
	}

	// Thousands separators ("1,234,567") are dropped before parsing. A parse
	// error leaves n at 0, which is the documented fallback.
	digits := strings.ReplaceAll(text, ",", "")
	n, _ := strconv.Atoi(digits)
	return n
}
|
||||
Reference in New Issue
Block a user