From c2768e2b05118f55777db3f58c282ca9a0a42603 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer
Date: Sun, 15 Feb 2026 16:54:30 +0000
Subject: [PATCH] feature: add IMDB movie/TV extractor

Add sites/imdb package with GetMovie() and Search() methods. Extracts
title, year, rating, votes, runtime, genres, director, cast, plot,
poster, and box office data. Uses JSON-LD parsing with DOM fallback.
Supports Movie, TVSeries, and TVMiniSeries types.

Closes #30

Co-Authored-By: Claude Opus 4.6
---
 sites/imdb/imdb.go      | 366 ++++++++++++++++++++++++++++++++++++++++
 sites/imdb/imdb_test.go | 331 ++++++++++++++++++++++++++++++++++++
 2 files changed, 697 insertions(+)
 create mode 100644 sites/imdb/imdb.go
 create mode 100644 sites/imdb/imdb_test.go

diff --git a/sites/imdb/imdb.go b/sites/imdb/imdb.go
new file mode 100644
index 0000000..34bed9d
--- /dev/null
+++ b/sites/imdb/imdb.go
@@ -0,0 +1,366 @@
+package imdb
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"net/url"
+	"strconv"
+	"strings"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+)
+
+// Movie holds structured movie/TV information from IMDB.
+type Movie struct {
+	ID        string // e.g. "tt1234567"
+	Title     string
+	Year      int
+	Rating    float64 // IMDB score (0-10)
+	Votes     int
+	Runtime   string
+	Genres    []string
+	Director  string
+	Cast      []string
+	Plot      string
+	PosterURL string
+	BoxOffice string // e.g. "$200,000,000"
+}
+
+// SearchResult holds a search result entry from IMDB.
+type SearchResult struct {
+	ID    string
+	Title string
+	Year  int
+}
+
+// Config holds configuration for the IMDB extractor.
+type Config struct{}
+
+// DefaultConfig is the default IMDB configuration.
+var DefaultConfig = Config{}
+
+func (c Config) validate() Config {
+	return c
+}
+
+// titleURL returns the IMDB URL for a given title ID.
+func titleURL(id string) string {
+	return fmt.Sprintf("https://www.imdb.com/title/%s/", id)
+}
+
+// findURL returns the IMDB search URL for a given query.
+// The query is escaped with url.QueryEscape: spaces still become "+", and
+// characters that are significant inside a query string ("&", "#", "%",
+// "+", "?", non-ASCII text) are percent-encoded instead of corrupting the
+// URL or being read as extra parameters.
+func findURL(query string) string {
+	return "https://www.imdb.com/find/?q=" + url.QueryEscape(query) + "&s=tt&ttype=ft"
+}
+
+// networkIdleTimeout bounds how long GetMovie and Search wait for a freshly
+// opened page to stop issuing network requests before extracting from it.
+const networkIdleTimeout = 10 * time.Second
+
+// GetMovie extracts structured movie data from an IMDB title page.
+//
+// It opens the title page for id in b, waits up to networkIdleTimeout for
+// the network to go idle, and passes the document to extractMovie. A failed
+// wait is only logged: extraction is still attempted on whatever has loaded.
+// An error is returned when the page cannot be opened or when extractMovie
+// reports one.
+func (c Config) GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) {
+	c = c.validate()
+
+	u := titleURL(id)
+
+	slog.Info("fetching imdb title", "url", u, "id", id)
+	doc, err := b.Open(ctx, u, extractor.OpenPageOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to open imdb page: %w", err)
+	}
+	defer extractor.DeferClose(doc)
+
+	// WaitForNetworkIdle takes a pointer, so the constant needs a local copy.
+	timeout := networkIdleTimeout
+	if err := doc.WaitForNetworkIdle(&timeout); err != nil {
+		slog.Warn("WaitForNetworkIdle failed", "err", err)
+	}
+
+	return extractMovie(doc, id)
+}
+
+// GetMovie is a convenience function using DefaultConfig.
+func GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) {
+	return DefaultConfig.GetMovie(ctx, b, id)
+}
+
+// Search searches IMDB for titles matching the query.
+//
+// It opens the find page built by findURL in b, waits up to
+// networkIdleTimeout for the network to go idle (a failed wait is only
+// logged), and returns whatever extractSearchResults finds in the document.
+// An error is returned when the page cannot be opened or when
+// extractSearchResults reports one.
+func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) {
+	c = c.validate()
+
+	u := findURL(query)
+
+	slog.Info("searching imdb", "url", u, "query", query)
+	doc, err := b.Open(ctx, u, extractor.OpenPageOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to open imdb search: %w", err)
+	}
+	defer extractor.DeferClose(doc)
+
+	// WaitForNetworkIdle takes a pointer, so the constant needs a local copy.
+	timeout := networkIdleTimeout
+	if err := doc.WaitForNetworkIdle(&timeout); err != nil {
+		slog.Warn("WaitForNetworkIdle failed", "err", err)
+	}
+
+	return extractSearchResults(doc)
+}
+
+// Search is a convenience function using DefaultConfig.
+func Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) {
+	return DefaultConfig.Search(ctx, b, query)
+}
+
+// jsonLDMovie represents the JSON-LD Movie schema from IMDB.
+//
+// Fields that JSON-LD allows in more than one shape (a string, an object or
+// an array) are decoded as any and normalised afterwards: Type by
+// isTitleType, the others by the extract*/to* helpers.
+type jsonLDMovie struct {
+	Type          any    `json:"@type"`
+	Name          string `json:"name"`
+	Description   string `json:"description"`
+	DatePublished string `json:"datePublished"`
+	Image         any    `json:"image"`
+	Duration      string `json:"duration"`
+	Genre         any    `json:"genre"`
+	Director      any    `json:"director"`
+	Actor         any    `json:"actor"`
+	Rating        *struct {
+		RatingValue any `json:"ratingValue"`
+		RatingCount any `json:"ratingCount"`
+	} `json:"aggregateRating"`
+}
+
+// isTitleType reports whether a decoded JSON-LD "@type" value names one of
+// the supported title kinds (Movie, TVSeries, TVMiniSeries). The value may
+// be a single string or an array of strings; any other shape is rejected.
+func isTitleType(v any) bool {
+	switch t := v.(type) {
+	case string:
+		switch t {
+		case "Movie", "TVSeries", "TVMiniSeries":
+			return true
+		}
+	case []any:
+		for _, item := range t {
+			if isTitleType(item) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// applyJSONLD copies the fields of a decoded JSON-LD title object into m.
+func applyJSONLD(m *Movie, jm *jsonLDMovie) {
+	m.Title = strings.TrimSpace(jm.Name)
+	m.Plot = jm.Description
+	m.Runtime = formatDuration(jm.Duration)
+	m.PosterURL = extractImage(jm.Image)
+	m.Genres = extractGenres(jm.Genre)
+	m.Director = extractPerson(jm.Director)
+	m.Cast = extractPersonList(jm.Actor)
+	// extractYear yields 0 for an empty or year-less date, which is also
+	// Movie.Year's zero value, so no emptiness check is needed here.
+	m.Year = extractYear(jm.DatePublished)
+
+	if jm.Rating != nil {
+		m.Rating = toFloat(jm.Rating.RatingValue)
+		m.Votes = toInt(jm.Rating.RatingCount)
+	}
+}
+
+// extractMovie builds a Movie for id from an already loaded title page.
+//
+// Data comes from three places, in this order:
+//  1. the first JSON-LD script whose "@type" is a supported title kind;
+//     scripts that cannot be read or decoded are skipped;
+//  2. the box-office section of the DOM, which is not part of the JSON-LD;
+//  3. the page heading, used for the title only when step 1 produced none.
+//
+// Extraction is best effort: a field that cannot be found keeps its zero
+// value, and the returned error is currently always nil.
+func extractMovie(doc extractor.Node, id string) (*Movie, error) {
+	m := &Movie{ID: id}
+
+	// Try JSON-LD first (most reliable on IMDB).
+	for _, script := range doc.Select("script[type='application/ld+json']") {
+		txt, err := script.Text()
+		if err != nil {
+			continue
+		}
+
+		var jm jsonLDMovie
+		if err := json.Unmarshal([]byte(txt), &jm); err != nil {
+			continue
+		}
+		if !isTitleType(jm.Type) {
+			continue
+		}
+
+		applyJSONLD(m, &jm)
+		break
+	}
+
+	// Box office from DOM (not in JSON-LD). The callback never fails, so the
+	// ForEach error carries no information and is deliberately discarded.
+	_ = doc.ForEach("li[data-testid='title-boxoffice-section'] span", func(n extractor.Node) error {
+		txt, err := n.Text()
+		if err != nil {
+			return nil
+		}
+		// Trim first: surrounding whitespace would otherwise hide the "$"
+		// prefix and leak into the stored value. Every matching span
+		// overwrites the previous one, so the last dollar amount wins.
+		if txt = strings.TrimSpace(txt); strings.HasPrefix(txt, "$") {
+			m.BoxOffice = txt
+		}
+		return nil
+	})
+
+	// Fallback: title from DOM if JSON-LD didn't work.
+	if m.Title == "" {
+		titles := doc.Select("h1[data-testid='hero__pageTitle'] span")
+		if len(titles) == 0 {
+			titles = doc.Select("h1")
+		}
+		if len(titles) > 0 {
+			heading, _ := titles[0].Text()
+			m.Title = strings.TrimSpace(heading)
+		}
+	}
+
+	return m, nil
+}
+
+// extractSearchResults collects the title results of an IMDB find page.
+//
+// Each "li.find-result-item" contributes one SearchResult: the title text
+// and ID come from its title link, the year from the first metadata span
+// that contains a plausible year. Items without a title are dropped. The
+// returned error is currently always nil.
+func extractSearchResults(doc extractor.Node) ([]SearchResult, error) {
+	var results []SearchResult
+
+	// The callback never fails, so the ForEach error is deliberately discarded.
+	_ = doc.ForEach("li.find-result-item", func(n extractor.Node) error {
+		var sr SearchResult
+
+		// Title link contains the ID in href.
+		if links := n.Select("a.ipc-metadata-list-summary-item__t"); len(links) > 0 {
+			title, _ := links[0].Text()
+			sr.Title = strings.TrimSpace(title)
+
+			href, _ := links[0].Attr("href")
+			sr.ID = extractTitleID(href)
+		}
+
+		// Year: take it from the first metadata span that actually holds one
+		// rather than assuming it is always the first span.
+		for _, span := range n.Select("span.ipc-metadata-list-summary-item__li") {
+			txt, _ := span.Text()
+			if y := extractYear(txt); y != 0 {
+				sr.Year = y
+				break
+			}
+		}
+
+		if sr.Title != "" {
+			results = append(results, sr)
+		}
+		return nil
+	})
+
+	return results, nil
+}
+
+// extractTitleID extracts "tt1234567" from an IMDB URL path like "/title/tt1234567/".
+//
+// A query string or fragment is ignored, so "/title/tt1234567?ref_=x" also
+// yields "tt1234567". Only a path segment made of "tt" followed by digits
+// counts; "" is returned when there is none.
+func extractTitleID(href string) string {
+	if i := strings.IndexAny(href, "?#"); i >= 0 {
+		href = href[:i]
+	}
+	for _, seg := range strings.Split(href, "/") {
+		if isTitleID(seg) {
+			return seg
+		}
+	}
+	return ""
+}
+
+// isTitleID reports whether s is "tt" followed by one or more ASCII digits.
+func isTitleID(s string) bool {
+	digits, ok := strings.CutPrefix(s, "tt")
+	if !ok || digits == "" {
+		return false
+	}
+	for i := 0; i < len(digits); i++ {
+		if !isDigit(digits[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+// isDigit reports whether c is an ASCII decimal digit.
+func isDigit(c byte) bool {
+	return c >= '0' && c <= '9'
+}
+
+// extractImage returns the image URL from a JSON-LD "image" value, which may
+// be a plain URL string, an object with a "url" member, or an array of
+// either (the first usable entry wins). Anything else yields "".
+func extractImage(v any) string {
+	switch img := v.(type) {
+	case string:
+		return img
+	case map[string]any:
+		if u, ok := img["url"].(string); ok {
+			return u
+		}
+	case []any:
+		for _, item := range img {
+			if u := extractImage(item); u != "" {
+				return u
+			}
+		}
+	}
+	return ""
+}
+
+// extractGenres returns the genres from a JSON-LD "genre" value, which may
+// be a single string or an array; non-string array entries are skipped.
+func extractGenres(v any) []string {
+	switch g := v.(type) {
+	case string:
+		return []string{g}
+	case []any:
+		var genres []string
+		for _, item := range g {
+			if s, ok := item.(string); ok {
+				genres = append(genres, s)
+			}
+		}
+		return genres
+	}
+	return nil
+}
+
+// extractPerson returns one person's name from a JSON-LD value: a plain
+// name string, an object with a "name" member, or an array of those, in
+// which case the first element is used. Anything else yields "".
+func extractPerson(v any) string {
+	switch p := v.(type) {
+	case string:
+		return p
+	case map[string]any:
+		if name, ok := p["name"].(string); ok {
+			return name
+		}
+	case []any:
+		if len(p) > 0 {
+			return extractPerson(p[0])
+		}
+	}
+	return ""
+}
+
+// extractPersonList returns every non-empty name in a JSON-LD value that is
+// either an array of persons or a single person (string or object).
+func extractPersonList(v any) []string {
+	switch a := v.(type) {
+	case []any:
+		var people []string
+		for _, item := range a {
+			if name := extractPerson(item); name != "" {
+				people = append(people, name)
+			}
+		}
+		return people
+	case string, map[string]any:
+		if name := extractPerson(a); name != "" {
+			return []string{name}
+		}
+	}
+	return nil
+}
+
+// yearInRange parses s and returns it when it is a plausible release year
+// (1888 through 2100); otherwise it returns 0.
+func yearInRange(s string) int {
+	y, err := strconv.Atoi(s)
+	if err != nil || y < 1888 || y > 2100 {
+		return 0
+	}
+	return y
+}
+
+// extractYear finds a plausible year (1888 through 2100) in s, or returns 0.
+//
+// A run of exactly four digits is preferred, so "120199 (1999)" yields 1999
+// rather than the 2019 buried inside the longer number. When no such run
+// qualifies, the first in-range four-digit window inside a longer digit run
+// is used, which keeps compact dates such as "19941014" working.
+func extractYear(s string) int {
+	fallback := 0
+
+	for i := 0; i < len(s); {
+		if !isDigit(s[i]) {
+			i++
+			continue
+		}
+
+		// s[i:j] is the maximal digit run starting at i.
+		j := i
+		for j < len(s) && isDigit(s[j]) {
+			j++
+		}
+		run := s[i:j]
+
+		if len(run) == 4 {
+			if y := yearInRange(run); y != 0 {
+				return y
+			}
+		} else if fallback == 0 {
+			for k := 0; k+4 <= len(run); k++ {
+				if y := yearInRange(run[k : k+4]); y != 0 {
+					fallback = y
+					break
+				}
+			}
+		}
+
+		i = j
+	}
+
+	return fallback
+}
+
+// formatDuration converts ISO 8601 duration (PT2H30M) to human-readable form.
+//
+// Only the hours and minutes of the time part are rendered ("2h 30m");
+// seconds and date components (days, months, ...) are dropped. A number is
+// attached only to the designator that directly follows it, so "P1DT2H"
+// gives "2h" and a date-part "M" (months) is never read as minutes. When no
+// hour or minute component is found, the input is returned with a leading
+// "PT"/"pt" removed.
+func formatDuration(iso string) string {
+	if iso == "" {
+		return ""
+	}
+
+	iso = strings.TrimPrefix(iso, "PT")
+	iso = strings.TrimPrefix(iso, "pt")
+
+	if iso == "" {
+		return ""
+	}
+
+	var parts []string
+	start := -1     // index of the first digit of the pending number, -1 if none
+	inDate := false // true between a "P" and the "T" that opens the time part
+
+	for i := 0; i < len(iso); i++ {
+		c := iso[i]
+		if isDigit(c) {
+			if start < 0 {
+				start = i
+			}
+			continue
+		}
+
+		switch c {
+		case 'P', 'p':
+			inDate = true
+		case 'T', 't':
+			inDate = false
+		case 'H', 'h':
+			if start >= 0 && !inDate {
+				parts = append(parts, iso[start:i]+"h")
+			}
+		case 'M', 'm':
+			if start >= 0 && !inDate {
+				parts = append(parts, iso[start:i]+"m")
+			}
+		}
+		// Any designator consumes the pending number, rendered or not, so
+		// digits can never leak into the next component.
+		start = -1
+	}
+
+	if len(parts) == 0 {
+		return iso
+	}
+
+	return strings.Join(parts, " ")
+}
+
+// toFloat converts a decoded JSON value (number or numeric string) to
+// float64. Unparseable strings and other types yield 0.
+func toFloat(v any) float64 {
+	switch f := v.(type) {
+	case float64:
+		return f
+	case string:
+		val, err := strconv.ParseFloat(strings.TrimSpace(f), 64)
+		if err != nil {
+			return 0
+		}
+		return val
+	}
+	return 0
+}
+
+// toInt converts a decoded JSON value (number or numeric string, optionally
+// with thousands separators such as "2,800,000") to int. Unparseable or
+// out-of-range strings and other types yield 0.
+func toInt(v any) int {
+	switch i := v.(type) {
+	case float64:
+		return int(i)
+	case string:
+		val, err := strconv.Atoi(strings.ReplaceAll(strings.TrimSpace(i), ",", ""))
+		if err != nil {
+			return 0
+		}
+		return val
+	}
+	return 0
+}
diff --git a/sites/imdb/imdb_test.go b/sites/imdb/imdb_test.go
new file mode 100644
index 0000000..01cd846
--- /dev/null
+++ b/sites/imdb/imdb_test.go
@@ -0,0 +1,335 @@
+package imdb
+
+import (
+	"context"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
+)
+
+const sampleMovieJSONLD = `{
+	"@type": "Movie",
+	"name": "The Shawshank Redemption",
+	"description": "Two imprisoned men bond over a number of years.",
+	"datePublished": "1994-10-14",
+	"image": "https://example.com/shawshank.jpg",
+	"duration": "PT2H22M",
+	"genre": ["Drama"],
+	"director": {"@type": "Person", "name": "Frank Darabont"},
+	"actor": [
+		{"@type": "Person", "name": "Tim Robbins"},
+		{"@type": "Person", "name": "Morgan Freeman"}
+	],
+	"aggregateRating": {
+		"ratingValue": "9.3",
+		"ratingCount": "2800000"
+	}
+}`
+
+const sampleTVSeriesJSONLD = `{
+	"@type": "TVSeries",
+	"name": "Breaking Bad",
+	"description": "A chemistry teacher turned drug lord.",
+	"datePublished": "2008-01-20",
+	"duration": "PT49M",
+	"genre": ["Crime", "Drama", "Thriller"],
+	"director": {"@type": "Person", "name": "Vince Gilligan"},
+	"actor": [
+		{"@type": "Person", "name": "Bryan Cranston"},
+		{"@type": "Person", "name": "Aaron Paul"}
+	],
+	"aggregateRating": {
+		"ratingValue": 9.5,
+		"ratingCount": 2100000
+	}
+}`
+
+func makeMovieDoc() *extractortest.MockDocument {
+	return &extractortest.MockDocument{
+		URLValue: "https://www.imdb.com/title/tt0111161/",
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"script[type='application/ld+json']": {
+					&extractortest.MockNode{TextValue: sampleMovieJSONLD},
+				},
+				"li[data-testid='title-boxoffice-section'] span": {
+					&extractortest.MockNode{TextValue: "Gross worldwide"},
+					&extractortest.MockNode{TextValue: "$58,300,000"},
+				},
+			},
+		},
+	}
+}
+
+func TestExtractMovie(t *testing.T) {
+	doc := makeMovieDoc()
+
+	m, err := extractMovie(doc, "tt0111161")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if m.ID != "tt0111161" {
+		t.Errorf("ID = %q, want %q", m.ID, "tt0111161")
+	}
+	if m.Title != "The Shawshank Redemption" {
+		t.Errorf("Title = %q, want %q", m.Title, "The Shawshank Redemption")
+	}
+	if m.Year != 1994 {
+		t.Errorf("Year = %d, want 1994", m.Year)
+	}
+	if m.Rating != 9.3 {
+		t.Errorf("Rating = %v, want 9.3", m.Rating)
+	}
+	if m.Votes != 2800000 {
+		t.Errorf("Votes = %d, want 2800000", m.Votes)
+	}
+	if m.Runtime != "2h 22m" {
+		t.Errorf("Runtime = %q, want %q", m.Runtime, "2h 22m")
+	}
+	if len(m.Genres) != 1 || m.Genres[0] != "Drama" {
+		t.Errorf("Genres = %v, want [Drama]", m.Genres)
+	}
+	if m.Director != "Frank Darabont" {
+		t.Errorf("Director = %q, want %q", m.Director, "Frank Darabont")
+	}
+	if len(m.Cast) != 2 {
+		t.Fatalf("len(Cast) = %d, want 2", len(m.Cast))
+	}
+	if m.Cast[0] != "Tim Robbins" {
+		t.Errorf("Cast[0] = %q, want %q", m.Cast[0], "Tim Robbins")
+	}
+	if m.Plot != "Two imprisoned men bond over a number of years." {
+		t.Errorf("Plot = %q, want correct value", m.Plot)
+	}
+	if m.PosterURL != "https://example.com/shawshank.jpg" {
+		t.Errorf("PosterURL = %q, want %q", m.PosterURL, "https://example.com/shawshank.jpg")
+	}
+	if m.BoxOffice != "$58,300,000" {
+		t.Errorf("BoxOffice = %q, want %q", m.BoxOffice, "$58,300,000")
+	}
+}
+
+func TestExtractMovie_TVSeries(t *testing.T) {
+	doc := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"script[type='application/ld+json']": {
+					&extractortest.MockNode{TextValue: sampleTVSeriesJSONLD},
+				},
+			},
+		},
+	}
+
+	m, err := extractMovie(doc, "tt0903747")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if m.Title != "Breaking Bad" {
+		t.Errorf("Title = %q, want %q", m.Title, "Breaking Bad")
+	}
+	if m.Year != 2008 {
+		t.Errorf("Year = %d, want 2008", m.Year)
+	}
+	if m.Rating != 9.5 {
+		t.Errorf("Rating = %v, want 9.5", m.Rating)
+	}
+	if m.Votes != 2100000 {
+		t.Errorf("Votes = %d, want 2100000", m.Votes)
+	}
+	if len(m.Genres) != 3 {
+		t.Errorf("len(Genres) = %d, want 3", len(m.Genres))
+	}
+	if m.Runtime != "49m" {
+		t.Errorf("Runtime = %q, want %q", m.Runtime, "49m")
+	}
+}
+
+func TestExtractMovie_DOMFallback(t *testing.T) {
+	doc := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"h1[data-testid='hero__pageTitle'] span": {
+					&extractortest.MockNode{TextValue: "Inception"},
+				},
+			},
+		},
+	}
+
+	m, err := extractMovie(doc, "tt1375666")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if m.Title != "Inception" {
+		t.Errorf("Title = %q, want %q", m.Title, "Inception")
+	}
+}
+
+func TestExtractSearchResults(t *testing.T) {
+	doc := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"li.find-result-item": {
+					&extractortest.MockNode{
+						Children: map[string]extractor.Nodes{
+							"a.ipc-metadata-list-summary-item__t": {
+								&extractortest.MockNode{
+									TextValue: "The Matrix",
+									Attrs:     map[string]string{"href": "/title/tt0133093/"},
+								},
+							},
+							"span.ipc-metadata-list-summary-item__li": {
+								&extractortest.MockNode{TextValue: "1999"},
+							},
+						},
+					},
+					&extractortest.MockNode{
+						Children: map[string]extractor.Nodes{
+							"a.ipc-metadata-list-summary-item__t": {
+								&extractortest.MockNode{
+									TextValue: "The Matrix Reloaded",
+									Attrs:     map[string]string{"href": "/title/tt0234215/"},
+								},
+							},
+							"span.ipc-metadata-list-summary-item__li": {
+								&extractortest.MockNode{TextValue: "2003"},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	results, err := extractSearchResults(doc)
+	if err != nil {
+		t.Fatalf("extractSearchResults() error: %v", err)
+	}
+
+	if len(results) != 2 {
+		t.Fatalf("len(results) = %d, want 2", len(results))
+	}
+
+	if results[0].Title != "The Matrix" {
+		t.Errorf("results[0].Title = %q, want %q", results[0].Title, "The Matrix")
+	}
+	if results[0].ID != "tt0133093" {
+		t.Errorf("results[0].ID = %q, want %q", results[0].ID, "tt0133093")
+	}
+	if results[0].Year != 1999 {
+		t.Errorf("results[0].Year = %d, want 1999", results[0].Year)
+	}
+	if results[1].Title != "The Matrix Reloaded" {
+		t.Errorf("results[1].Title = %q, want %q", results[1].Title, "The Matrix Reloaded")
+	}
+}
+
+func TestGetMovie_MockBrowser(t *testing.T) {
+	doc := makeMovieDoc()
+
+	browser := &extractortest.MockBrowser{
+		Documents: map[string]*extractortest.MockDocument{
+			"https://www.imdb.com/title/tt0111161/": doc,
+		},
+	}
+
+	m, err := DefaultConfig.GetMovie(context.Background(), browser, "tt0111161")
+	if err != nil {
+		t.Fatalf("GetMovie() error: %v", err)
+	}
+
+	if m.Title != "The Shawshank Redemption" {
+		t.Errorf("Title = %q, want %q", m.Title, "The Shawshank Redemption")
+	}
+	if m.Rating != 9.3 {
+		t.Errorf("Rating = %v, want 9.3", m.Rating)
+	}
+}
+
+func TestExtractMovie_Empty(t *testing.T) {
+	doc := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{},
+		},
+	}
+
+	m, err := extractMovie(doc, "tt0000000")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if m.Title != "" || m.Rating != 0 {
+		t.Error("expected zero values for empty doc")
+	}
+}
+
+func TestExtractTitleID(t *testing.T) {
+	tests := []struct {
+		input string
+		want  string
+	}{
+		{"/title/tt0111161/", "tt0111161"},
+		{"/title/tt0133093/?ref_=fn_al_tt_1", "tt0133093"},
+		{"/title/tt0133093?ref_=fn_al_tt_1", "tt0133093"},
+		{"/title/ttfoo/", ""},
+		{"/name/nm0000151/", ""},
+	}
+
+	for _, tt := range tests {
+		got := extractTitleID(tt.input)
+		if got != tt.want {
+			t.Errorf("extractTitleID(%q) = %q, want %q", tt.input, got, tt.want)
+		}
+	}
+}
+
+func TestExtractYear(t *testing.T) {
+	tests := []struct {
+		input string
+		want  int
+	}{
+		{"1994-10-14", 1994},
+		{"2008", 2008},
+		{"(1999)", 1999},
+		{"120199 (1999)", 1999},
+		{"no year here", 0},
+	}
+
+	for _, tt := range tests {
+		got := extractYear(tt.input)
+		if got != tt.want {
+			t.Errorf("extractYear(%q) = %d, want %d", tt.input, got, tt.want)
+		}
+	}
+}
+
+func TestFormatDuration(t *testing.T) {
+	tests := []struct {
+		input string
+		want  string
+	}{
+		{"PT2H22M", "2h 22m"},
+		{"PT49M", "49m"},
+		{"PT1H", "1h"},
+		{"P1DT2H30M", "2h 30m"},
+		{"", ""},
+	}
+
+	for _, tt := range tests {
+		got := formatDuration(tt.input)
+		if got != tt.want {
+			t.Errorf("formatDuration(%q) = %q, want %q", tt.input, got, tt.want)
+		}
+	}
+}
+
+func TestTitleURL(t *testing.T) {
+	got := titleURL("tt0111161")
+	want := "https://www.imdb.com/title/tt0111161/"
+	if got != want {
+		t.Errorf("titleURL(\"tt0111161\") = %q, want %q", got, want)
+	}
+}
-- 
2.49.1