From c2768e2b05118f55777db3f58c282ca9a0a42603 Mon Sep 17 00:00:00 2001
From: Steve Dudenhoeffer <steve@stevedudenhoeffer.com>
Date: Sun, 15 Feb 2026 16:54:30 +0000
Subject: [PATCH] feature: add IMDB movie/TV extractor

Add sites/imdb package with GetMovie() and Search() methods. Extracts
title, year, rating, votes, runtime, genres, director, cast, plot,
poster, and box office data. Uses JSON-LD parsing with DOM fallback.
Supports Movie, TVSeries, and TVMiniSeries types.

Closes #30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 sites/imdb/imdb.go      | 366 ++++++++++++++++++++++++++++++++++++++++
 sites/imdb/imdb_test.go | 331 ++++++++++++++++++++++++++++++++++++
 2 files changed, 697 insertions(+)
 create mode 100644 sites/imdb/imdb.go
 create mode 100644 sites/imdb/imdb_test.go

diff --git a/sites/imdb/imdb.go b/sites/imdb/imdb.go
new file mode 100644
index 0000000..34bed9d
--- /dev/null
+++ b/sites/imdb/imdb.go
@@ -0,0 +1,367 @@
+package imdb
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"net/url"
+	"strconv"
+	"strings"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+)
+
+// Movie holds structured movie/TV information from IMDB.
+type Movie struct {
+	ID        string   // IMDB title ID, e.g. "tt1234567"
+	Title     string   // JSON-LD "name", or the page heading when JSON-LD gave none
+	Year      int      // first plausible year found in JSON-LD "datePublished"; 0 if unknown
+	Rating    float64  // IMDB score (0-10)
+	Votes     int      // JSON-LD aggregateRating "ratingCount"
+	Runtime   string   // JSON-LD "duration" reformatted, e.g. "2h 22m"
+	Genres    []string // JSON-LD "genre" entries
+	Director  string   // name of the first listed director only
+	Cast      []string // names from JSON-LD "actor", in document order
+	Plot      string   // JSON-LD "description"
+	PosterURL string   // JSON-LD "image" URL
+	BoxOffice string   // "$"-prefixed text from the box-office section, e.g. "$200,000,000"
+}
+
+// SearchResult holds a search result entry from IMDB.
+type SearchResult struct {
+	ID    string // title ID parsed from the result link's href; empty if none was found
+	Title string // whitespace-trimmed text of the result link
+	Year  int    // year parsed from the first metadata item; 0 if none was found
+}
+
+// Config holds configuration for the IMDB extractor. It currently has no fields.
+type Config struct{}
+
+// DefaultConfig is the Config used by the package-level GetMovie and Search functions.
+var DefaultConfig = Config{}
+
+func (c Config) validate() Config {
+	return c // returned unchanged: Config has no fields to validate or default
+}
+
+// titleURL builds the IMDB title page address for a title ID such as "tt0111161".
+func titleURL(id string) string {
+	return fmt.Sprint("https://www.imdb.com/title/", id, "/")
+}
+
+// findURL returns the IMDB title-search URL for query, percent-encoding it so that "&", "#", "%" or "+" stay inside the q parameter.
+func findURL(query string) string {
+	return fmt.Sprintf("https://www.imdb.com/find/?q=%s&s=tt&ttype=ft", url.QueryEscape(query))
+}
+
+// GetMovie opens the IMDB title page for id and extracts its data. A failed Open
+// is returned as an error; a failed network-idle wait is only logged.
+func (c Config) GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) {
+	c = c.validate()
+	pageURL := titleURL(id)
+
+	slog.Info("fetching imdb title", "url", pageURL, "id", id)
+	page, openErr := b.Open(ctx, pageURL, extractor.OpenPageOptions{})
+	if openErr != nil {
+		return nil, fmt.Errorf("failed to open imdb page: %w", openErr)
+	}
+	defer extractor.DeferClose(page)
+
+	// A 10s timeout is handed to the wait; extraction proceeds even if it fails.
+	idleLimit := 10 * time.Second
+	if idleErr := page.WaitForNetworkIdle(&idleLimit); idleErr != nil {
+		slog.Warn("WaitForNetworkIdle failed", "err", idleErr)
+	}
+	return extractMovie(page, id)
+}
+
+// GetMovie fetches the title with the given id using DefaultConfig.
+func GetMovie(ctx context.Context, b extractor.Browser, id string) (*Movie, error) {
+	return Config.GetMovie(DefaultConfig, ctx, b, id)
+}
+
+// Search opens the IMDB find page for query and returns the title results parsed
+// from it. A failed Open is returned as an error; a failed network-idle wait is only logged.
+func (c Config) Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) {
+	c = c.validate()
+	searchURL := findURL(query)
+
+	slog.Info("searching imdb", "url", searchURL, "query", query)
+	page, openErr := b.Open(ctx, searchURL, extractor.OpenPageOptions{})
+	if openErr != nil {
+		return nil, fmt.Errorf("failed to open imdb search: %w", openErr)
+	}
+	defer extractor.DeferClose(page)
+
+	// A 10s timeout is handed to the wait; parsing proceeds even if it fails.
+	idleLimit := 10 * time.Second
+	if idleErr := page.WaitForNetworkIdle(&idleLimit); idleErr != nil {
+		slog.Warn("WaitForNetworkIdle failed", "err", idleErr)
+	}
+	return extractSearchResults(page)
+}
+
+// Search runs a title search for query using DefaultConfig.
+func Search(ctx context.Context, b extractor.Browser, query string) ([]SearchResult, error) {
+	return Config.Search(DefaultConfig, ctx, b, query)
+}
+
+// jsonLDMovie mirrors the subset of IMDB's JSON-LD title object that extractMovie reads.
+type jsonLDMovie struct {
+	Type          string `json:"@type"`
+	Name          string `json:"name"`
+	Description   string `json:"description"`
+	DatePublished string `json:"datePublished"`
+	Image         any    `json:"image"`    // string or object; decoded by extractImage
+	Duration      string `json:"duration"` // ISO 8601 duration, e.g. "PT2H22M"
+	Genre         any    `json:"genre"`    // string or list; decoded by extractGenres
+	Director      any    `json:"director"` // object or list; decoded by extractPerson
+	Actor         any    `json:"actor"`    // object or list; decoded by extractPersonList
+	Rating        *struct {
+		RatingValue any `json:"ratingValue"` // number or numeric string; see toFloat
+		RatingCount any `json:"ratingCount"` // number or numeric string; see toInt
+	} `json:"aggregateRating"`
+}
+
+// extractMovie builds a Movie for id from an already loaded title page. The
+// first JSON-LD script whose @type is Movie, TVSeries or TVMiniSeries supplies
+// every field except BoxOffice, which is read from the DOM. When JSON-LD gives
+// no title, the page heading is used instead. The returned error is always nil.
+func extractMovie(doc extractor.Node, id string) (*Movie, error) {
+	movie := &Movie{ID: id}
+
+	for _, node := range doc.Select("script[type='application/ld+json']") {
+		raw, err := node.Text()
+		if err != nil {
+			continue
+		}
+		var ld jsonLDMovie
+		if json.Unmarshal([]byte(raw), &ld) != nil {
+			continue // undecodable script; try the next one
+		}
+		switch ld.Type {
+		case "Movie", "TVSeries", "TVMiniSeries":
+		default:
+			continue
+		}
+		movie.Title = ld.Name
+		movie.Plot = ld.Description
+		movie.Runtime = formatDuration(ld.Duration)
+		movie.PosterURL = extractImage(ld.Image)
+		movie.Genres = extractGenres(ld.Genre)
+		movie.Director = extractPerson(ld.Director)
+		movie.Cast = extractPersonList(ld.Actor)
+		if ld.DatePublished != "" {
+			movie.Year = extractYear(ld.DatePublished)
+		}
+		if agg := ld.Rating; agg != nil {
+			movie.Rating = toFloat(agg.RatingValue)
+			movie.Votes = toInt(agg.RatingCount)
+		}
+		break // only the first matching script is used
+	}
+
+	// Box office is not in the JSON-LD; each "$"-prefixed span overwrites the previous value.
+	_ = doc.ForEach("li[data-testid='title-boxoffice-section'] span", func(span extractor.Node) error {
+		if amount, _ := span.Text(); strings.HasPrefix(amount, "$") {
+			movie.BoxOffice = amount
+		}
+		return nil
+	})
+
+	if movie.Title != "" {
+		return movie, nil
+	}
+
+	// No JSON-LD title: fall back to the hero heading, then to the first <h1>.
+	headings := doc.Select("h1[data-testid='hero__pageTitle'] span")
+	if len(headings) == 0 {
+		headings = doc.Select("h1")
+	}
+	if len(headings) > 0 {
+		heading, _ := headings[0].Text()
+		movie.Title = strings.TrimSpace(heading)
+	}
+	return movie, nil
+}
+
+// extractSearchResults collects one SearchResult per "li.find-result-item" on a
+// loaded find page. Items whose title link is missing or has empty text are
+// skipped. The returned error is always nil.
+func extractSearchResults(doc extractor.Node) ([]SearchResult, error) {
+	var found []SearchResult
+
+	visit := func(item extractor.Node) error {
+		var entry SearchResult
+
+		// The title anchor carries both the display text and the ID-bearing href.
+		if anchors := item.Select("a.ipc-metadata-list-summary-item__t"); len(anchors) > 0 {
+			label, _ := anchors[0].Text()
+			entry.Title = strings.TrimSpace(label)
+			href, _ := anchors[0].Attr("href")
+			entry.ID = extractTitleID(href)
+		}
+
+		// The first metadata item is scanned for a year.
+		if meta := item.Select("span.ipc-metadata-list-summary-item__li"); len(meta) > 0 {
+			text, _ := meta[0].Text()
+			entry.Year = extractYear(text)
+		}
+
+		if entry.Title != "" {
+			found = append(found, entry)
+		}
+		return nil
+	}
+	_ = doc.ForEach("li.find-result-item", visit)
+	return found, nil
+}
+
+// extractTitleID returns the first "tt<digits>" segment of an IMDB href such as "/title/tt1234567/?ref_=x", or "" if there is none.
+func extractTitleID(href string) string {
+	// Split on query/fragment markers too, so "/title/tt1234567?ref_=x" (no trailing slash) still yields a clean ID.
+	for _, seg := range strings.FieldsFunc(href, func(r rune) bool { return r == '/' || r == '?' || r == '#' }) {
+		if num, ok := strings.CutPrefix(seg, "tt"); ok && num != "" && strings.Trim(num, "0123456789") == "" {
+			return seg
+		}
+	}
+	return ""
+}
+
+// extractImage returns the URL from a JSON-LD image value: the string itself, or the "url" member of an object; otherwise "".
+func extractImage(v any) string {
+	if direct, isString := v.(string); isString {
+		return direct
+	}
+	if obj, isObject := v.(map[string]any); isObject {
+		link, _ := obj["url"].(string) // "" when the key is missing or not a string
+		return link
+	}
+	return ""
+}
+
+// extractGenres normalises the JSON-LD genre value, which may be one string or a list, into a slice of its string entries.
+func extractGenres(v any) []string {
+	if single, isString := v.(string); isString {
+		return []string{single}
+	}
+
+	list, _ := v.([]any) // nil for any other shape, so the loop below is skipped
+	var names []string
+	for _, entry := range list {
+		if name, isString := entry.(string); isString {
+			names = append(names, name) // non-string entries are dropped
+		}
+	}
+	return names
+}
+
+// extractPerson returns the "name" of a JSON-LD Person object; for a list it uses the first element only. Other shapes yield "".
+func extractPerson(v any) string {
+	if people, isList := v.([]any); isList {
+		if len(people) == 0 {
+			return ""
+		}
+		return extractPerson(people[0])
+	}
+
+	person, _ := v.(map[string]any) // a nil map reads as "no name"
+	name, _ := person["name"].(string)
+	return name
+}
+
+// extractPersonList collects the names from a JSON-LD value holding either a list of Person objects or a single one.
+func extractPersonList(v any) []string {
+	items, isList := v.([]any)
+	if !isList {
+		if _, isObject := v.(map[string]any); !isObject {
+			return nil
+		}
+		items = []any{v} // treat a lone object as a one-element list
+	}
+	var names []string
+	for _, item := range items {
+		if name := extractPerson(item); name != "" {
+			names = append(names, name) // entries without a usable name are skipped
+		}
+	}
+	return names
+}
+
+// extractYear returns the first standalone 4-digit number in s that is a plausible year (1888-2100), or 0 if there is none.
+func extractYear(s string) int {
+	digit := func(b byte) bool { return b >= '0' && b <= '9' }
+	for i := 0; i+4 <= len(s); i++ {
+		// Only standalone 4-digit runs count, so "120000" or "tt0119994" never yields a year.
+		glued := (i > 0 && digit(s[i-1])) || (i+4 < len(s) && digit(s[i+4]))
+		if y, err := strconv.Atoi(s[i : i+4]); err == nil && digit(s[i]) && !glued && y >= 1888 && y <= 2100 {
+			return y
+		}
+	}
+	return 0
+}
+
+// formatDuration converts an ISO 8601 time duration such as "PT2H30M" into a
+// compact human-readable form ("2h 30m"). Hours, minutes and seconds are
+// recognised case-insensitively; each number is paired only with the designator
+// that directly follows it, so digits belonging to an unsupported component
+// (for example the day count in "P1DT2H") never leak into the next one. It
+// returns "" for an empty duration and the prefix-stripped input unchanged when
+// no supported component is found.
+func formatDuration(iso string) string {
+	if len(iso) >= 2 && strings.EqualFold(iso[:2], "PT") {
+		iso = iso[2:]
+	}
+	if iso == "" {
+		return ""
+	}
+
+	var parts []string
+	start := -1 // index of the first digit of the pending number; -1 when none
+	for i := 0; i < len(iso); i++ {
+		c := iso[i]
+		if (c >= '0' && c <= '9') || (c == '.' && start >= 0) {
+			if start < 0 {
+				start = i
+			}
+			continue
+		}
+		if start >= 0 {
+			switch c {
+			case 'H', 'h', 'M', 'm', 'S', 's':
+				parts = append(parts, iso[start:i]+strings.ToLower(string(c)))
+			}
+		}
+		start = -1 // any non-digit ends the pending number, consumed or not
+	}
+
+	if len(parts) == 0 {
+		return iso
+	}
+	return strings.Join(parts, " ")
+}
+
+// toFloat coerces a JSON-LD number, given as a float64 or a numeric string, to float64; any other input yields 0.
+func toFloat(v any) float64 {
+	if num, isNum := v.(float64); isNum {
+		return num
+	}
+	var parsed float64
+	if text, isText := v.(string); isText {
+		_, _ = fmt.Sscanf(text, "%f", &parsed) // best effort: the scan error is deliberately ignored
+	}
+	return parsed
+}
+
+// toInt coerces a JSON-LD count, given as a float64 or a string with optional thousands commas, to int; other types yield 0.
+func toInt(v any) int {
+	if num, isNum := v.(float64); isNum {
+		return int(num)
+	}
+	// A non-string leaves text empty, which Atoi rejects, so the result is 0.
+	text, _ := v.(string)
+	count, _ := strconv.Atoi(strings.ReplaceAll(text, ",", ""))
+	return count
+}
diff --git a/sites/imdb/imdb_test.go b/sites/imdb/imdb_test.go
new file mode 100644
index 0000000..01cd846
--- /dev/null
+++ b/sites/imdb/imdb_test.go
@@ -0,0 +1,331 @@
+package imdb
+
+import (
+	"context"
+	"testing"
+
+	"gitea.stevedudenhoeffer.com/steve/go-extractor"
+	"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
+)
+
+const sampleMovieJSONLD = `{
+  "@type": "Movie",
+  "name": "The Shawshank Redemption",
+  "description": "Two imprisoned men bond over a number of years.",
+  "datePublished": "1994-10-14",
+  "image": "https://example.com/shawshank.jpg",
+  "duration": "PT2H22M",
+  "genre": ["Drama"],
+  "director": {"@type": "Person", "name": "Frank Darabont"},
+  "actor": [
+    {"@type": "Person", "name": "Tim Robbins"},
+    {"@type": "Person", "name": "Morgan Freeman"}
+  ],
+  "aggregateRating": {
+    "ratingValue": "9.3",
+    "ratingCount": "2800000"
+  }
+}` // Movie fixture; its rating fields are JSON strings, so the string branches of toFloat/toInt are exercised
+
+const sampleTVSeriesJSONLD = `{
+  "@type": "TVSeries",
+  "name": "Breaking Bad",
+  "description": "A chemistry teacher turned drug lord.",
+  "datePublished": "2008-01-20",
+  "duration": "PT49M",
+  "genre": ["Crime", "Drama", "Thriller"],
+  "director": {"@type": "Person", "name": "Vince Gilligan"},
+  "actor": [
+    {"@type": "Person", "name": "Bryan Cranston"},
+    {"@type": "Person", "name": "Aaron Paul"}
+  ],
+  "aggregateRating": {
+    "ratingValue": 9.5,
+    "ratingCount": 2100000
+  }
+}` // TVSeries fixture; its rating fields are JSON numbers and it has no "image" member
+
+// makeMovieDoc returns a mock title page for tt0111161 whose JSON-LD script is
+// sampleMovieJSONLD and whose box-office section holds a label span followed by
+// a "$"-prefixed amount span.
+func makeMovieDoc() *extractortest.MockDocument {
+	text := func(s string) *extractortest.MockNode { return &extractortest.MockNode{TextValue: s} }
+
+	return &extractortest.MockDocument{
+		URLValue: "https://www.imdb.com/title/tt0111161/",
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"script[type='application/ld+json']":             {text(sampleMovieJSONLD)},
+				"li[data-testid='title-boxoffice-section'] span": {text("Gross worldwide"), text("$58,300,000")},
+			},
+		},
+	}
+}
+
+// TestExtractMovie verifies every field extractMovie fills from the Shawshank
+// fixture: the JSON-LD values plus the box-office amount read from the DOM.
+func TestExtractMovie(t *testing.T) {
+	m, err := extractMovie(makeMovieDoc(), "tt0111161")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if got, want := m.ID, "tt0111161"; got != want {
+		t.Errorf("ID = %q, want %q", got, want)
+	}
+	if got, want := m.Title, "The Shawshank Redemption"; got != want {
+		t.Errorf("Title = %q, want %q", got, want)
+	}
+	if got := m.Year; got != 1994 {
+		t.Errorf("Year = %d, want 1994", got)
+	}
+	if got := m.Rating; got != 9.3 {
+		t.Errorf("Rating = %v, want 9.3", got)
+	}
+	if got := m.Votes; got != 2800000 {
+		t.Errorf("Votes = %d, want 2800000", got)
+	}
+	if got, want := m.Runtime, "2h 22m"; got != want {
+		t.Errorf("Runtime = %q, want %q", got, want)
+	}
+	if !(len(m.Genres) == 1 && m.Genres[0] == "Drama") {
+		t.Errorf("Genres = %v, want [Drama]", m.Genres)
+	}
+	if got, want := m.Director, "Frank Darabont"; got != want {
+		t.Errorf("Director = %q, want %q", got, want)
+	}
+	if got := len(m.Cast); got != 2 {
+		t.Fatalf("len(Cast) = %d, want 2", got)
+	}
+	if got, want := m.Cast[0], "Tim Robbins"; got != want {
+		t.Errorf("Cast[0] = %q, want %q", got, want)
+	}
+	if got := m.Plot; got != "Two imprisoned men bond over a number of years." {
+		t.Errorf("Plot = %q, want correct value", got)
+	}
+	if got, want := m.PosterURL, "https://example.com/shawshank.jpg"; got != want {
+		t.Errorf("PosterURL = %q, want %q", got, want)
+	}
+	if got, want := m.BoxOffice, "$58,300,000"; got != want {
+		t.Errorf("BoxOffice = %q, want %q", got, want)
+	}
+}
+
+// TestExtractMovie_TVSeries covers a TVSeries JSON-LD object whose rating fields are JSON numbers.
+func TestExtractMovie_TVSeries(t *testing.T) {
+	page := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"script[type='application/ld+json']": {
+					&extractortest.MockNode{TextValue: sampleTVSeriesJSONLD},
+				},
+			},
+		},
+	}
+	m, err := extractMovie(page, "tt0903747")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if got, want := m.Title, "Breaking Bad"; got != want {
+		t.Errorf("Title = %q, want %q", got, want)
+	}
+	if got := m.Year; got != 2008 {
+		t.Errorf("Year = %d, want 2008", got)
+	}
+	if got := m.Rating; got != 9.5 {
+		t.Errorf("Rating = %v, want 9.5", got)
+	}
+	if got := m.Votes; got != 2100000 {
+		t.Errorf("Votes = %d, want 2100000", got)
+	}
+	if got := len(m.Genres); got != 3 {
+		t.Errorf("len(Genres) = %d, want 3", got)
+	}
+	if got, want := m.Runtime, "49m"; got != want {
+		t.Errorf("Runtime = %q, want %q", got, want)
+	}
+}
+
+// TestExtractMovie_DOMFallback checks that, without any JSON-LD script, the title comes from the hero heading span.
+func TestExtractMovie_DOMFallback(t *testing.T) {
+	page := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"h1[data-testid='hero__pageTitle'] span": {
+					&extractortest.MockNode{TextValue: "Inception"},
+				},
+			},
+		},
+	}
+	m, err := extractMovie(page, "tt1375666")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if got, want := m.Title, "Inception"; got != want {
+		t.Errorf("Title = %q, want %q", got, want)
+	}
+}
+
+// TestExtractSearchResults parses two mocked result items and checks the title, ID and year read from them.
+func TestExtractSearchResults(t *testing.T) {
+	page := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{
+				"li.find-result-item": {
+					&extractortest.MockNode{
+						Children: map[string]extractor.Nodes{
+							"a.ipc-metadata-list-summary-item__t": {
+								&extractortest.MockNode{
+									TextValue: "The Matrix",
+									Attrs:     map[string]string{"href": "/title/tt0133093/"},
+								},
+							},
+							"span.ipc-metadata-list-summary-item__li": {
+								&extractortest.MockNode{TextValue: "1999"},
+							},
+						},
+					},
+					&extractortest.MockNode{
+						Children: map[string]extractor.Nodes{
+							"a.ipc-metadata-list-summary-item__t": {
+								&extractortest.MockNode{
+									TextValue: "The Matrix Reloaded",
+									Attrs:     map[string]string{"href": "/title/tt0234215/"},
+								},
+							},
+							"span.ipc-metadata-list-summary-item__li": {
+								&extractortest.MockNode{TextValue: "2003"},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+	results, err := extractSearchResults(page)
+	if err != nil {
+		t.Fatalf("extractSearchResults() error: %v", err)
+	}
+
+	if got := len(results); got != 2 {
+		t.Fatalf("len(results) = %d, want 2", got)
+	}
+
+	if got, want := results[0].Title, "The Matrix"; got != want {
+		t.Errorf("results[0].Title = %q, want %q", got, want)
+	}
+	if got, want := results[0].ID, "tt0133093"; got != want {
+		t.Errorf("results[0].ID = %q, want %q", got, want)
+	}
+	if got := results[0].Year; got != 1999 {
+		t.Errorf("results[0].Year = %d, want 1999", got)
+	}
+	if got, want := results[1].Title, "The Matrix Reloaded"; got != want {
+		t.Errorf("results[1].Title = %q, want %q", got, want)
+	}
+}
+
+// TestGetMovie_MockBrowser drives GetMovie end to end against a MockBrowser that
+// serves the Shawshank fixture under the URL titleURL builds for tt0111161.
+func TestGetMovie_MockBrowser(t *testing.T) {
+	browser := &extractortest.MockBrowser{
+		Documents: map[string]*extractortest.MockDocument{
+			"https://www.imdb.com/title/tt0111161/": makeMovieDoc(),
+		},
+	}
+
+	m, err := DefaultConfig.GetMovie(context.Background(), browser, "tt0111161")
+	if err != nil {
+		t.Fatalf("GetMovie() error: %v", err)
+	}
+
+	if got, want := m.Title, "The Shawshank Redemption"; got != want {
+		t.Errorf("Title = %q, want %q", got, want)
+	}
+	if got := m.Rating; got != 9.3 {
+		t.Errorf("Rating = %v, want 9.3", got)
+	}
+}
+
+// TestExtractMovie_Empty checks that a page with no matching nodes yields no error and zero-valued fields.
+func TestExtractMovie_Empty(t *testing.T) {
+	page := &extractortest.MockDocument{
+		MockNode: extractortest.MockNode{
+			Children: map[string]extractor.Nodes{},
+		},
+	}
+	m, err := extractMovie(page, "tt0000000")
+	if err != nil {
+		t.Fatalf("extractMovie() error: %v", err)
+	}
+
+	if !(m.Title == "" && m.Rating == 0) {
+		t.Error("expected zero values for empty doc")
+	}
+}
+
+// TestExtractTitleID checks ID extraction from title hrefs and that a name href yields no ID.
+func TestExtractTitleID(t *testing.T) {
+	cases := []struct {
+		href string
+		want string
+	}{
+		{"/title/tt0111161/", "tt0111161"},
+		{"/title/tt0133093/?ref_=fn_al_tt_1", "tt0133093"},
+		{"/name/nm0000151/", ""},
+	}
+
+	for _, tc := range cases {
+		if got := extractTitleID(tc.href); got != tc.want {
+			t.Errorf("extractTitleID(%q) = %q, want %q", tc.href, got, tc.want)
+		}
+	}
+}
+
+// TestExtractYear checks year detection in a date, a bare year, a parenthesised year and year-less text.
+func TestExtractYear(t *testing.T) {
+	cases := []struct {
+		text string
+		want int
+	}{
+		{"1994-10-14", 1994},
+		{"2008", 2008},
+		{"(1999)", 1999},
+		{"no year here", 0},
+	}
+
+	for _, tc := range cases {
+		if got := extractYear(tc.text); got != tc.want {
+			t.Errorf("extractYear(%q) = %d, want %d", tc.text, got, tc.want)
+		}
+	}
+}
+
+// TestFormatDuration checks hour+minute, minute-only, hour-only and empty ISO 8601 durations.
+func TestFormatDuration(t *testing.T) {
+	cases := []struct {
+		iso  string
+		want string
+	}{
+		{"PT2H22M", "2h 22m"},
+		{"PT49M", "49m"},
+		{"PT1H", "1h"},
+		{"", ""},
+	}
+
+	for _, tc := range cases {
+		if got := formatDuration(tc.iso); got != tc.want {
+			t.Errorf("formatDuration(%q) = %q, want %q", tc.iso, got, tc.want)
+		}
+	}
+}
+
+// TestTitleURL pins the URL built for a known title ID.
+func TestTitleURL(t *testing.T) {
+	const want = "https://www.imdb.com/title/tt0111161/"
+	if got := titleURL("tt0111161"); got != want {
+		t.Errorf("titleURL(\"tt0111161\") = %q, want %q", got, want)
+	}
+}
-- 
2.49.1