feature: add IMDB movie/TV extractor
Add sites/imdb package with GetMovie() and Search() methods. Extracts title, year, rating, votes, runtime, genres, director, cast, plot, poster, and box office data. Uses JSON-LD parsing with a DOM fallback. Supports the Movie, TVSeries, and TVMiniSeries types.

Closes #30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
331
sites/imdb/imdb_test.go
Normal file
331
sites/imdb/imdb_test.go
Normal file
@@ -0,0 +1,331 @@
|
||||
package imdb
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
||||
"gitea.stevedudenhoeffer.com/steve/go-extractor/extractortest"
|
||||
)
|
||||
|
||||
// sampleMovieJSONLD is a JSON-LD fixture describing a schema.org Movie.
// Its aggregateRating values are deliberately encoded as JSON strings
// ("9.3", "2800000"), in contrast to sampleTVSeriesJSONLD, which uses
// JSON numbers for the same fields.
const sampleMovieJSONLD = `{
	"@type": "Movie",
	"name": "The Shawshank Redemption",
	"description": "Two imprisoned men bond over a number of years.",
	"datePublished": "1994-10-14",
	"image": "https://example.com/shawshank.jpg",
	"duration": "PT2H22M",
	"genre": ["Drama"],
	"director": {"@type": "Person", "name": "Frank Darabont"},
	"actor": [
		{"@type": "Person", "name": "Tim Robbins"},
		{"@type": "Person", "name": "Morgan Freeman"}
	],
	"aggregateRating": {
		"ratingValue": "9.3",
		"ratingCount": "2800000"
	}
}`
|
||||
|
||||
// sampleTVSeriesJSONLD is a JSON-LD fixture describing a schema.org
// TVSeries. Its aggregateRating values are JSON numbers (9.5, 2100000)
// rather than strings, and it carries no "image" field.
const sampleTVSeriesJSONLD = `{
	"@type": "TVSeries",
	"name": "Breaking Bad",
	"description": "A chemistry teacher turned drug lord.",
	"datePublished": "2008-01-20",
	"duration": "PT49M",
	"genre": ["Crime", "Drama", "Thriller"],
	"director": {"@type": "Person", "name": "Vince Gilligan"},
	"actor": [
		{"@type": "Person", "name": "Bryan Cranston"},
		{"@type": "Person", "name": "Aaron Paul"}
	],
	"aggregateRating": {
		"ratingValue": 9.5,
		"ratingCount": 2100000
	}
}`
|
||||
|
||||
func makeMovieDoc() *extractortest.MockDocument {
|
||||
return &extractortest.MockDocument{
|
||||
URLValue: "https://www.imdb.com/title/tt0111161/",
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
"script[type='application/ld+json']": {
|
||||
&extractortest.MockNode{TextValue: sampleMovieJSONLD},
|
||||
},
|
||||
"li[data-testid='title-boxoffice-section'] span": {
|
||||
&extractortest.MockNode{TextValue: "Gross worldwide"},
|
||||
&extractortest.MockNode{TextValue: "$58,300,000"},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMovie(t *testing.T) {
|
||||
doc := makeMovieDoc()
|
||||
|
||||
m, err := extractMovie(doc, "tt0111161")
|
||||
if err != nil {
|
||||
t.Fatalf("extractMovie() error: %v", err)
|
||||
}
|
||||
|
||||
if m.ID != "tt0111161" {
|
||||
t.Errorf("ID = %q, want %q", m.ID, "tt0111161")
|
||||
}
|
||||
if m.Title != "The Shawshank Redemption" {
|
||||
t.Errorf("Title = %q, want %q", m.Title, "The Shawshank Redemption")
|
||||
}
|
||||
if m.Year != 1994 {
|
||||
t.Errorf("Year = %d, want 1994", m.Year)
|
||||
}
|
||||
if m.Rating != 9.3 {
|
||||
t.Errorf("Rating = %v, want 9.3", m.Rating)
|
||||
}
|
||||
if m.Votes != 2800000 {
|
||||
t.Errorf("Votes = %d, want 2800000", m.Votes)
|
||||
}
|
||||
if m.Runtime != "2h 22m" {
|
||||
t.Errorf("Runtime = %q, want %q", m.Runtime, "2h 22m")
|
||||
}
|
||||
if len(m.Genres) != 1 || m.Genres[0] != "Drama" {
|
||||
t.Errorf("Genres = %v, want [Drama]", m.Genres)
|
||||
}
|
||||
if m.Director != "Frank Darabont" {
|
||||
t.Errorf("Director = %q, want %q", m.Director, "Frank Darabont")
|
||||
}
|
||||
if len(m.Cast) != 2 {
|
||||
t.Fatalf("len(Cast) = %d, want 2", len(m.Cast))
|
||||
}
|
||||
if m.Cast[0] != "Tim Robbins" {
|
||||
t.Errorf("Cast[0] = %q, want %q", m.Cast[0], "Tim Robbins")
|
||||
}
|
||||
if m.Plot != "Two imprisoned men bond over a number of years." {
|
||||
t.Errorf("Plot = %q, want correct value", m.Plot)
|
||||
}
|
||||
if m.PosterURL != "https://example.com/shawshank.jpg" {
|
||||
t.Errorf("PosterURL = %q, want %q", m.PosterURL, "https://example.com/shawshank.jpg")
|
||||
}
|
||||
if m.BoxOffice != "$58,300,000" {
|
||||
t.Errorf("BoxOffice = %q, want %q", m.BoxOffice, "$58,300,000")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMovie_TVSeries(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
"script[type='application/ld+json']": {
|
||||
&extractortest.MockNode{TextValue: sampleTVSeriesJSONLD},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m, err := extractMovie(doc, "tt0903747")
|
||||
if err != nil {
|
||||
t.Fatalf("extractMovie() error: %v", err)
|
||||
}
|
||||
|
||||
if m.Title != "Breaking Bad" {
|
||||
t.Errorf("Title = %q, want %q", m.Title, "Breaking Bad")
|
||||
}
|
||||
if m.Year != 2008 {
|
||||
t.Errorf("Year = %d, want 2008", m.Year)
|
||||
}
|
||||
if m.Rating != 9.5 {
|
||||
t.Errorf("Rating = %v, want 9.5", m.Rating)
|
||||
}
|
||||
if m.Votes != 2100000 {
|
||||
t.Errorf("Votes = %d, want 2100000", m.Votes)
|
||||
}
|
||||
if len(m.Genres) != 3 {
|
||||
t.Errorf("len(Genres) = %d, want 3", len(m.Genres))
|
||||
}
|
||||
if m.Runtime != "49m" {
|
||||
t.Errorf("Runtime = %q, want %q", m.Runtime, "49m")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMovie_DOMFallback(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
"h1[data-testid='hero__pageTitle'] span": {
|
||||
&extractortest.MockNode{TextValue: "Inception"},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m, err := extractMovie(doc, "tt1375666")
|
||||
if err != nil {
|
||||
t.Fatalf("extractMovie() error: %v", err)
|
||||
}
|
||||
|
||||
if m.Title != "Inception" {
|
||||
t.Errorf("Title = %q, want %q", m.Title, "Inception")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractSearchResults(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
"li.find-result-item": {
|
||||
&extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
"a.ipc-metadata-list-summary-item__t": {
|
||||
&extractortest.MockNode{
|
||||
TextValue: "The Matrix",
|
||||
Attrs: map[string]string{"href": "/title/tt0133093/"},
|
||||
},
|
||||
},
|
||||
"span.ipc-metadata-list-summary-item__li": {
|
||||
&extractortest.MockNode{TextValue: "1999"},
|
||||
},
|
||||
},
|
||||
},
|
||||
&extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{
|
||||
"a.ipc-metadata-list-summary-item__t": {
|
||||
&extractortest.MockNode{
|
||||
TextValue: "The Matrix Reloaded",
|
||||
Attrs: map[string]string{"href": "/title/tt0234215/"},
|
||||
},
|
||||
},
|
||||
"span.ipc-metadata-list-summary-item__li": {
|
||||
&extractortest.MockNode{TextValue: "2003"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
results, err := extractSearchResults(doc)
|
||||
if err != nil {
|
||||
t.Fatalf("extractSearchResults() error: %v", err)
|
||||
}
|
||||
|
||||
if len(results) != 2 {
|
||||
t.Fatalf("len(results) = %d, want 2", len(results))
|
||||
}
|
||||
|
||||
if results[0].Title != "The Matrix" {
|
||||
t.Errorf("results[0].Title = %q, want %q", results[0].Title, "The Matrix")
|
||||
}
|
||||
if results[0].ID != "tt0133093" {
|
||||
t.Errorf("results[0].ID = %q, want %q", results[0].ID, "tt0133093")
|
||||
}
|
||||
if results[0].Year != 1999 {
|
||||
t.Errorf("results[0].Year = %d, want 1999", results[0].Year)
|
||||
}
|
||||
if results[1].Title != "The Matrix Reloaded" {
|
||||
t.Errorf("results[1].Title = %q, want %q", results[1].Title, "The Matrix Reloaded")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMovie_MockBrowser(t *testing.T) {
|
||||
doc := makeMovieDoc()
|
||||
|
||||
browser := &extractortest.MockBrowser{
|
||||
Documents: map[string]*extractortest.MockDocument{
|
||||
"https://www.imdb.com/title/tt0111161/": doc,
|
||||
},
|
||||
}
|
||||
|
||||
m, err := DefaultConfig.GetMovie(context.Background(), browser, "tt0111161")
|
||||
if err != nil {
|
||||
t.Fatalf("GetMovie() error: %v", err)
|
||||
}
|
||||
|
||||
if m.Title != "The Shawshank Redemption" {
|
||||
t.Errorf("Title = %q, want %q", m.Title, "The Shawshank Redemption")
|
||||
}
|
||||
if m.Rating != 9.3 {
|
||||
t.Errorf("Rating = %v, want 9.3", m.Rating)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMovie_Empty(t *testing.T) {
|
||||
doc := &extractortest.MockDocument{
|
||||
MockNode: extractortest.MockNode{
|
||||
Children: map[string]extractor.Nodes{},
|
||||
},
|
||||
}
|
||||
|
||||
m, err := extractMovie(doc, "tt0000000")
|
||||
if err != nil {
|
||||
t.Fatalf("extractMovie() error: %v", err)
|
||||
}
|
||||
|
||||
if m.Title != "" || m.Rating != 0 {
|
||||
t.Error("expected zero values for empty doc")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractTitleID(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"/title/tt0111161/", "tt0111161"},
|
||||
{"/title/tt0133093/?ref_=fn_al_tt_1", "tt0133093"},
|
||||
{"/name/nm0000151/", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := extractTitleID(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("extractTitleID(%q) = %q, want %q", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractYear(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want int
|
||||
}{
|
||||
{"1994-10-14", 1994},
|
||||
{"2008", 2008},
|
||||
{"(1999)", 1999},
|
||||
{"no year here", 0},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := extractYear(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("extractYear(%q) = %d, want %d", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatDuration(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"PT2H22M", "2h 22m"},
|
||||
{"PT49M", "49m"},
|
||||
{"PT1H", "1h"},
|
||||
{"", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := formatDuration(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("formatDuration(%q) = %q, want %q", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTitleURL(t *testing.T) {
|
||||
got := titleURL("tt0111161")
|
||||
want := "https://www.imdb.com/title/tt0111161/"
|
||||
if got != want {
|
||||
t.Errorf("titleURL(\"tt0111161\") = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user