package extractor

import (
	"context"
	"strings"
	"testing"
)

// readabilityArticleHTML is a complete page with a <title>, a matching <h1>
// and two substantial paragraphs. It is shared by the tests that only need
// "some extractable article" as input.
const readabilityArticleHTML = `<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1>Test Article</h1>
<p>This is a test article with enough content to be parsed by readability. It needs to have a reasonable amount of text so the algorithm considers it a valid article. Let us add several sentences to make sure this works correctly. The readability library requires a minimum amount of content to successfully extract an article from a page.</p>
<p>Here is another paragraph to add more content. We want to make sure that the content is substantial enough for the readability algorithm to consider this a valid article and extract the text properly.</p>
</article>
</body>
</html>`

// TestReadability_ValidHTML checks that Readability extracts the title and a
// non-empty text body from a well-formed article page.
func TestReadability_ValidHTML(t *testing.T) {
	doc := mockDocument{
		url:     "https://example.com/article",
		content: readabilityArticleHTML,
	}

	article, err := Readability(context.Background(), doc)
	if err != nil {
		t.Fatalf("Readability() error = %v", err)
	}

	if article.Title != "Test Article" {
		t.Errorf("Title = %q, want %q", article.Title, "Test Article")
	}
	if article.TextContent == "" {
		t.Error("TextContent should not be empty")
	}
}

// TestReadability_EmptyContent checks that an empty document is not an error
// and yields an article whose Title and TextContent are both empty.
func TestReadability_EmptyContent(t *testing.T) {
	doc := mockDocument{
		url:     "https://example.com/empty",
		content: "",
	}

	article, err := Readability(context.Background(), doc)
	if err != nil {
		t.Fatalf("Readability() unexpected error = %v", err)
	}

	// Empty content should produce an empty article. Each field is checked
	// on its own: a combined `Title != "" && TextContent != ""` condition
	// would let an article with only one populated field slip through.
	if article.Title != "" {
		t.Errorf("Title = %q, want empty for empty content", article.Title)
	}
	if article.TextContent != "" {
		t.Errorf("TextContent = %q, want empty for empty content", article.TextContent)
	}
}

// TestReadability_InvalidURL checks that Readability reports an error when the
// document URL cannot be parsed.
func TestReadability_InvalidURL(t *testing.T) {
	doc := mockDocument{
		url: "://invalid",
		// Kept on one line: an interpreted ("...") string literal must not
		// contain raw newlines.
		content: "<html><body><p>text</p></body></html>",
	}

	_, err := Readability(context.Background(), doc)
	if err == nil {
		t.Error("Readability() expected error for invalid URL, got nil")
	}
}

// TestReadabilityWithOptions_RemoveSelectors checks that elements matching
// RemoveSelectors are absent from the extracted text while the main article
// body is still present.
func TestReadabilityWithOptions_RemoveSelectors(t *testing.T) {
	// The page holds the main article plus one element for each selector
	// passed below, so every "should not contain" assertion has something
	// real to fail on.
	page := `<html>
<head><title>Main Article</title></head>
<body>
<article>
<h1>Main Article</h1>
<p>This is the main article content that we want to extract properly. It contains several sentences about the main topic of interest. The readability algorithm should pick this up as the primary content of the page without any interference from other elements.</p>
<p>Here is a second paragraph with more relevant content about the main topic. This paragraph adds depth and detail to the article.</p>
</article>
<div class="infinite-scroll">
<article>
<h2>Unrelated Article</h2>
<p>This is content from an unrelated article loaded via infinite scroll. It should not appear in the extracted content because we will remove it using the RemoveSelectors option before readability extraction.</p>
</article>
</div>
<aside class="sidebar"><p>Sidebar content</p></aside>
</body>
</html>`

	doc := mockDocument{
		url:     "https://example.com/article",
		content: page,
	}

	article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{
		RemoveSelectors: []string{".infinite-scroll", ".sidebar"},
	})
	if err != nil {
		t.Fatalf("ReadabilityWithOptions() error = %v", err)
	}

	// Nothing below is meaningful without extracted text, so stop here.
	if article.TextContent == "" {
		t.Fatal("TextContent should not be empty")
	}

	if strings.Contains(article.TextContent, "Unrelated Article") {
		t.Error("TextContent should not contain content from removed .infinite-scroll element")
	}
	if strings.Contains(article.TextContent, "Sidebar content") {
		t.Error("TextContent should not contain content from removed .sidebar element")
	}
	if !strings.Contains(article.TextContent, "main article content") {
		t.Error("TextContent should still contain the main article content")
	}
}

// TestReadabilityWithOptions_NoSelectors checks that ReadabilityWithOptions
// with zero-value options still extracts the title and text of an article.
func TestReadabilityWithOptions_NoSelectors(t *testing.T) {
	doc := mockDocument{
		url:     "https://example.com/article",
		content: readabilityArticleHTML,
	}

	// With empty options, should behave identically to Readability().
	article, err := ReadabilityWithOptions(context.Background(), doc, ReadabilityOptions{})
	if err != nil {
		t.Fatalf("ReadabilityWithOptions() error = %v", err)
	}

	if article.Title != "Test Article" {
		t.Errorf("Title = %q, want %q", article.Title, "Test Article")
	}
	if article.TextContent == "" {
		t.Error("TextContent should not be empty")
	}
}

// TestRemoveSelectors checks that removeSelectors drops the element matching a
// single selector and leaves its sibling in place.
func TestRemoveSelectors(t *testing.T) {
	page := `<html><body>
<div class="keep">Keep this</div>
<div class="remove">Remove this</div>
</body></html>`

	result, err := removeSelectors(page, []string{".remove"})
	if err != nil {
		t.Fatalf("removeSelectors() error = %v", err)
	}

	if strings.Contains(result, "Remove this") {
		t.Error("result should not contain removed element content")
	}
	if !strings.Contains(result, "Keep this") {
		t.Error("result should still contain kept element content")
	}
}

// TestRemoveSelectors_MultipleSelectors checks that removeSelectors applies
// every selector in the list and keeps elements that match none of them.
func TestRemoveSelectors_MultipleSelectors(t *testing.T) {
	// No whitespace inside the tags: the assertions look for the exact
	// ">X<" byte sequence in the returned markup.
	page := `<html><body>
<div class="a">A</div>
<div class="b">B</div>
<div class="c">C</div>
</body></html>`

	result, err := removeSelectors(page, []string{".a", ".c"})
	if err != nil {
		t.Fatalf("removeSelectors() error = %v", err)
	}

	if strings.Contains(result, ">A<") {
		t.Error("result should not contain element .a")
	}
	if strings.Contains(result, ">C<") {
		t.Error("result should not contain element .c")
	}
	if !strings.Contains(result, ">B<") {
		t.Error("result should still contain element .b")
	}
}