// go-extractor/readability_test.go

package extractor

import (
	"context"
	"testing"
)

// TestReadability_ValidHTML runs Readability over a small, well-formed HTML
// page and checks that the returned article carries the page's title and a
// non-empty text body.
func TestReadability_ValidHTML(t *testing.T) {
	const wantTitle = "Test Article"

	// The fixture is deliberately wordy; its own text explains that the
	// extraction needs a reasonable amount of content to treat it as an article.
	const page = `<!DOCTYPE html>
<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1>Test Article</h1>
<p>This is a test article with enough content to be parsed by readability.
It needs to have a reasonable amount of text so the algorithm considers it
a valid article. Let us add several sentences to make sure this works
correctly. The readability library requires a minimum amount of content
to successfully extract an article from a page.</p>
<p>Here is another paragraph to add more content. We want to make sure
that the content is substantial enough for the readability algorithm to
consider this a valid article and extract the text properly.</p>
</article>
</body>
</html>`

	input := mockDocument{
		content: page,
		url:     "https://example.com/article",
	}

	got, err := Readability(context.Background(), input)
	if err != nil {
		// Nothing else can be checked without a result, so stop here.
		t.Fatalf("Readability() error = %v", err)
	}

	if got.Title != wantTitle {
		t.Errorf("Title = %q, want %q", got.Title, wantTitle)
	}
	if got.TextContent == "" {
		t.Error("TextContent should not be empty")
	}
}

// TestReadability_EmptyContent verifies that running Readability on a document
// whose content is the empty string does not return an error and yields an
// article with neither a title nor any text content.
func TestReadability_EmptyContent(t *testing.T) {
	doc := mockDocument{
		url:     "https://example.com/empty",
		content: "",
	}

	article, err := Readability(context.Background(), doc)
	if err != nil {
		t.Fatalf("Readability() unexpected error = %v", err)
	}
	// Empty content should produce an empty article. The article counts as
	// non-empty as soon as either field is populated, so the fields are
	// combined with || rather than &&.
	if article.Title != "" || article.TextContent != "" {
		t.Error("expected empty article from empty content")
	}
}

// TestReadability_InvalidURL checks that Readability reports an error when the
// document's URL is the malformed string "://invalid", even though the HTML
// content itself is well-formed.
func TestReadability_InvalidURL(t *testing.T) {
	badDoc := mockDocument{
		content: "<html><body><p>text</p></body></html>",
		url:     "://invalid",
	}

	// Only the error matters here; the returned article is discarded.
	if _, err := Readability(context.Background(), badDoc); err == nil {
		t.Error("Readability() expected error for invalid URL, got nil")
	}
}