dharma/pkg/scraper/scraper_test.go
Felipe M. 0ef15167d5
All checks were successful
ci/woodpecker/tag/release Pipeline was successful
initial release
2025-05-04 10:49:50 +02:00

163 lines
3.8 KiB
Go

package scraper
import (
"net/http"
"net/http/httptest"
"net/url"
"testing"
)
// TestResolveURL runs Scraper.resolveURL over a table of hrefs, each paired
// with the URL of the page it was found on, and compares the returned string
// with the expected one. The anchor-only and javascript: cases expect an
// empty result.
func TestResolveURL(t *testing.T) {
	s := &Scraper{}

	// Each case carries exactly what resolveURL is called with (the href and
	// the source page URL) plus the expected return value.
	tests := []struct {
		name       string
		href       string
		sourceURL  string
		wantResult string
	}{
		{
			name:       "Absolute URL",
			href:       "https://example.com/page.html",
			sourceURL:  "https://example.org/index.html",
			wantResult: "https://example.com/page.html",
		},
		{
			name:       "Relative URL",
			href:       "page.html",
			sourceURL:  "https://example.org/index.html",
			wantResult: "https://example.org/page.html",
		},
		{
			name:       "Anchor link",
			href:       "#section",
			sourceURL:  "https://example.org/index.html",
			wantResult: "",
		},
		{
			name:       "JavaScript link",
			href:       "javascript:void(0)",
			sourceURL:  "https://example.org/index.html",
			wantResult: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := s.resolveURL(tt.href, tt.sourceURL)
			if got != tt.wantResult {
				// %q makes an empty result visible and shows the inputs that
				// produced the mismatch.
				t.Errorf("resolveURL(%q, %q) = %q, want %q", tt.href, tt.sourceURL, got, tt.wantResult)
			}
		})
	}
}
// TestNew builds a Scraper through New with the concurrency, depth, timeout
// and verbose options, then verifies that each option is reflected in the
// corresponding field of the returned value.
func TestNew(t *testing.T) {
	// 30 seconds in nanoseconds; compared against the client timeout below.
	const thirtySeconds = 30 * 1000 * 1000 * 1000

	scr := New(
		WithConcurrency(20),
		WithDepth(5),
		WithTimeout(30),
		WithVerbose(true),
	)

	if got := scr.concurrency; got != 20 {
		t.Errorf("Expected concurrency to be 20, got %d", got)
	}

	if got := scr.depth; got != 5 {
		t.Errorf("Expected depth to be 5, got %d", got)
	}

	if got := scr.client.Timeout; got != thirtySeconds {
		t.Errorf("Expected timeout to be 30s, got %v", got)
	}

	if !scr.verbose {
		t.Errorf("Expected verbose to be true")
	}
}
// TestAddResult passes two Result values to Scraper.addResult, one with an
// Error set and one with Status 200, and checks after each call that
// Results.Errors and Results.Successes respectively hold exactly one entry.
func TestAddResult(t *testing.T) {
	var (
		scr = &Scraper{}
		res = &Results{}
	)

	// First a result that carries an error message.
	scr.addResult(res, Result{
		URL:   "https://example.com/error",
		Error: "Test error",
		Type:  "link",
	})
	if n := len(res.Errors); n != 1 {
		t.Errorf("Expected 1 error, got %d", n)
	}

	// Then a result with a 200 status and no error.
	scr.addResult(res, Result{
		URL:    "https://example.com/success",
		Status: 200,
		Type:   "link",
	})
	if n := len(res.Successes); n != 1 {
		t.Errorf("Expected 1 success, got %d", n)
	}
}
// TestProcessURL serves a small HTML page from an httptest server and calls
// Scraper.processURL on it once at depth 0. Afterwards it checks that at least
// one success was recorded in the results and that at least one item is
// waiting on the queue channel.
func TestProcessURL(t *testing.T) {
	// Page returned for every request: it references a stylesheet, a script,
	// an internal link, an external link and an image.
	const page = `
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="/style.css">
<script src="/script.js"></script>
</head>
<body>
<a href="/page1.html">Page 1</a>
<a href="https://example.com">External</a>
<img src="/image.jpg">
</body>
</html>
`

	// Create a test server
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		if _, err := w.Write([]byte(page)); err != nil {
			// The handler runs on a server goroutine, not the test goroutine,
			// so FailNow-based helpers (t.Fatalf) must not be used here.
			// t.Errorf marks the test as failed and is safe from any goroutine.
			t.Errorf("Failed to write response: %v", err)
		}
	}))
	defer server.Close()

	s := New(WithDepth(1), WithConcurrency(1))
	results := &Results{}

	// Channel of QueueItems; buffered so sends do not block even though this
	// test never starts a receiver.
	queue := make(chan QueueItem, 10)

	// Active count channel, seeded with one active URL.
	activeCount := make(chan int, 1)
	activeCount <- 1

	// Parse the server URL to get the hostname. A parse failure would leave
	// serverURL nil, so fail the test here instead of panicking below.
	serverURL, err := url.Parse(server.URL)
	if err != nil {
		t.Fatalf("Failed to parse test server URL %q: %v", server.URL, err)
	}
	baseHostname := serverURL.Hostname()

	// The server URL is passed as both of the first two arguments.
	s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)

	// Check that we found at least one success (the main page)
	if len(results.Successes) < 1 {
		t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
	}

	// Check that we queued some URLs for processing
	if len(queue) < 1 {
		t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
	}
}