163 lines
3.8 KiB
Go
163 lines
3.8 KiB
Go
package scraper
|
|
|
|
import (
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"net/url"
|
|
"testing"
|
|
)
|
|
|
|
func TestResolveURL(t *testing.T) {
|
|
s := &Scraper{}
|
|
|
|
tests := []struct {
|
|
name string
|
|
href string
|
|
sourceURL string
|
|
baseURL string
|
|
wantResult string
|
|
}{
|
|
{
|
|
name: "Absolute URL",
|
|
href: "https://example.com/page.html",
|
|
sourceURL: "https://example.org/index.html",
|
|
baseURL: "https://example.org/",
|
|
wantResult: "https://example.com/page.html",
|
|
},
|
|
{
|
|
name: "Relative URL",
|
|
href: "page.html",
|
|
sourceURL: "https://example.org/index.html",
|
|
baseURL: "https://example.org/",
|
|
wantResult: "https://example.org/page.html",
|
|
},
|
|
{
|
|
name: "Anchor link",
|
|
href: "#section",
|
|
sourceURL: "https://example.org/index.html",
|
|
baseURL: "https://example.org/",
|
|
wantResult: "",
|
|
},
|
|
{
|
|
name: "JavaScript link",
|
|
href: "javascript:void(0)",
|
|
sourceURL: "https://example.org/index.html",
|
|
baseURL: "https://example.org/",
|
|
wantResult: "",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
got := s.resolveURL(tt.href, tt.sourceURL)
|
|
if got != tt.wantResult {
|
|
t.Errorf("resolveURL() = %v, want %v", got, tt.wantResult)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestNew(t *testing.T) {
|
|
s := New(
|
|
WithConcurrency(20),
|
|
WithDepth(5),
|
|
WithTimeout(30),
|
|
WithVerbose(true),
|
|
)
|
|
|
|
if s.concurrency != 20 {
|
|
t.Errorf("Expected concurrency to be 20, got %d", s.concurrency)
|
|
}
|
|
|
|
if s.depth != 5 {
|
|
t.Errorf("Expected depth to be 5, got %d", s.depth)
|
|
}
|
|
|
|
if s.client.Timeout != 30*1000*1000*1000 { // 30 seconds in nanoseconds
|
|
t.Errorf("Expected timeout to be 30s, got %v", s.client.Timeout)
|
|
}
|
|
|
|
if !s.verbose {
|
|
t.Errorf("Expected verbose to be true")
|
|
}
|
|
}
|
|
|
|
func TestAddResult(t *testing.T) {
|
|
s := &Scraper{}
|
|
results := &Results{}
|
|
|
|
// Add an error result
|
|
errorResult := Result{
|
|
URL: "https://example.com/error",
|
|
Error: "Test error",
|
|
Type: "link",
|
|
}
|
|
s.addResult(results, errorResult)
|
|
|
|
if len(results.Errors) != 1 {
|
|
t.Errorf("Expected 1 error, got %d", len(results.Errors))
|
|
}
|
|
|
|
// Add a success result
|
|
successResult := Result{
|
|
URL: "https://example.com/success",
|
|
Status: 200,
|
|
Type: "link",
|
|
}
|
|
s.addResult(results, successResult)
|
|
|
|
if len(results.Successes) != 1 {
|
|
t.Errorf("Expected 1 success, got %d", len(results.Successes))
|
|
}
|
|
}
|
|
|
|
func TestProcessURL(t *testing.T) {
|
|
// Create a test server
|
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
if _, err := w.Write([]byte(`
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<link rel="stylesheet" href="/style.css">
|
|
<script src="/script.js"></script>
|
|
</head>
|
|
<body>
|
|
<a href="/page1.html">Page 1</a>
|
|
<a href="https://example.com">External</a>
|
|
<img src="/image.jpg">
|
|
</body>
|
|
</html>
|
|
`)); err != nil {
|
|
t.Fatalf("Failed to write response: %v", err)
|
|
}
|
|
}))
|
|
defer server.Close()
|
|
|
|
s := New(WithDepth(1), WithConcurrency(1))
|
|
results := &Results{}
|
|
|
|
// Create a channel for QueueItems instead of strings
|
|
queue := make(chan QueueItem, 10)
|
|
|
|
// Create active count channel
|
|
activeCount := make(chan int, 1)
|
|
activeCount <- 1 // Start with one active URL
|
|
|
|
// Parse the server URL to get the hostname
|
|
serverURL, _ := url.Parse(server.URL)
|
|
baseHostname := serverURL.Hostname()
|
|
|
|
// Process the URL with the updated signature
|
|
s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)
|
|
|
|
// Check that we found at least one success (the main page)
|
|
if len(results.Successes) < 1 {
|
|
t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
|
|
}
|
|
|
|
// Check that we queued some URLs for processing
|
|
if len(queue) < 1 {
|
|
t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
|
|
}
|
|
}
|