This commit is contained in:
commit
0ef15167d5
28 changed files with 2789 additions and 0 deletions
163
pkg/scraper/scraper_test.go
Normal file
163
pkg/scraper/scraper_test.go
Normal file
|
@ -0,0 +1,163 @@
|
|||
package scraper
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestResolveURL(t *testing.T) {
|
||||
s := &Scraper{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
href string
|
||||
sourceURL string
|
||||
baseURL string
|
||||
wantResult string
|
||||
}{
|
||||
{
|
||||
name: "Absolute URL",
|
||||
href: "https://example.com/page.html",
|
||||
sourceURL: "https://example.org/index.html",
|
||||
baseURL: "https://example.org/",
|
||||
wantResult: "https://example.com/page.html",
|
||||
},
|
||||
{
|
||||
name: "Relative URL",
|
||||
href: "page.html",
|
||||
sourceURL: "https://example.org/index.html",
|
||||
baseURL: "https://example.org/",
|
||||
wantResult: "https://example.org/page.html",
|
||||
},
|
||||
{
|
||||
name: "Anchor link",
|
||||
href: "#section",
|
||||
sourceURL: "https://example.org/index.html",
|
||||
baseURL: "https://example.org/",
|
||||
wantResult: "",
|
||||
},
|
||||
{
|
||||
name: "JavaScript link",
|
||||
href: "javascript:void(0)",
|
||||
sourceURL: "https://example.org/index.html",
|
||||
baseURL: "https://example.org/",
|
||||
wantResult: "",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := s.resolveURL(tt.href, tt.sourceURL)
|
||||
if got != tt.wantResult {
|
||||
t.Errorf("resolveURL() = %v, want %v", got, tt.wantResult)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
s := New(
|
||||
WithConcurrency(20),
|
||||
WithDepth(5),
|
||||
WithTimeout(30),
|
||||
WithVerbose(true),
|
||||
)
|
||||
|
||||
if s.concurrency != 20 {
|
||||
t.Errorf("Expected concurrency to be 20, got %d", s.concurrency)
|
||||
}
|
||||
|
||||
if s.depth != 5 {
|
||||
t.Errorf("Expected depth to be 5, got %d", s.depth)
|
||||
}
|
||||
|
||||
if s.client.Timeout != 30*1000*1000*1000 { // 30 seconds in nanoseconds
|
||||
t.Errorf("Expected timeout to be 30s, got %v", s.client.Timeout)
|
||||
}
|
||||
|
||||
if !s.verbose {
|
||||
t.Errorf("Expected verbose to be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAddResult(t *testing.T) {
|
||||
s := &Scraper{}
|
||||
results := &Results{}
|
||||
|
||||
// Add an error result
|
||||
errorResult := Result{
|
||||
URL: "https://example.com/error",
|
||||
Error: "Test error",
|
||||
Type: "link",
|
||||
}
|
||||
s.addResult(results, errorResult)
|
||||
|
||||
if len(results.Errors) != 1 {
|
||||
t.Errorf("Expected 1 error, got %d", len(results.Errors))
|
||||
}
|
||||
|
||||
// Add a success result
|
||||
successResult := Result{
|
||||
URL: "https://example.com/success",
|
||||
Status: 200,
|
||||
Type: "link",
|
||||
}
|
||||
s.addResult(results, successResult)
|
||||
|
||||
if len(results.Successes) != 1 {
|
||||
t.Errorf("Expected 1 success, got %d", len(results.Successes))
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessURL(t *testing.T) {
|
||||
// Create a test server
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
if _, err := w.Write([]byte(`
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<link rel="stylesheet" href="/style.css">
|
||||
<script src="/script.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<a href="/page1.html">Page 1</a>
|
||||
<a href="https://example.com">External</a>
|
||||
<img src="/image.jpg">
|
||||
</body>
|
||||
</html>
|
||||
`)); err != nil {
|
||||
t.Fatalf("Failed to write response: %v", err)
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
s := New(WithDepth(1), WithConcurrency(1))
|
||||
results := &Results{}
|
||||
|
||||
// Create a channel for QueueItems instead of strings
|
||||
queue := make(chan QueueItem, 10)
|
||||
|
||||
// Create active count channel
|
||||
activeCount := make(chan int, 1)
|
||||
activeCount <- 1 // Start with one active URL
|
||||
|
||||
// Parse the server URL to get the hostname
|
||||
serverURL, _ := url.Parse(server.URL)
|
||||
baseHostname := serverURL.Hostname()
|
||||
|
||||
// Process the URL with the updated signature
|
||||
s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)
|
||||
|
||||
// Check that we found at least one success (the main page)
|
||||
if len(results.Successes) < 1 {
|
||||
t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
|
||||
}
|
||||
|
||||
// Check that we queued some URLs for processing
|
||||
if len(queue) < 1 {
|
||||
t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue