commit 0ef15167d5
28 changed files with 2789 additions and 0 deletions

pkg/scraper/scraper.go (new file, 669 lines)
@@ -0,0 +1,669 @@
package scraper

import (
    "bufio"
    "fmt"
    "net/http"
    "net/url"
    "regexp"
    "strings"
    "sync"
    "time"

    "github.com/PuerkitoBio/goquery"
)

var (
    urlRegexp    = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
    importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
)
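A minimal standalone sketch (not part of this commit) of what the two expressions above capture from a line of CSS; in both cases the referenced URL ends up in capture group 1:

package main

import (
    "fmt"
    "regexp"
)

var (
    urlRegexp    = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
    importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
)

func main() {
    line := `@import "reset.css"; body { background: url('/img/bg.png'); }`
    for _, m := range urlRegexp.FindAllStringSubmatch(line, -1) {
        fmt.Println("url():", m[1]) // /img/bg.png
    }
    for _, m := range importRegexp.FindAllStringSubmatch(line, -1) {
        fmt.Println("@import:", m[1]) // reset.css
    }
}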
// Result represents a URL check result
type Result struct {
    URL        string `json:"url"`
    SourceURL  string `json:"source_url,omitempty"`
    Status     int    `json:"status"`
    Error      string `json:"error,omitempty"`
    Type       string `json:"type"` // link, image, script, stylesheet, css-import
    IsExternal bool   `json:"is_external"`
}

// Results is a collection of Result
type Results struct {
    BaseURL   string   `json:"base_url"`
    Errors    []Result `json:"errors"`
    Successes []Result `json:"successes"`
    Total     int      `json:"total"`
}

// QueueItem represents a URL to be processed along with its source
type QueueItem struct {
    URL       string
    SourceURL string
    Depth     int
}

// Scraper handles website crawling and link checking
type Scraper struct {
    client       *http.Client
    concurrency  int
    depth        int
    verbose      bool
    internalOnly bool
    visitedURLs  map[string]bool   // URLs visited for crawling
    checkedURLs  map[string]Result // URLs already checked to avoid duplicate requests
    mu           sync.Mutex
}

// Option is a function option for the Scraper
type Option func(*Scraper)

// WithConcurrency sets the concurrency level
func WithConcurrency(concurrency int) Option {
    return func(s *Scraper) {
        s.concurrency = concurrency
    }
}

// WithDepth sets the maximum crawling depth
func WithDepth(depth int) Option {
    return func(s *Scraper) {
        s.depth = depth
    }
}

// WithTimeout sets the timeout for HTTP requests
func WithTimeout(timeoutSec int) Option {
    return func(s *Scraper) {
        s.client.Timeout = time.Duration(timeoutSec) * time.Second
    }
}

// WithVerbose enables verbose output
func WithVerbose(verbose bool) Option {
    return func(s *Scraper) {
        s.verbose = verbose
    }
}

// WithInternalOnly sets whether to only check internal links
func WithInternalOnly(internalOnly bool) Option {
    return func(s *Scraper) {
        s.internalOnly = internalOnly
    }
}

// New creates a new Scraper with the given options
func New(options ...Option) *Scraper {
    s := &Scraper{
        client: &http.Client{
            Timeout: 10 * time.Second,
            CheckRedirect: func(req *http.Request, via []*http.Request) error {
                if len(via) >= 10 {
                    return fmt.Errorf("too many redirects")
                }
                return nil
            },
        },
        concurrency: 10,
        depth:       3,
        visitedURLs: make(map[string]bool),
        checkedURLs: make(map[string]Result),
    }

    for _, option := range options {
        option(s)
    }

    return s
}
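A minimal usage sketch of the options above, assuming the package lives at git.nakama.town/fmartingr/dharma/pkg/scraper (an inference from the testutil import in the tests below); everything else uses only identifiers defined in this file:

package main

import (
    "fmt"
    "log"

    // Assumed import path, inferred from the repository layout.
    "git.nakama.town/fmartingr/dharma/pkg/scraper"
)

func main() {
    s := scraper.New(
        scraper.WithConcurrency(20),
        scraper.WithDepth(2),
        scraper.WithTimeout(15),
        scraper.WithInternalOnly(true),
        scraper.WithVerbose(true),
    )

    results, err := s.Scan("https://example.com")
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("checked %d URLs, %d broken\n", results.Total, len(results.Errors))
    for _, r := range results.Errors {
        fmt.Printf("%s (linked from %s): %s\n", r.URL, r.SourceURL, r.Error)
    }
}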
// Scan starts the website crawling process
func (s *Scraper) Scan(baseURL string) (*Results, error) {
    parsedURL, err := url.Parse(baseURL)
    if err != nil {
        return nil, fmt.Errorf("invalid URL: %w", err)
    }

    // Ensure the base URL has a scheme
    if parsedURL.Scheme == "" {
        parsedURL.Scheme = "https"
        baseURL = parsedURL.String()
    }

    // Store the base hostname for distinguishing internal vs external links
    baseHostname := parsedURL.Hostname()

    results := &Results{
        BaseURL: baseURL,
    }

    // Create a waitgroup to track active workers
    var wg sync.WaitGroup

    // Create a channel to communicate URLs to process
    queue := make(chan QueueItem, 1000)

    // Create a channel to track active URL processing
    activeCount := make(chan int, 1)
    activeCount <- 1 // Start with 1 active URL (the base URL)

    // Start worker pool
    for range s.concurrency {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for item := range queue {
                s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, queue, results, activeCount)
            }
        }()
    }

    // Initial URL to process - the source is itself for the initial URL
    queue <- QueueItem{
        URL:       baseURL,
        SourceURL: baseURL,
        Depth:     0,
    }

    // Monitor active count - when it reaches 0, we're done
    go func() {
        for {
            count := <-activeCount
            if count <= 0 {
                close(queue)
                return
            }
            activeCount <- count
        }
    }()

    // Wait for workers to finish
    wg.Wait()

    results.Total = len(results.Errors) + len(results.Successes)
    return results, nil
}
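A reduced, hypothetical sketch of the termination pattern Scan relies on: the single-slot activeCount channel doubles as a lock-protected counter of in-flight items, workers enqueue any children before decrementing, and a monitor goroutine closes the queue once the counter drains to zero:

package main

import (
    "fmt"
    "sync"
)

func main() {
    queue := make(chan int, 100)
    active := make(chan int, 1)

    var wg sync.WaitGroup
    for w := 0; w < 3; w++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for item := range queue {
                // An item may enqueue children before it is marked done.
                if item < 2 {
                    children := []int{item*2 + 1, item*2 + 2}
                    n := <-active
                    active <- n + len(children)
                    for _, c := range children {
                        queue <- c
                    }
                }
                // Mark this item done.
                n := <-active
                active <- n - 1
            }
        }()
    }

    active <- 1 // the root item is in flight
    queue <- 0

    // Monitor: close the queue when nothing remains in flight.
    go func() {
        for {
            n := <-active
            if n <= 0 {
                close(queue)
                return
            }
            active <- n
        }
    }()

    wg.Wait()
    fmt.Println("all items processed")
}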
// processURL processes a single URL
func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) {
    // Decrement active count when done
    defer func() {
        count := <-activeCount
        activeCount <- count - 1
    }()

    // Check if we've already visited this URL (for crawling) or exceeded max depth
    s.mu.Lock()
    if s.visitedURLs[currentURL] || depth > s.depth {
        s.mu.Unlock()
        return
    }
    s.visitedURLs[currentURL] = true

    // If we've already checked this URL's status, reuse the result
    if result, exists := s.checkedURLs[currentURL]; exists {
        // Always use the provided source URL
        result.SourceURL = sourceURL
        s.mu.Unlock()
        s.addResult(results, result)

        // We still need to parse HTML/CSS content even if we've checked the URL before
        // But only if it was successful
        if result.Error == "" && result.Status < 400 {
            // Continue with content parsing...
        } else {
            return
        }
    } else {
        s.mu.Unlock()
    }

    if s.verbose {
        fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL)
    }

    // Parse the current URL
    currentParsed, err := url.Parse(currentURL)
    if err != nil {
        result := Result{
            URL:        currentURL,
            SourceURL:  sourceURL,
            Error:      fmt.Sprintf("Invalid URL: %v", err),
            Type:       "link",
            IsExternal: false,
        }
        s.mu.Lock()
        s.checkedURLs[currentURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }

    // Determine if the URL is internal or external
    isExternal := currentParsed.Hostname() != baseHostname && currentParsed.Hostname() != ""

    // Skip external links processing if internalOnly is set
    if isExternal && s.internalOnly {
        return
    }

    // Process external links differently from internal links
    if isExternal {
        s.checkExternalURL(currentURL, sourceURL, results)
        return
    }

    // Internal URL, check and crawl
    resp, err := s.client.Get(currentURL)
    if err != nil {
        result := Result{
            URL:        currentURL,
            SourceURL:  sourceURL,
            Error:      err.Error(),
            Type:       "link",
            IsExternal: isExternal,
        }
        s.mu.Lock()
        s.checkedURLs[currentURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }
    defer resp.Body.Close()

    // Add the result
    var result Result
    if resp.StatusCode >= 400 {
        result = Result{
            URL:        currentURL,
            SourceURL:  sourceURL,
            Status:     resp.StatusCode,
            Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
            Type:       "link",
            IsExternal: isExternal,
        }
        s.mu.Lock()
        s.checkedURLs[currentURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    } else {
        result = Result{
            URL:        currentURL,
            SourceURL:  sourceURL,
            Status:     resp.StatusCode,
            Type:       "link",
            IsExternal: isExternal,
        }
        s.mu.Lock()
        s.checkedURLs[currentURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
    }

    // Only parse HTML and CSS from internal links
    contentType := resp.Header.Get("Content-Type")
    if strings.Contains(contentType, "text/html") {
        // Use the base hostname to create a base URL for this site
        baseURL := ""
        if currentParsed.Scheme != "" && currentParsed.Host != "" {
            baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
        }

        foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results)
        // Add all found URLs to the queue and increment active count
        if len(foundURLs) > 0 {
            count := <-activeCount
            count += len(foundURLs)
            activeCount <- count

            for _, url := range foundURLs {
                queue <- QueueItem{
                    URL:       url,
                    SourceURL: currentURL, // The source URL is the current page we're processing
                    Depth:     depth + 1,
                }
            }
        }
    } else if strings.Contains(contentType, "text/css") {
        // Use the base hostname to create a base URL for this site
        baseURL := ""
        if currentParsed.Scheme != "" && currentParsed.Host != "" {
            baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
        }

        s.parseCSS(currentURL, resp, baseURL, baseHostname, results)
    }
}

// parseHTML extracts links and other resources from HTML
func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string {
    foundURLs := []string{}

    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        s.addResult(results, Result{
            URL:        sourceURL,
            SourceURL:  sourceURL, // Use self as source for error
            Error:      fmt.Sprintf("Failed to parse HTML: %v", err),
            Status:     resp.StatusCode,
            Type:       "html",
            IsExternal: false,
        })
        return foundURLs
    }

    // Process links (a href)
    doc.Find("a").Each(func(i int, sel *goquery.Selection) {
        if href, exists := sel.Attr("href"); exists {
            targetURL := s.resolveURL(href, sourceURL)
            if targetURL != "" {
                linkParsed, err := url.Parse(targetURL)
                if err == nil {
                    isExternal := linkParsed.Hostname() != baseHostname && linkParsed.Hostname() != ""

                    // Only add internal links to the crawl queue
                    if !isExternal {
                        foundURLs = append(foundURLs, targetURL)
                    } else if !s.internalOnly {
                        // Check external links only if internalOnly is false
                        s.checkExternalURL(targetURL, sourceURL, results)
                    }
                }
            }
        }
    })

    // Process images
    doc.Find("img").Each(func(i int, sel *goquery.Selection) {
        if src, exists := sel.Attr("src"); exists {
            targetURL := s.resolveURL(src, sourceURL)
            if targetURL != "" {
                s.checkResource(targetURL, sourceURL, "image", baseHostname, results)
            }
        }
    })

    // Process stylesheets
    doc.Find("link[rel='stylesheet']").Each(func(i int, sel *goquery.Selection) {
        if href, exists := sel.Attr("href"); exists {
            targetURL := s.resolveURL(href, sourceURL)
            if targetURL != "" {
                s.checkResource(targetURL, sourceURL, "stylesheet", baseHostname, results)
            }
        }
    })

    // Process scripts
    doc.Find("script").Each(func(i int, sel *goquery.Selection) {
        if src, exists := sel.Attr("src"); exists {
            targetURL := s.resolveURL(src, sourceURL)
            if targetURL != "" {
                s.checkResource(targetURL, sourceURL, "script", baseHostname, results)
            }
        }
    })

    return foundURLs
}

// parseCSS extracts URLs from CSS content
func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) {
    // Simple regex-based parsing for CSS imports and url() references
    // This is a simplified approach; a proper CSS parser would be better
    // for production use
    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        line := scanner.Text()

        // Look for url() references
        urlMatches := urlRegexp.FindAllStringSubmatch(line, -1)
        for _, match := range urlMatches {
            if len(match) > 1 {
                // Remove quotes if present
                urlValue := strings.Trim(match[1], "'\"")
                targetURL := s.resolveURL(urlValue, sourceURL)
                if targetURL != "" {
                    s.checkResource(targetURL, sourceURL, "css-url", baseHostname, results)
                }
            }
        }

        // Look for @import statements
        importMatches := importRegexp.FindAllStringSubmatch(line, -1)
        for _, match := range importMatches {
            if len(match) > 1 {
                // Remove quotes if present
                urlValue := strings.Trim(match[1], "'\"")
                targetURL := s.resolveURL(urlValue, sourceURL)
                if targetURL != "" {
                    s.checkResource(targetURL, sourceURL, "css-import", baseHostname, results)
                }
            }
        }
    }
}

// resolveURL resolves a relative URL against a base URL
func (s *Scraper) resolveURL(href, sourceURL string) string {
    // Skip empty URLs, anchors, and javascript
    if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
        return ""
    }

    // Skip non-HTTP protocols like mailto:, tel:, etc.
    if strings.HasPrefix(href, "mailto:") ||
        strings.HasPrefix(href, "tel:") ||
        strings.HasPrefix(href, "sms:") ||
        strings.HasPrefix(href, "ftp:") ||
        strings.HasPrefix(href, "file:") {
        return ""
    }

    sourceParsed, err := url.Parse(sourceURL)
    if err != nil {
        return ""
    }

    targetParsed, err := url.Parse(href)
    if err != nil {
        return ""
    }

    // If the scheme is not HTTP/HTTPS, skip it
    if targetParsed.Scheme != "" &&
        targetParsed.Scheme != "http" &&
        targetParsed.Scheme != "https" {
        return ""
    }

    resolvedURL := sourceParsed.ResolveReference(targetParsed).String()
    return resolvedURL
}

// checkExternalURL performs a HEAD request to check external URLs
func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) {
    // Skip external links if internalOnly is set
    if s.internalOnly {
        return
    }

    // Check if URL was already checked
    s.mu.Lock()
    if result, exists := s.checkedURLs[targetURL]; exists {
        // Add the existing result with the current source URL
        result.SourceURL = sourceURL
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }
    s.mu.Unlock()

    req, err := http.NewRequest("HEAD", targetURL, nil)
    if err != nil {
        result := Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Error:      fmt.Sprintf("Invalid URL: %v", err),
            Type:       "external-link",
            IsExternal: true,
        }
        s.mu.Lock()
        s.checkedURLs[targetURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }

    resp, err := s.client.Do(req)
    if err != nil {
        result := Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Error:      err.Error(),
            Type:       "external-link",
            IsExternal: true,
        }
        s.mu.Lock()
        s.checkedURLs[targetURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }
    defer resp.Body.Close()

    var result Result
    if resp.StatusCode >= 400 {
        result = Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Status:     resp.StatusCode,
            Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
            Type:       "external-link",
            IsExternal: true,
        }
    } else {
        result = Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Status:     resp.StatusCode,
            Type:       "external-link",
            IsExternal: true,
        }
    }

    s.mu.Lock()
    s.checkedURLs[targetURL] = result
    s.mu.Unlock()
    s.addResult(results, result)
}

// checkResource checks if a resource URL is accessible
func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) {
    // Parse the target URL to determine if it's internal or external
    targetParsed, err := url.Parse(targetURL)
    if err != nil {
        result := Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Error:      fmt.Sprintf("Invalid URL: %v", err),
            Type:       resourceType,
            IsExternal: false,
        }
        s.mu.Lock()
        s.checkedURLs[targetURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }

    isExternal := targetParsed.Hostname() != baseHostname && targetParsed.Hostname() != ""

    // Skip external resources if internalOnly is set
    if isExternal && s.internalOnly {
        return
    }

    // Check if URL was already checked
    s.mu.Lock()
    if result, exists := s.checkedURLs[targetURL]; exists {
        // Update with current source and type if needed
        result.SourceURL = sourceURL
        result.Type = resourceType
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }
    s.mu.Unlock()

    req, err := http.NewRequest("HEAD", targetURL, nil)
    if err != nil {
        result := Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Error:      fmt.Sprintf("Invalid URL: %v", err),
            Type:       resourceType,
            IsExternal: isExternal,
        }
        s.mu.Lock()
        s.checkedURLs[targetURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }

    resp, err := s.client.Do(req)
    if err != nil {
        result := Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Error:      err.Error(),
            Type:       resourceType,
            IsExternal: isExternal,
        }
        s.mu.Lock()
        s.checkedURLs[targetURL] = result
        s.mu.Unlock()
        s.addResult(results, result)
        return
    }
    defer resp.Body.Close()

    var result Result
    if resp.StatusCode >= 400 {
        result = Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Status:     resp.StatusCode,
            Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
            Type:       resourceType,
            IsExternal: isExternal,
        }
    } else {
        result = Result{
            URL:        targetURL,
            SourceURL:  sourceURL,
            Status:     resp.StatusCode,
            Type:       resourceType,
            IsExternal: isExternal,
        }
    }

    s.mu.Lock()
    s.checkedURLs[targetURL] = result
    s.mu.Unlock()
    s.addResult(results, result)
}

// addResult adds a result to the appropriate list
func (s *Scraper) addResult(results *Results, result Result) {
    s.mu.Lock()
    defer s.mu.Unlock()

    if result.Error != "" {
        results.Errors = append(results.Errors, result)
    } else {
        results.Successes = append(results.Successes, result)
    }
}
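Because Result and Results carry json tags, a run can be serialized directly with encoding/json; a small hypothetical helper (not part of the commit) illustrating this:

package scraper

import (
    "encoding/json"
    "os"
)

// writeReport is a hypothetical helper showing how the json tags render:
// empty Error/SourceURL fields are omitted, the rest keep snake_case keys.
func writeReport(results *Results) error {
    enc := json.NewEncoder(os.Stdout)
    enc.SetIndent("", "  ")
    return enc.Encode(results)
}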
pkg/scraper/scraper_test.go (new file, 163 lines)
@@ -0,0 +1,163 @@
package scraper

import (
    "net/http"
    "net/http/httptest"
    "net/url"
    "testing"
)

func TestResolveURL(t *testing.T) {
    s := &Scraper{}

    tests := []struct {
        name       string
        href       string
        sourceURL  string
        baseURL    string
        wantResult string
    }{
        {
            name:       "Absolute URL",
            href:       "https://example.com/page.html",
            sourceURL:  "https://example.org/index.html",
            baseURL:    "https://example.org/",
            wantResult: "https://example.com/page.html",
        },
        {
            name:       "Relative URL",
            href:       "page.html",
            sourceURL:  "https://example.org/index.html",
            baseURL:    "https://example.org/",
            wantResult: "https://example.org/page.html",
        },
        {
            name:       "Anchor link",
            href:       "#section",
            sourceURL:  "https://example.org/index.html",
            baseURL:    "https://example.org/",
            wantResult: "",
        },
        {
            name:       "JavaScript link",
            href:       "javascript:void(0)",
            sourceURL:  "https://example.org/index.html",
            baseURL:    "https://example.org/",
            wantResult: "",
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            got := s.resolveURL(tt.href, tt.sourceURL)
            if got != tt.wantResult {
                t.Errorf("resolveURL() = %v, want %v", got, tt.wantResult)
            }
        })
    }
}

func TestNew(t *testing.T) {
    s := New(
        WithConcurrency(20),
        WithDepth(5),
        WithTimeout(30),
        WithVerbose(true),
    )

    if s.concurrency != 20 {
        t.Errorf("Expected concurrency to be 20, got %d", s.concurrency)
    }

    if s.depth != 5 {
        t.Errorf("Expected depth to be 5, got %d", s.depth)
    }

    if s.client.Timeout != 30*1000*1000*1000 { // 30 seconds in nanoseconds
        t.Errorf("Expected timeout to be 30s, got %v", s.client.Timeout)
    }

    if !s.verbose {
        t.Errorf("Expected verbose to be true")
    }
}

func TestAddResult(t *testing.T) {
    s := &Scraper{}
    results := &Results{}

    // Add an error result
    errorResult := Result{
        URL:   "https://example.com/error",
        Error: "Test error",
        Type:  "link",
    }
    s.addResult(results, errorResult)

    if len(results.Errors) != 1 {
        t.Errorf("Expected 1 error, got %d", len(results.Errors))
    }

    // Add a success result
    successResult := Result{
        URL:    "https://example.com/success",
        Status: 200,
        Type:   "link",
    }
    s.addResult(results, successResult)

    if len(results.Successes) != 1 {
        t.Errorf("Expected 1 success, got %d", len(results.Successes))
    }
}

func TestProcessURL(t *testing.T) {
    // Create a test server
    server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Content-Type", "text/html")
        if _, err := w.Write([]byte(`
            <!DOCTYPE html>
            <html>
            <head>
                <link rel="stylesheet" href="/style.css">
                <script src="/script.js"></script>
            </head>
            <body>
                <a href="/page1.html">Page 1</a>
                <a href="https://example.com">External</a>
                <img src="/image.jpg">
            </body>
            </html>
        `)); err != nil {
            t.Fatalf("Failed to write response: %v", err)
        }
    }))
    defer server.Close()

    s := New(WithDepth(1), WithConcurrency(1))
    results := &Results{}

    // Create a channel for QueueItems instead of strings
    queue := make(chan QueueItem, 10)

    // Create active count channel
    activeCount := make(chan int, 1)
    activeCount <- 1 // Start with one active URL

    // Parse the server URL to get the hostname
    serverURL, _ := url.Parse(server.URL)
    baseHostname := serverURL.Hostname()

    // Process the URL with the updated signature
    s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)

    // Check that we found at least one success (the main page)
    if len(results.Successes) < 1 {
        t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
    }

    // Check that we queued some URLs for processing
    if len(queue) < 1 {
        t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
    }
}
pkg/scraper/testsite_test.go (new file, 206 lines)
@@ -0,0 +1,206 @@
package scraper

import (
    "strings"
    "testing"

    "git.nakama.town/fmartingr/dharma/pkg/testutil"
)

func TestTestsiteIntegration(t *testing.T) {
    // Skip if running short tests
    if testing.Short() {
        t.Skip("Skipping integration test in short mode")
    }

    // Start the testsite server
    serverURL, cleanup, err := testutil.StartTestsiteServer()
    if err != nil {
        t.Fatalf("Failed to start test server: %v", err)
    }
    defer cleanup()

    // Create a new scraper with default settings
    s := New(
        WithConcurrency(2),
        WithDepth(3),
        WithTimeout(5),
        WithVerbose(false),
        WithInternalOnly(true),
    )

    // Run the scraper
    results, err := s.Scan(serverURL)
    if err != nil {
        t.Fatalf("Scraper.Scan failed: %v", err)
    }

    // Verify we have results
    if results == nil {
        t.Fatal("Expected results but got nil")
    }

    // Check that we have the correct base URL
    if results.BaseURL != serverURL {
        t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
    }

    // Test cases for specific URLs that should be found
    expectedURLs := map[string]bool{
        "/found.html":        true,
        "/not_found.html":    false,
        "/rel/index.html":    true,
        "/rel/relfound.html": true,
        "/static/style.css":  true,
        "/static/script.js":  true,
    }

    // Check for expected URLs in the results
    for urlPath, shouldExist := range expectedURLs {
        fullURL := serverURL + urlPath
        found := false

        // Look in both successes and errors
        for _, result := range results.Successes {
            if result.URL == fullURL {
                found = true
                if !shouldExist {
                    t.Errorf("URL %s should not exist but was found in successes", urlPath)
                }
                break
            }
        }

        if !found && shouldExist {
            // If not found in successes, check if it's in errors
            for _, result := range results.Errors {
                if result.URL == fullURL {
                    found = true
                    t.Errorf("URL %s should exist but was found in errors", urlPath)
                    break
                }
            }

            if !found {
                t.Errorf("Expected URL %s was not found in results", urlPath)
            }
        }
    }

    // Check that not_found.html is in errors
    notFoundURL := serverURL + "/not_found.html"
    foundInErrors := false
    for _, result := range results.Errors {
        if result.URL == notFoundURL {
            foundInErrors = true
            break
        }
    }

    if !foundInErrors {
        t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
    }

    // Verify relative links in the rel directory
    relNotFoundURL := serverURL + "/rel/rel_not_found.html"
    foundRelNotFound := false
    for _, result := range results.Errors {
        if result.URL == relNotFoundURL {
            foundRelNotFound = true
            break
        }
    }

    if !foundRelNotFound {
        t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
    }

    // Check for missing image
    missingImageURL := serverURL + "/rel/image-404.jpg"
    foundMissingImage := false
    for _, result := range results.Errors {
        if result.URL == missingImageURL {
            foundMissingImage = true
            break
        }
    }

    if !foundMissingImage {
        t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
    }

    // Check for external links
    externalLinkCount := 0
    for _, result := range results.Successes {
        if strings.Contains(result.URL, "fmartingr.com") {
            externalLinkCount++
        }
    }

    if externalLinkCount != 0 {
        t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
    }

    // Verify total count
    expectedTotal := len(results.Successes) + len(results.Errors)
    if results.Total != expectedTotal {
        t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
    }
}

func TestTestsiteWithExternalLinks(t *testing.T) {
    // Skip if running short tests
    if testing.Short() {
        t.Skip("Skipping integration test in short mode")
    }

    // Start the testsite server
    serverURL, cleanup, err := testutil.StartTestsiteServer()
    if err != nil {
        t.Fatalf("Failed to start test server: %v", err)
    }
    defer cleanup()

    // Create a new scraper with external links allowed
    s := New(
        WithConcurrency(2),
        WithDepth(1), // Lower depth for external links test
        WithTimeout(5),
        WithVerbose(false),
        WithInternalOnly(false), // Allow external links
    )

    // Run the scraper
    results, err := s.Scan(serverURL)
    if err != nil {
        t.Fatalf("Scraper.Scan failed: %v", err)
    }

    // Check for external links - we should find at least one external link
    foundExternalLinks := false
    brokenExternalLinks := false

    for _, result := range results.Successes {
        if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
            foundExternalLinks = true
            break
        }
    }

    for _, result := range results.Errors {
        if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
            brokenExternalLinks = true
            break
        }
    }

    // We don't actually hit external URLs in tests, so we can't assert on them,
    // but we can check that they're properly identified
    if !foundExternalLinks {
        t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
    }

    if !brokenExternalLinks {
        t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
    }
}