initial release

2025-05-04 10:40:26 +02:00 · 2025-05-04 10:40:26 +02:00 · 0ef15167d5
commit 0ef15167d5
28 changed files with 2789 additions and 0 deletions
--- a/pkg/cli/root.go
+++ b/pkg/cli/root.go
@ -0,0 +1,77 @@
+package cli
+
+import (
+	"fmt"
+	"os"
+
+	"git.nakama.town/fmartingr/dharma/pkg/reporter"
+	"git.nakama.town/fmartingr/dharma/pkg/scraper"
+	"github.com/spf13/cobra"
+)
+
+var (
+	format       string
+	concurrency  int
+	depth        int
+	timeout      int
+	verbose      bool
+	internalOnly bool
+)
+
+// rootCmd represents the base command when called without any subcommands
+var rootCmd = &cobra.Command{
+	Use:   "dharma [URL]",
+	Short: "Scrape websites and check for broken links and references",
+	Long: `Dharma is a website link checker tool that crawls a website to find broken links,
+images, CSS references, and more. It generates a report of all issues found.`,
+	Args: cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		url := args[0]
+
+		// Only print status message when using pretty format
+		if format == "pretty" {
+			fmt.Printf("Scanning website: %s\n", url)
+		} else {
+			// Force verbose off for non-pretty formats
+			verbose = false
+		}
+
+		// Create a new scraper
+		s := scraper.New(
+			scraper.WithConcurrency(concurrency),
+			scraper.WithDepth(depth),
+			scraper.WithTimeout(timeout),
+			scraper.WithVerbose(verbose),
+			scraper.WithInternalOnly(internalOnly),
+		)
+
+		// Run the scraper
+		results, err := s.Scan(url)
+		if err != nil {
+			return err
+		}
+
+		// Generate report
+		r, err := reporter.New(format)
+		if err != nil {
+			return err
+		}
+
+		return r.Generate(results, os.Stdout)
+	},
+}
+
+// Execute adds all child commands to the root command and sets flags appropriately.
+// This is called by main.main(). It only needs to happen once to the rootCmd.
+func Execute() error {
+	return rootCmd.Execute()
+}
+
+func init() {
+	rootCmd.Flags().StringVarP(&format, "format", "f", "pretty", "Output format (pretty, json, csv)")
+	rootCmd.Flags().IntVarP(&concurrency, "concurrency", "c", 10, "Number of concurrent requests")
+	rootCmd.Flags().IntVarP(&depth, "depth", "d", 3, "Maximum depth to crawl")
+	rootCmd.Flags().IntVarP(&timeout, "timeout", "t", 10, "Timeout in seconds for each request")
+	rootCmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output")
+	rootCmd.Flags().BoolVarP(&internalOnly, "internal-only", "i", false, "Only check internal links (same hostname)")
+}
--- a/pkg/cli/testsite_test.go
+++ b/pkg/cli/testsite_test.go
@ -0,0 +1,196 @@
+package cli
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+
+	"git.nakama.town/fmartingr/dharma/pkg/reporter"
+	"git.nakama.town/fmartingr/dharma/pkg/scraper"
+	"git.nakama.town/fmartingr/dharma/pkg/testutil"
+	"github.com/spf13/cobra"
+)
+
+func TestIntegrationTestsite(t *testing.T) {
+	// Skip this test if running in CI environment or if it's a short test run
+	if testing.Short() {
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	// Start the testsite server
+	serverURL, cleanup, err := testutil.StartTestsiteServer()
+	if err != nil {
+		t.Fatalf("Failed to start test server: %v", err)
+	}
+	defer cleanup()
+
+	// Test cases for CLI invocation with different flags
+	testCases := []struct {
+		name       string
+		args       []string
+		wantOutput bool
+		wantErrors bool
+	}{
+		{
+			name:       "Basic scan",
+			args:       []string{serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+		{
+			name:       "JSON output",
+			args:       []string{"--format", "json", serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+		{
+			name:       "Internal links only",
+			args:       []string{"--internal-only", serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+		{
+			name:       "Custom depth",
+			args:       []string{"--depth", "1", serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+		{
+			name:       "Custom concurrency",
+			args:       []string{"--concurrency", "1", serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+		{
+			name:       "Custom timeout",
+			args:       []string{"--timeout", "5", serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+		{
+			name:       "CSV output",
+			args:       []string{"--format", "csv", serverURL},
+			wantOutput: true,
+			wantErrors: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Temporarily replace stdout to capture output
+			oldStdout := os.Stdout
+			r, w, _ := os.Pipe()
+			os.Stdout = w
+
+			// Save and restore rootCmd so state doesn't leak between test cases
+			origRootCmd := rootCmd
+			defer func() {
+				rootCmd = origRootCmd
+				os.Stdout = oldStdout
+			}()
+
+			// Reset rootCmd for this test
+			rootCmd = createRootCmd()
+
+			// Set the command line arguments
+			os.Args = append([]string{"dharma"}, tc.args...)
+
+			// Execute the CLI
+			err := Execute()
+			if err != nil {
+				t.Fatalf("Failed to execute command: %v", err)
+			}
+
+			// Restore stdout and read the output
+			w.Close()
+			var buf bytes.Buffer
+			_, err = buf.ReadFrom(r)
+			if err != nil {
+				t.Fatalf("Failed to read output: %v", err)
+			}
+			output := buf.String()
+
+			// Check if we got any output
+			if tc.wantOutput && output == "" {
+				t.Errorf("Expected output but got none")
+			}
+
+			// Check if errors were reported for known broken links
+			if tc.wantErrors {
+				if !strings.Contains(output, "not_found.html") {
+					t.Errorf("Expected not_found.html to be reported as broken in output")
+				}
+			}
+
+			// Specific checks for different formats
+			if strings.Contains(tc.name, "JSON") {
+				if !strings.Contains(output, `"url":`) {
+					t.Errorf("Expected JSON output with 'url' field")
+				}
+			} else if strings.Contains(tc.name, "CSV") {
+				if !strings.Contains(output, "Status,Type,URL,Source URL,Error") {
+					t.Errorf("Expected CSV header in output")
+				}
+			}
+		})
+	}
+}
+
+// createRootCmd returns a fresh instance of the root command
+func createRootCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "dharma [URL]",
+		Short: "Scrape websites and check for broken links and references",
+		Long:  `Dharma is a website link checker tool that crawls a website to find broken links, images, CSS references, and more. It generates a report of all issues found.`,
+		Args:  cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			url := args[0]
+
+			// Only print status message when using pretty format
+			if format == "pretty" {
+				if !strings.HasPrefix(url, "file://") {
+					fmt.Printf("Scanning website: %s\n", url)
+				} else {
+					fmt.Printf("Scanning local directory: %s\n", strings.TrimPrefix(url, "file://"))
+				}
+			} else {
+				// Force verbose off for non-pretty formats
+				verbose = false
+			}
+
+			// Create a new scraper
+			s := scraper.New(
+				scraper.WithConcurrency(concurrency),
+				scraper.WithDepth(depth),
+				scraper.WithTimeout(timeout),
+				scraper.WithVerbose(verbose),
+				scraper.WithInternalOnly(internalOnly),
+			)
+
+			// Run the scraper
+			results, err := s.Scan(url)
+			if err != nil {
+				return err
+			}
+
+			// Generate report
+			r, err := reporter.New(format)
+			if err != nil {
+				return err
+			}
+
+			return r.Generate(results, os.Stdout)
+		},
+	}
+
+	cmd.Flags().StringVarP(&format, "format", "f", "pretty", "Output format (pretty, json, csv)")
+	cmd.Flags().IntVarP(&concurrency, "concurrency", "c", 10, "Number of concurrent requests")
+	cmd.Flags().IntVarP(&depth, "depth", "d", 3, "Maximum depth to crawl")
+	cmd.Flags().IntVarP(&timeout, "timeout", "t", 10, "Timeout in seconds for each request")
+	cmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output")
+	cmd.Flags().BoolVarP(&internalOnly, "internal-only", "i", false, "Only check internal links (same hostname)")
+
+	return cmd
+}
--- a/pkg/reporter/reporter.go
+++ b/pkg/reporter/reporter.go
@ -0,0 +1,192 @@
+package reporter
+
+import (
+	"encoding/csv"
+	"encoding/json"
+	"fmt"
+	"io"
+	"strings"
+	"time"
+
+	"git.nakama.town/fmartingr/dharma/pkg/scraper"
+	"github.com/fatih/color"
+)
+
+// Reporter is an interface for report generators
+type Reporter interface {
+	Generate(results *scraper.Results, writer io.Writer) error
+}
+
+// New creates a new reporter based on the format
+func New(format string) (Reporter, error) {
+	switch strings.ToLower(format) {
+	case "pretty":
+		return &PrettyReporter{}, nil
+	case "json":
+		return &JSONReporter{}, nil
+	case "csv":
+		return &CSVReporter{}, nil
+	default:
+		return nil, fmt.Errorf("unsupported format: %s", format)
+	}
+}
+
+// PrettyReporter generates a human-readable report for terminal
+type PrettyReporter struct{}
+
+// Generate generates a pretty report
+func (r *PrettyReporter) Generate(results *scraper.Results, writer io.Writer) error {
+	red := color.New(color.FgRed).SprintFunc()
+	green := color.New(color.FgGreen).SprintFunc()
+	yellow := color.New(color.FgYellow).SprintFunc()
+	blue := color.New(color.FgBlue).SprintFunc()
+	cyan := color.New(color.FgCyan).SprintFunc()
+
+	// Count internal vs external links
+	countInternalSuccess := 0
+	countInternalErrors := 0
+	countExternalSuccess := 0
+	countExternalErrors := 0
+
+	for _, result := range results.Successes {
+		if result.IsExternal {
+			countExternalSuccess++
+		} else {
+			countInternalSuccess++
+		}
+	}
+
+	for _, result := range results.Errors {
+		if result.IsExternal {
+			countExternalErrors++
+		} else {
+			countInternalErrors++
+		}
+	}
+
+	fmt.Fprintf(writer, "Website scan report for: %s\n", blue(results.BaseURL))
+	fmt.Fprintf(writer, "Scanned at: %s\n", time.Now().Format(time.RFC1123))
+	fmt.Fprintf(writer, "Total resources checked: %d\n", results.Total)
+	fmt.Fprintf(writer, "Success: %s, Errors: %s\n",
+		green(len(results.Successes)),
+		red(len(results.Errors)))
+	fmt.Fprintf(writer, "Internal links: %s success, %s errors\n",
+		green(countInternalSuccess),
+		red(countInternalErrors))
+	fmt.Fprintf(writer, "External links: %s success, %s errors\n\n",
+		green(countExternalSuccess),
+		red(countExternalErrors))
+
+	if len(results.Errors) == 0 {
+		fmt.Fprintf(writer, "%s No errors found!\n", green("✓"))
+		return nil
+	}
+
+	// Group errors by internal/external
+	internalErrors := []scraper.Result{}
+	externalErrors := []scraper.Result{}
+
+	for _, result := range results.Errors {
+		if result.IsExternal {
+			externalErrors = append(externalErrors, result)
+		} else {
+			internalErrors = append(internalErrors, result)
+		}
+	}
+
+	// Print internal errors first if we have any
+	if len(internalErrors) > 0 {
+		fmt.Fprintln(writer, "Errors found:")
+
+		for _, result := range internalErrors {
+			status := fmt.Sprintf("%d", result.Status)
+			if result.Status == 0 {
+				status = "ERR"
+			}
+
+			fmt.Fprintf(writer, "%-6s  (%-10s) %s [from: %s]\n",
+				red(status),
+				yellow(result.Type),
+				result.URL,
+				result.SourceURL,
+			)
+		}
+	}
+
+	// Print external errors if we have any
+	if len(externalErrors) > 0 {
+		if len(internalErrors) > 0 {
+			fmt.Fprintln(writer, "")
+		}
+		fmt.Fprintln(writer, "External Errors:")
+		fmt.Fprintln(writer, strings.Repeat("-", 80))
+		fmt.Fprintf(writer, "%-6s | %-10s | %s | %s\n", "Status", "Type", "URL", "Source")
+		fmt.Fprintln(writer, strings.Repeat("-", 80))
+
+		for _, result := range externalErrors {
+			status := fmt.Sprintf("%d", result.Status)
+			if result.Status == 0 {
+				status = "ERR"
+			}
+
+			fmt.Fprintf(writer, "%-6s | %-10s | %s | %s\n",
+				red(status),
+				cyan(result.Type),
+				result.URL,
+				result.SourceURL,
+			)
+		}
+	}
+
+	return nil
+}
+
+// JSONReporter generates a JSON report
+type JSONReporter struct{}
+
+// Generate generates a JSON report
+func (r *JSONReporter) Generate(results *scraper.Results, writer io.Writer) error {
+	return json.NewEncoder(writer).Encode(results)
+}
+
+// CSVReporter generates a CSV report
+type CSVReporter struct{}
+
+// Generate generates a CSV report
+func (r *CSVReporter) Generate(results *scraper.Results, writer io.Writer) error {
+	csvWriter := csv.NewWriter(writer)
+	defer csvWriter.Flush()
+
+	// Write header
+	if err := csvWriter.Write([]string{"Status", "Type", "URL", "Source URL", "Error"}); err != nil {
+		return err
+	}
+
+	// Write errors
+	for _, result := range results.Errors {
+		status := fmt.Sprintf("%d", result.Status)
+		if result.Status == 0 {
+			status = "ERROR"
+		}
+
+		if err := csvWriter.Write([]string{
+			status,
+			result.Type,
+			result.URL,
+			result.SourceURL,
+			result.Error,
+		}); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Helper function to truncate strings
+func truncate(s string, maxLen int) string {
+	if len(s) <= maxLen {
+		return s
+	}
+	return s[:maxLen-3] + "..."
+}
--- a/pkg/reporter/reporter_test.go
+++ b/pkg/reporter/reporter_test.go
@ -0,0 +1,206 @@
+package reporter
+
+import (
+	"bytes"
+	"encoding/json"
+	"strings"
+	"testing"
+
+	"git.nakama.town/fmartingr/dharma/pkg/scraper"
+)
+
+func TestNew(t *testing.T) {
+	tests := []struct {
+		name    string
+		format  string
+		wantErr bool
+	}{
+		{
+			name:    "Pretty format",
+			format:  "pretty",
+			wantErr: false,
+		},
+		{
+			name:    "JSON format",
+			format:  "json",
+			wantErr: false,
+		},
+		{
+			name:    "CSV format",
+			format:  "csv",
+			wantErr: false,
+		},
+		{
+			name:    "Unsupported format",
+			format:  "xml",
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := New(tt.format)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("New() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !tt.wantErr && got == nil {
+				t.Errorf("New() = nil, want non-nil")
+			}
+		})
+	}
+}
+
+func TestJSONReporter_Generate(t *testing.T) {
+	// Create test results
+	results := &scraper.Results{
+		BaseURL: "https://example.com",
+		Errors: []scraper.Result{
+			{
+				URL:       "https://example.com/error",
+				SourceURL: "https://example.com",
+				Status:    404,
+				Error:     "HTTP Error: 404 Not Found",
+				Type:      "link",
+			},
+		},
+		Successes: []scraper.Result{
+			{
+				URL:       "https://example.com/success",
+				SourceURL: "https://example.com",
+				Status:    200,
+				Type:      "link",
+			},
+		},
+		Total: 2,
+	}
+
+	// Create reporter and buffer
+	reporter := &JSONReporter{}
+	buf := &bytes.Buffer{}
+
+	// Generate report
+	if err := reporter.Generate(results, buf); err != nil {
+		t.Fatalf("Generate() error = %v", err)
+	}
+
+	// Parse output
+	var output scraper.Results
+	if err := json.Unmarshal(buf.Bytes(), &output); err != nil {
+		t.Fatalf("Failed to parse JSON: %v", err)
+	}
+
+	// Verify output
+	if output.BaseURL != results.BaseURL {
+		t.Errorf("BaseURL = %v, want %v", output.BaseURL, results.BaseURL)
+	}
+	if len(output.Errors) != len(results.Errors) {
+		t.Errorf("Errors count = %v, want %v", len(output.Errors), len(results.Errors))
+	}
+	if len(output.Successes) != len(results.Successes) {
+		t.Errorf("Successes count = %v, want %v", len(output.Successes), len(results.Successes))
+	}
+	if output.Total != results.Total {
+		t.Errorf("Total = %v, want %v", output.Total, results.Total)
+	}
+}
+
+func TestCSVReporter_Generate(t *testing.T) {
+	// Create test results
+	results := &scraper.Results{
+		BaseURL: "https://example.com",
+		Errors: []scraper.Result{
+			{
+				URL:       "https://example.com/error",
+				SourceURL: "https://example.com",
+				Status:    404,
+				Error:     "HTTP Error: 404 Not Found",
+				Type:      "link",
+			},
+		},
+		Successes: []scraper.Result{},
+		Total:     1,
+	}
+
+	// Create reporter and buffer
+	reporter := &CSVReporter{}
+	buf := &bytes.Buffer{}
+
+	// Generate report
+	if err := reporter.Generate(results, buf); err != nil {
+		t.Fatalf("Generate() error = %v", err)
+	}
+
+	// Verify output
+	lines := strings.Split(strings.TrimSpace(buf.String()), "\n")
+	if len(lines) != 2 { // Header + 1 error
+		t.Errorf("Expected 2 lines, got %d", len(lines))
+	}
+
+	// Check header
+	expectedHeader := "Status,Type,URL,Source URL,Error"
+	if lines[0] != expectedHeader {
+		t.Errorf("Header = %v, want %v", lines[0], expectedHeader)
+	}
+}
+
+func TestPrettyReporter_Generate(t *testing.T) {
+	// Test with errors
+	results := &scraper.Results{
+		BaseURL: "https://example.com",
+		Errors: []scraper.Result{
+			{
+				URL:       "https://example.com/error",
+				SourceURL: "https://example.com",
+				Status:    404,
+				Error:     "HTTP Error: 404 Not Found",
+				Type:      "link",
+			},
+		},
+		Successes: []scraper.Result{
+			{
+				URL:       "https://example.com/success",
+				SourceURL: "https://example.com",
+				Status:    200,
+				Type:      "link",
+			},
+		},
+		Total: 2,
+	}
+
+	// Create reporter and buffer
+	reporter := &PrettyReporter{}
+	buf := &bytes.Buffer{}
+
+	// Generate report
+	if err := reporter.Generate(results, buf); err != nil {
+		t.Fatalf("Generate() error = %v", err)
+	}
+
+	// Check that output contains key sections
+	output := buf.String()
+	if !strings.Contains(output, "Website scan report for") {
+		t.Error("Output doesn't contain report title")
+	}
+	if !strings.Contains(output, "Internal Errors:") {
+		t.Error("Output doesn't contain errors section")
+	}
+
+	// Test with no errors
+	results = &scraper.Results{
+		BaseURL:   "https://example.com",
+		Errors:    []scraper.Result{},
+		Successes: []scraper.Result{},
+		Total:     0,
+	}
+
+	buf = &bytes.Buffer{}
+	if err := reporter.Generate(results, buf); err != nil {
+		t.Fatalf("Generate() error = %v", err)
+	}
+
+	output = buf.String()
+	if !strings.Contains(output, "No errors found") {
+		t.Error("Output doesn't contain 'No errors found' message")
+	}
+}
--- a/pkg/scraper/scraper.go
+++ b/pkg/scraper/scraper.go
@ -0,0 +1,669 @@
+package scraper
+
+import (
+	"bufio"
+	"fmt"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+var (
+	urlRegexp    = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
+	importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
+)
+
+// Result represents a URL check result
+type Result struct {
+	URL        string `json:"url"`
+	SourceURL  string `json:"source_url,omitempty"`
+	Status     int    `json:"status"`
+	Error      string `json:"error,omitempty"`
+	Type       string `json:"type"` // link, image, script, stylesheet, css-import
+	IsExternal bool   `json:"is_external"`
+}
+
+// Results is a collection of Result
+type Results struct {
+	BaseURL   string   `json:"base_url"`
+	Errors    []Result `json:"errors"`
+	Successes []Result `json:"successes"`
+	Total     int      `json:"total"`
+}
+
+// QueueItem represents a URL to be processed along with its source
+type QueueItem struct {
+	URL       string
+	SourceURL string
+	Depth     int
+}
+
+// Scraper handles website crawling and link checking
+type Scraper struct {
+	client       *http.Client
+	concurrency  int
+	depth        int
+	verbose      bool
+	internalOnly bool
+	visitedURLs  map[string]bool   // URLs visited for crawling
+	checkedURLs  map[string]Result // URLs already checked to avoid duplicate requests
+	mu           sync.Mutex
+}
+
+// Option is a function option for the Scraper
+type Option func(*Scraper)
+
+// WithConcurrency sets the concurrency level
+func WithConcurrency(concurrency int) Option {
+	return func(s *Scraper) {
+		s.concurrency = concurrency
+	}
+}
+
+// WithDepth sets the maximum crawling depth
+func WithDepth(depth int) Option {
+	return func(s *Scraper) {
+		s.depth = depth
+	}
+}
+
+// WithTimeout sets the timeout for HTTP requests
+func WithTimeout(timeoutSec int) Option {
+	return func(s *Scraper) {
+		s.client.Timeout = time.Duration(timeoutSec) * time.Second
+	}
+}
+
+// WithVerbose enables verbose output
+func WithVerbose(verbose bool) Option {
+	return func(s *Scraper) {
+		s.verbose = verbose
+	}
+}
+
+// WithInternalOnly sets whether to only check internal links
+func WithInternalOnly(internalOnly bool) Option {
+	return func(s *Scraper) {
+		s.internalOnly = internalOnly
+	}
+}
+
+// New creates a new Scraper with the given options
+func New(options ...Option) *Scraper {
+	s := &Scraper{
+		client: &http.Client{
+			Timeout: 10 * time.Second,
+			CheckRedirect: func(req *http.Request, via []*http.Request) error {
+				if len(via) >= 10 {
+					return fmt.Errorf("too many redirects")
+				}
+				return nil
+			},
+		},
+		concurrency: 10,
+		depth:       3,
+		visitedURLs: make(map[string]bool),
+		checkedURLs: make(map[string]Result),
+	}
+
+	for _, option := range options {
+		option(s)
+	}
+
+	return s
+}
+
+// Scan starts the website crawling process
+func (s *Scraper) Scan(baseURL string) (*Results, error) {
+	parsedURL, err := url.Parse(baseURL)
+	if err != nil {
+		return nil, fmt.Errorf("invalid URL: %w", err)
+	}
+
+	// Ensure the base URL has a scheme
+	if parsedURL.Scheme == "" {
+		parsedURL.Scheme = "https"
+		baseURL = parsedURL.String()
+	}
+
+	// Store the base hostname for distinguishing internal vs external links
+	baseHostname := parsedURL.Hostname()
+
+	results := &Results{
+		BaseURL: baseURL,
+	}
+
+	// Create a waitgroup to track active workers
+	var wg sync.WaitGroup
+
+	// Create a channel to communicate URLs to process
+	queue := make(chan QueueItem, 1000)
+
+	// Create a channel to track active URL processing
+	activeCount := make(chan int, 1)
+	activeCount <- 1 // Start with 1 active URL (the base URL)
+
+	// Start worker pool
+	for range s.concurrency {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for item := range queue {
+				s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, queue, results, activeCount)
+			}
+		}()
+	}
+
+	// Initial URL to process - the source is itself for the initial URL
+	queue <- QueueItem{
+		URL:       baseURL,
+		SourceURL: baseURL,
+		Depth:     0,
+	}
+
+	// Monitor active count - when it reaches 0, we're done
+	go func() {
+		for {
+			count := <-activeCount
+			if count <= 0 {
+				close(queue)
+				return
+			}
+			activeCount <- count
+		}
+	}()
+
+	// Wait for workers to finish
+	wg.Wait()
+
+	results.Total = len(results.Errors) + len(results.Successes)
+	return results, nil
+}
+
+// processURL processes a single URL
+func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) {
+	// Decrement active count when done
+	defer func() {
+		count := <-activeCount
+		activeCount <- count - 1
+	}()
+
+	// Check if we've already visited this URL (for crawling) or exceeded max depth
+	s.mu.Lock()
+	if s.visitedURLs[currentURL] || depth > s.depth {
+		s.mu.Unlock()
+		return
+	}
+	s.visitedURLs[currentURL] = true
+
+	// If we've already checked this URL's status, reuse the result
+	if result, exists := s.checkedURLs[currentURL]; exists {
+		// Always use the provided source URL
+		result.SourceURL = sourceURL
+		s.mu.Unlock()
+		s.addResult(results, result)
+
+		// We still need to parse HTML/CSS content even if we've checked the URL before
+		// But only if it was successful
+		if result.Error == "" && result.Status < 400 {
+			// Continue with content parsing...
+		} else {
+			return
+		}
+	} else {
+		s.mu.Unlock()
+	}
+
+	if s.verbose {
+		fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL)
+	}
+
+	// Parse the current URL
+	currentParsed, err := url.Parse(currentURL)
+	if err != nil {
+		result := Result{
+			URL:        currentURL,
+			SourceURL:  sourceURL,
+			Error:      fmt.Sprintf("Invalid URL: %v", err),
+			Type:       "link",
+			IsExternal: false,
+		}
+		s.mu.Lock()
+		s.checkedURLs[currentURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+
+	// Determine if the URL is internal or external
+	isExternal := currentParsed.Hostname() != baseHostname && currentParsed.Hostname() != ""
+
+	// Skip external links processing if internalOnly is set
+	if isExternal && s.internalOnly {
+		return
+	}
+
+	// Process external links differently from internal links
+	if isExternal {
+		s.checkExternalURL(currentURL, sourceURL, results)
+		return
+	}
+
+	// Internal URL, check and crawl
+	resp, err := s.client.Get(currentURL)
+	if err != nil {
+		result := Result{
+			URL:        currentURL,
+			SourceURL:  sourceURL,
+			Error:      err.Error(),
+			Type:       "link",
+			IsExternal: isExternal,
+		}
+		s.mu.Lock()
+		s.checkedURLs[currentURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+	defer resp.Body.Close()
+
+	// Add the result
+	var result Result
+	if resp.StatusCode >= 400 {
+		result = Result{
+			URL:        currentURL,
+			SourceURL:  sourceURL,
+			Status:     resp.StatusCode,
+			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
+			Type:       "link",
+			IsExternal: isExternal,
+		}
+		s.mu.Lock()
+		s.checkedURLs[currentURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	} else {
+		result = Result{
+			URL:        currentURL,
+			SourceURL:  sourceURL,
+			Status:     resp.StatusCode,
+			Type:       "link",
+			IsExternal: isExternal,
+		}
+		s.mu.Lock()
+		s.checkedURLs[currentURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+	}
+
+	// Only parse HTML and CSS from internal links
+	contentType := resp.Header.Get("Content-Type")
+	if strings.Contains(contentType, "text/html") {
+		// Use the base hostname to create a base URL for this site
+		baseURL := ""
+		if currentParsed.Scheme != "" && currentParsed.Host != "" {
+			baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
+		}
+
+		foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results)
+		// Add all found URLs to the queue and increment active count
+		if len(foundURLs) > 0 {
+			count := <-activeCount
+			count += len(foundURLs)
+			activeCount <- count
+
+			for _, url := range foundURLs {
+				queue <- QueueItem{
+					URL:       url,
+					SourceURL: currentURL, // The source URL is the current page we're processing
+					Depth:     depth + 1,
+				}
+			}
+		}
+	} else if strings.Contains(contentType, "text/css") {
+		// Use the base hostname to create a base URL for this site
+		baseURL := ""
+		if currentParsed.Scheme != "" && currentParsed.Host != "" {
+			baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
+		}
+
+		s.parseCSS(currentURL, resp, baseURL, baseHostname, results)
+	}
+}
+
+// parseHTML extracts links and other resources from HTML
+func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string {
+	foundURLs := []string{}
+
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		s.addResult(results, Result{
+			URL:        sourceURL,
+			SourceURL:  sourceURL, // Use self as source for error
+			Error:      fmt.Sprintf("Failed to parse HTML: %v", err),
+			Status:     resp.StatusCode,
+			Type:       "html",
+			IsExternal: false,
+		})
+		return foundURLs
+	}
+
+	// Process links (a href)
+	doc.Find("a").Each(func(i int, sel *goquery.Selection) {
+		if href, exists := sel.Attr("href"); exists {
+			targetURL := s.resolveURL(href, sourceURL)
+			if targetURL != "" {
+				linkParsed, err := url.Parse(targetURL)
+				if err == nil {
+					isExternal := linkParsed.Hostname() != baseHostname && linkParsed.Hostname() != ""
+
+					// Only add internal links to the crawl queue
+					if !isExternal {
+						foundURLs = append(foundURLs, targetURL)
+					} else if !s.internalOnly {
+						// Check external links only if internalOnly is false
+						s.checkExternalURL(targetURL, sourceURL, results)
+					}
+				}
+			}
+		}
+	})
+
+	// Process images
+	doc.Find("img").Each(func(i int, sel *goquery.Selection) {
+		if src, exists := sel.Attr("src"); exists {
+			targetURL := s.resolveURL(src, sourceURL)
+			if targetURL != "" {
+				s.checkResource(targetURL, sourceURL, "image", baseHostname, results)
+			}
+		}
+	})
+
+	// Process stylesheets
+	doc.Find("link[rel='stylesheet']").Each(func(i int, sel *goquery.Selection) {
+		if href, exists := sel.Attr("href"); exists {
+			targetURL := s.resolveURL(href, sourceURL)
+			if targetURL != "" {
+				s.checkResource(targetURL, sourceURL, "stylesheet", baseHostname, results)
+			}
+		}
+	})
+
+	// Process scripts
+	doc.Find("script").Each(func(i int, sel *goquery.Selection) {
+		if src, exists := sel.Attr("src"); exists {
+			targetURL := s.resolveURL(src, sourceURL)
+			if targetURL != "" {
+				s.checkResource(targetURL, sourceURL, "script", baseHostname, results)
+			}
+		}
+	})
+
+	return foundURLs
+}
+
+// parseCSS extracts URLs from CSS content
+func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) {
+	// Simple regex-based parsing for CSS imports and url() references
+	// This is a simplified approach; a proper CSS parser would be better
+	// for production use
+	scanner := bufio.NewScanner(resp.Body)
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		// Look for url() references
+		urlMatches := urlRegexp.FindAllStringSubmatch(line, -1)
+		for _, match := range urlMatches {
+			if len(match) > 1 {
+				// Remove quotes if present
+				urlValue := strings.Trim(match[1], "'\"")
+				targetURL := s.resolveURL(urlValue, sourceURL)
+				if targetURL != "" {
+					s.checkResource(targetURL, sourceURL, "css-url", baseHostname, results)
+				}
+			}
+		}
+
+		// Look for @import statements
+		importMatches := importRegexp.FindAllStringSubmatch(line, -1)
+		for _, match := range importMatches {
+			if len(match) > 1 {
+				// Remove quotes if present
+				urlValue := strings.Trim(match[1], "'\"")
+				targetURL := s.resolveURL(urlValue, sourceURL)
+				if targetURL != "" {
+					s.checkResource(targetURL, sourceURL, "css-import", baseHostname, results)
+				}
+			}
+		}
+	}
+}
+
+// resolveURL resolves a relative URL against a base URL
+func (s *Scraper) resolveURL(href, sourceURL string) string {
+	// Skip empty URLs, anchors, and javascript
+	if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
+		return ""
+	}
+
+	// Skip non-HTTP protocols like mailto:, tel:, etc.
+	if strings.HasPrefix(href, "mailto:") ||
+		strings.HasPrefix(href, "tel:") ||
+		strings.HasPrefix(href, "sms:") ||
+		strings.HasPrefix(href, "ftp:") ||
+		strings.HasPrefix(href, "file:") {
+		return ""
+	}
+
+	sourceParsed, err := url.Parse(sourceURL)
+	if err != nil {
+		return ""
+	}
+
+	targetParsed, err := url.Parse(href)
+	if err != nil {
+		return ""
+	}
+
+	// If the scheme is not HTTP/HTTPS, skip it
+	if targetParsed.Scheme != "" &&
+		targetParsed.Scheme != "http" &&
+		targetParsed.Scheme != "https" {
+		return ""
+	}
+
+	resolvedURL := sourceParsed.ResolveReference(targetParsed).String()
+	return resolvedURL
+}
+
+// checkExternalURL performs a HEAD request to check external URLs
+func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) {
+	// Skip external links if internalOnly is set
+	if s.internalOnly {
+		return
+	}
+
+	// Check if URL was already checked
+	s.mu.Lock()
+	if result, exists := s.checkedURLs[targetURL]; exists {
+		// Add the existing result with the current source URL
+		result.SourceURL = sourceURL
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+	s.mu.Unlock()
+
+	req, err := http.NewRequest("HEAD", targetURL, nil)
+	if err != nil {
+		result := Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Error:      fmt.Sprintf("Invalid URL: %v", err),
+			Type:       "external-link",
+			IsExternal: true,
+		}
+		s.mu.Lock()
+		s.checkedURLs[targetURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		result := Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Error:      err.Error(),
+			Type:       "external-link",
+			IsExternal: true,
+		}
+		s.mu.Lock()
+		s.checkedURLs[targetURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+	defer resp.Body.Close()
+
+	var result Result
+	if resp.StatusCode >= 400 {
+		result = Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Status:     resp.StatusCode,
+			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
+			Type:       "external-link",
+			IsExternal: true,
+		}
+	} else {
+		result = Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Status:     resp.StatusCode,
+			Type:       "external-link",
+			IsExternal: true,
+		}
+	}
+
+	s.mu.Lock()
+	s.checkedURLs[targetURL] = result
+	s.mu.Unlock()
+	s.addResult(results, result)
+}
+
+// checkResource checks if a resource URL is accessible
+func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) {
+	// Parse the target URL to determine if it's internal or external
+	targetParsed, err := url.Parse(targetURL)
+	if err != nil {
+		result := Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Error:      fmt.Sprintf("Invalid URL: %v", err),
+			Type:       resourceType,
+			IsExternal: false,
+		}
+		s.mu.Lock()
+		s.checkedURLs[targetURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+
+	isExternal := targetParsed.Hostname() != baseHostname && targetParsed.Hostname() != ""
+
+	// Skip external resources if internalOnly is set
+	if isExternal && s.internalOnly {
+		return
+	}
+
+	// Check if URL was already checked
+	s.mu.Lock()
+	if result, exists := s.checkedURLs[targetURL]; exists {
+		// Update with current source and type if needed
+		result.SourceURL = sourceURL
+		result.Type = resourceType
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+	s.mu.Unlock()
+
+	req, err := http.NewRequest("HEAD", targetURL, nil)
+	if err != nil {
+		result := Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Error:      fmt.Sprintf("Invalid URL: %v", err),
+			Type:       resourceType,
+			IsExternal: isExternal,
+		}
+		s.mu.Lock()
+		s.checkedURLs[targetURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		result := Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Error:      err.Error(),
+			Type:       resourceType,
+			IsExternal: isExternal,
+		}
+		s.mu.Lock()
+		s.checkedURLs[targetURL] = result
+		s.mu.Unlock()
+		s.addResult(results, result)
+		return
+	}
+	defer resp.Body.Close()
+
+	var result Result
+	if resp.StatusCode >= 400 {
+		result = Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Status:     resp.StatusCode,
+			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
+			Type:       resourceType,
+			IsExternal: isExternal,
+		}
+	} else {
+		result = Result{
+			URL:        targetURL,
+			SourceURL:  sourceURL,
+			Status:     resp.StatusCode,
+			Type:       resourceType,
+			IsExternal: isExternal,
+		}
+	}
+
+	s.mu.Lock()
+	s.checkedURLs[targetURL] = result
+	s.mu.Unlock()
+	s.addResult(results, result)
+}
+
+// addResult adds a result to the appropriate list
+func (s *Scraper) addResult(results *Results, result Result) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if result.Error != "" {
+		results.Errors = append(results.Errors, result)
+	} else {
+		results.Successes = append(results.Successes, result)
+	}
+}
--- a/pkg/scraper/scraper_test.go
+++ b/pkg/scraper/scraper_test.go
@ -0,0 +1,163 @@
+package scraper
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"testing"
+)
+
+func TestResolveURL(t *testing.T) {
+	s := &Scraper{}
+
+	tests := []struct {
+		name       string
+		href       string
+		sourceURL  string
+		baseURL    string
+		wantResult string
+	}{
+		{
+			name:       "Absolute URL",
+			href:       "https://example.com/page.html",
+			sourceURL:  "https://example.org/index.html",
+			baseURL:    "https://example.org/",
+			wantResult: "https://example.com/page.html",
+		},
+		{
+			name:       "Relative URL",
+			href:       "page.html",
+			sourceURL:  "https://example.org/index.html",
+			baseURL:    "https://example.org/",
+			wantResult: "https://example.org/page.html",
+		},
+		{
+			name:       "Anchor link",
+			href:       "#section",
+			sourceURL:  "https://example.org/index.html",
+			baseURL:    "https://example.org/",
+			wantResult: "",
+		},
+		{
+			name:       "JavaScript link",
+			href:       "javascript:void(0)",
+			sourceURL:  "https://example.org/index.html",
+			baseURL:    "https://example.org/",
+			wantResult: "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := s.resolveURL(tt.href, tt.sourceURL)
+			if got != tt.wantResult {
+				t.Errorf("resolveURL() = %v, want %v", got, tt.wantResult)
+			}
+		})
+	}
+}
+
+func TestNew(t *testing.T) {
+	s := New(
+		WithConcurrency(20),
+		WithDepth(5),
+		WithTimeout(30),
+		WithVerbose(true),
+	)
+
+	if s.concurrency != 20 {
+		t.Errorf("Expected concurrency to be 20, got %d", s.concurrency)
+	}
+
+	if s.depth != 5 {
+		t.Errorf("Expected depth to be 5, got %d", s.depth)
+	}
+
+	if s.client.Timeout != 30*1000*1000*1000 { // 30 seconds in nanoseconds
+		t.Errorf("Expected timeout to be 30s, got %v", s.client.Timeout)
+	}
+
+	if !s.verbose {
+		t.Errorf("Expected verbose to be true")
+	}
+}
+
+func TestAddResult(t *testing.T) {
+	s := &Scraper{}
+	results := &Results{}
+
+	// Add an error result
+	errorResult := Result{
+		URL:   "https://example.com/error",
+		Error: "Test error",
+		Type:  "link",
+	}
+	s.addResult(results, errorResult)
+
+	if len(results.Errors) != 1 {
+		t.Errorf("Expected 1 error, got %d", len(results.Errors))
+	}
+
+	// Add a success result
+	successResult := Result{
+		URL:    "https://example.com/success",
+		Status: 200,
+		Type:   "link",
+	}
+	s.addResult(results, successResult)
+
+	if len(results.Successes) != 1 {
+		t.Errorf("Expected 1 success, got %d", len(results.Successes))
+	}
+}
+
+func TestProcessURL(t *testing.T) {
+	// Create a test server
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		if _, err := w.Write([]byte(`
+			<!DOCTYPE html>
+			<html>
+			<head>
+				<link rel="stylesheet" href="/style.css">
+				<script src="/script.js"></script>
+			</head>
+			<body>
+				<a href="/page1.html">Page 1</a>
+				<a href="https://example.com">External</a>
+				<img src="/image.jpg">
+			</body>
+			</html>
+		`)); err != nil {
+			t.Fatalf("Failed to write response: %v", err)
+		}
+	}))
+	defer server.Close()
+
+	s := New(WithDepth(1), WithConcurrency(1))
+	results := &Results{}
+
+	// Create a channel for QueueItems instead of strings
+	queue := make(chan QueueItem, 10)
+
+	// Create active count channel
+	activeCount := make(chan int, 1)
+	activeCount <- 1 // Start with one active URL
+
+	// Parse the server URL to get the hostname
+	serverURL, _ := url.Parse(server.URL)
+	baseHostname := serverURL.Hostname()
+
+	// Process the URL with the updated signature
+	s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)
+
+	// Check that we found at least one success (the main page)
+	if len(results.Successes) < 1 {
+		t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
+	}
+
+	// Check that we queued some URLs for processing
+	if len(queue) < 1 {
+		t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
+	}
+}
--- a/pkg/scraper/testsite_test.go
+++ b/pkg/scraper/testsite_test.go
@ -0,0 +1,206 @@
+package scraper
+
+import (
+	"strings"
+	"testing"
+
+	"git.nakama.town/fmartingr/dharma/pkg/testutil"
+)
+
+func TestTestsiteIntegration(t *testing.T) {
+	// Skip if running short tests
+	if testing.Short() {
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	// Start the testsite server
+	serverURL, cleanup, err := testutil.StartTestsiteServer()
+	if err != nil {
+		t.Fatalf("Failed to start test server: %v", err)
+	}
+	defer cleanup()
+
+	// Create a new scraper with default settings
+	s := New(
+		WithConcurrency(2),
+		WithDepth(3),
+		WithTimeout(5),
+		WithVerbose(false),
+		WithInternalOnly(true),
+	)
+
+	// Run the scraper
+	results, err := s.Scan(serverURL)
+	if err != nil {
+		t.Fatalf("Scraper.Scan failed: %v", err)
+	}
+
+	// Verify we have results
+	if results == nil {
+		t.Fatal("Expected results but got nil")
+	}
+
+	// Check that we have the correct base URL
+	if results.BaseURL != serverURL {
+		t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
+	}
+
+	// Test cases for specific URLs that should be found
+	expectedURLs := map[string]bool{
+		"/found.html":        true,
+		"/not_found.html":    false,
+		"/rel/index.html":    true,
+		"/rel/relfound.html": true,
+		"/static/style.css":  true,
+		"/static/script.js":  true,
+	}
+
+	// Check for expected URLs in the results
+	for urlPath, shouldExist := range expectedURLs {
+		fullURL := serverURL + urlPath
+		found := false
+
+		// Look in both successes and errors
+		for _, result := range results.Successes {
+			if result.URL == fullURL {
+				found = true
+				if !shouldExist {
+					t.Errorf("URL %s should not exist but was found in successes", urlPath)
+				}
+				break
+			}
+		}
+
+		if !found && shouldExist {
+			// If not found in successes, check if it's in errors
+			for _, result := range results.Errors {
+				if result.URL == fullURL {
+					found = true
+					t.Errorf("URL %s should exist but was found in errors", urlPath)
+					break
+				}
+			}
+
+			if !found {
+				t.Errorf("Expected URL %s was not found in results", urlPath)
+			}
+		}
+	}
+
+	// Check that not_found.html is in errors
+	notFoundURL := serverURL + "/not_found.html"
+	foundInErrors := false
+	for _, result := range results.Errors {
+		if result.URL == notFoundURL {
+			foundInErrors = true
+			break
+		}
+	}
+
+	if !foundInErrors {
+		t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
+	}
+
+	// Verify relative links in the rel directory
+	relNotFoundURL := serverURL + "/rel/rel_not_found.html"
+	foundRelNotFound := false
+	for _, result := range results.Errors {
+		if result.URL == relNotFoundURL {
+			foundRelNotFound = true
+			break
+		}
+	}
+
+	if !foundRelNotFound {
+		t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
+	}
+
+	// Check for missing image
+	missingImageURL := serverURL + "/rel/image-404.jpg"
+	foundMissingImage := false
+	for _, result := range results.Errors {
+		if result.URL == missingImageURL {
+			foundMissingImage = true
+			break
+		}
+	}
+
+	if !foundMissingImage {
+		t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
+	}
+
+	// Check for external links
+	externalLinkCount := 0
+	for _, result := range results.Successes {
+		if strings.Contains(result.URL, "fmartingr.com") {
+			externalLinkCount++
+		}
+	}
+
+	if externalLinkCount != 0 {
+		t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
+	}
+
+	// Verify total count
+	expectedTotal := len(results.Successes) + len(results.Errors)
+	if results.Total != expectedTotal {
+		t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
+	}
+}
+
+func TestTestsiteWithExternalLinks(t *testing.T) {
+	// Skip if running short tests
+	if testing.Short() {
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	// Start the testsite server
+	serverURL, cleanup, err := testutil.StartTestsiteServer()
+	if err != nil {
+		t.Fatalf("Failed to start test server: %v", err)
+	}
+	defer cleanup()
+
+	// Create a new scraper with external links allowed
+	s := New(
+		WithConcurrency(2),
+		WithDepth(1), // Lower depth for external links test
+		WithTimeout(5),
+		WithVerbose(false),
+		WithInternalOnly(false), // Allow external links
+	)
+
+	// Run the scraper
+	results, err := s.Scan(serverURL)
+	if err != nil {
+		t.Fatalf("Scraper.Scan failed: %v", err)
+	}
+
+	// Check for external links - we should find at least one external link
+	foundExternalLinks := false
+	brokenExternalLinks := false
+
+	for _, result := range results.Successes {
+		if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
+			foundExternalLinks = true
+			break
+		}
+	}
+
+	for _, result := range results.Errors {
+		if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
+			brokenExternalLinks = true
+			break
+		}
+	}
+
+	// We don't actually hit external URLs in tests, so we can't assert on them,
+	// but we can check that they're properly identified
+	if !foundExternalLinks {
+		t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
+	}
+
+	if !brokenExternalLinks {
+		t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
+	}
+}
--- a/pkg/testutil/testserver.go
+++ b/pkg/testutil/testserver.go
@ -0,0 +1,79 @@
+package testutil
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+)
+
+// StartTestsiteServer starts an HTTP server for the testsite directory
+// and returns the server URL and a cleanup function
+func StartTestsiteServer() (string, func(), error) {
+	// Determine the absolute path to the testsite directory
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", nil, fmt.Errorf("failed to get working directory: %v", err)
+	}
+
+	// Navigate up to find the project root
+	// This assumes the function is called from somewhere within the project
+	projectRoot := wd
+	for {
+		if _, err := os.Stat(filepath.Join(projectRoot, "testsite")); err == nil {
+			break // Found the testsite directory
+		}
+		parent := filepath.Dir(projectRoot)
+		if parent == projectRoot {
+			return "", nil, fmt.Errorf("testsite directory not found")
+		}
+		projectRoot = parent
+	}
+
+	testsitePath := filepath.Join(projectRoot, "testsite")
+	if _, err := os.Stat(testsitePath); err != nil {
+		return "", nil, fmt.Errorf("testsite directory not found: %v", err)
+	}
+
+	// Create a file server for the testsite directory
+	fileServer := http.FileServer(http.Dir(testsitePath))
+
+	// Create a test server to serve the testsite files
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		fileServer.ServeHTTP(w, r)
+	}))
+
+	// Return the server URL and a cleanup function
+	return server.URL, server.Close, nil
+}
+
+// GetTestsitePath returns the absolute path to the testsite directory
+func GetTestsitePath() (string, error) {
+	// Determine the absolute path to the testsite directory
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", fmt.Errorf("failed to get working directory: %v", err)
+	}
+
+	// Navigate up to find the project root
+	// This assumes the function is called from somewhere within the project
+	projectRoot := wd
+	for {
+		if _, err := os.Stat(filepath.Join(projectRoot, "testsite")); err == nil {
+			break // Found the testsite directory
+		}
+		parent := filepath.Dir(projectRoot)
+		if parent == projectRoot {
+			return "", fmt.Errorf("testsite directory not found")
+		}
+		projectRoot = parent
+	}
+
+	testsitePath := filepath.Join(projectRoot, "testsite")
+	if _, err := os.Stat(testsitePath); err != nil {
+		return "", fmt.Errorf("testsite directory not found: %v", err)
+	}
+
+	return testsitePath, nil
+}