package scraper

import (
	"strings"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/testutil"
)

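// TestTestsiteIntegration runs the scraper against the local testsite fixture
// and verifies that found, missing, and relative links are classified correctly.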
func TestTestsiteIntegration(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with default settings
	s := New(
		WithConcurrency(2),
		WithDepth(3),
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(true),
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Verify we have results
	if results == nil {
		t.Fatal("Expected results but got nil")
	}

	// Check that we have the correct base URL
	if results.BaseURL != serverURL {
		t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
	}

	// URLs to check, mapped to whether each should resolve successfully
	expectedURLs := map[string]bool{
		"/found.html":        true,
		"/not_found.html":    false,
		"/rel/index.html":    true,
		"/rel/relfound.html": true,
		"/static/style.css":  true,
		"/static/script.js":  true,
	}

	// Check for expected URLs in the results
	for urlPath, shouldExist := range expectedURLs {
		fullURL := serverURL + urlPath
		found := false

		// Look in successes first
		for _, result := range results.Successes {
			if result.URL == fullURL {
				found = true
				if !shouldExist {
					t.Errorf("URL %s should not exist but was found in successes", urlPath)
				}
				break
			}
		}

		if !found && shouldExist {
			// If not found in successes, check if it's in errors
			for _, result := range results.Errors {
				if result.URL == fullURL {
					found = true
					t.Errorf("URL %s should exist but was found in errors", urlPath)
					break
				}
			}

			if !found {
				t.Errorf("Expected URL %s was not found in results", urlPath)
			}
		}
	}

	// Check that not_found.html is in errors
	notFoundURL := serverURL + "/not_found.html"
	foundInErrors := false
	for _, result := range results.Errors {
		if result.URL == notFoundURL {
			foundInErrors = true
			break
		}
	}

	if !foundInErrors {
		t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
	}

	// Verify relative links in the rel directory
	relNotFoundURL := serverURL + "/rel/rel_not_found.html"
	foundRelNotFound := false
	for _, result := range results.Errors {
		if result.URL == relNotFoundURL {
			foundRelNotFound = true
			break
		}
	}

	if !foundRelNotFound {
		t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
	}

	// Check for missing image
	missingImageURL := serverURL + "/rel/image-404.jpg"
	foundMissingImage := false
	for _, result := range results.Errors {
		if result.URL == missingImageURL {
			foundMissingImage = true
			break
		}
	}

	if !foundMissingImage {
		t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
	}

	// Check for external links
	externalLinkCount := 0
	for _, result := range results.Successes {
		if strings.Contains(result.URL, "fmartingr.com") {
			externalLinkCount++
		}
	}

	if externalLinkCount != 0 {
		t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
	}

	// Verify total count
	expectedTotal := len(results.Successes) + len(results.Errors)
	if results.Total != expectedTotal {
		t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
	}
}

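// TestTestsiteWithExternalLinks runs the scraper with WithInternalOnly(false)
// and checks that external links are identified; it only logs (rather than
// fails) when no external results are present, since tests may run offline.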
func TestTestsiteWithExternalLinks(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with external links allowed
	s := New(
		WithConcurrency(2),
		WithDepth(1), // Lower depth for external links test
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(false), // Allow external links
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Check for external links - at least one should be identified now that
	// external scanning is enabled
	foundExternalLinks := false
	brokenExternalLinks := false

	for _, result := range results.Successes {
		if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
			foundExternalLinks = true
			break
		}
	}

	for _, result := range results.Errors {
		if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
			brokenExternalLinks = true
			break
		}
	}

	// External URLs may be unreachable while tests run, so we don't assert on
	// them - we only log whether they were identified
	if !foundExternalLinks {
		t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
	}

	if !brokenExternalLinks {
		t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
	}
}