package scraper

import (
	"strings"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/testutil"
)

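// TestTestsiteIntegration runs the scraper against the local testsite fixture
// and verifies that found, missing, and relative links are classified correctly.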
func TestTestsiteIntegration(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with default settings
	s := New(
		WithConcurrency(2),
		WithDepth(3),
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(true),
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Verify we have results
	if results == nil {
		t.Fatal("Expected results but got nil")
	}

	// Check that we have the correct base URL
	if results.BaseURL != serverURL {
		t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
	}

	// URLs to check, mapped to whether each should resolve successfully
	expectedURLs := map[string]bool{
		"/found.html":        true,
		"/not_found.html":    false,
		"/rel/index.html":    true,
		"/rel/relfound.html": true,
		"/static/style.css":  true,
		"/static/script.js":  true,
	}

	// Check for expected URLs in the results
	for urlPath, shouldExist := range expectedURLs {
		fullURL := serverURL + urlPath
		found := false

		// Look in successes first
		for _, result := range results.Successes {
			if result.URL == fullURL {
				found = true
				if !shouldExist {
					t.Errorf("URL %s should not exist but was found in successes", urlPath)
				}
				break
			}
		}

		if !found && shouldExist {
			// If not found in successes, check if it's in errors
			for _, result := range results.Errors {
				if result.URL == fullURL {
					found = true
					t.Errorf("URL %s should exist but was found in errors", urlPath)
					break
				}
			}

			if !found {
				t.Errorf("Expected URL %s was not found in results", urlPath)
			}
		}
	}

	// Check that not_found.html is in errors
	notFoundURL := serverURL + "/not_found.html"
	foundInErrors := false
	for _, result := range results.Errors {
		if result.URL == notFoundURL {
			foundInErrors = true
			break
		}
	}

	if !foundInErrors {
		t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
	}

	// Verify relative links in the rel directory
	relNotFoundURL := serverURL + "/rel/rel_not_found.html"
	foundRelNotFound := false
	for _, result := range results.Errors {
		if result.URL == relNotFoundURL {
			foundRelNotFound = true
			break
		}
	}

	if !foundRelNotFound {
		t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
	}

	// Check for missing image
	missingImageURL := serverURL + "/rel/image-404.jpg"
	foundMissingImage := false
	for _, result := range results.Errors {
		if result.URL == missingImageURL {
			foundMissingImage = true
			break
		}
	}

	if !foundMissingImage {
		t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
	}

	// Check for external links
	externalLinkCount := 0
	for _, result := range results.Successes {
		if strings.Contains(result.URL, "fmartingr.com") {
			externalLinkCount++
		}
	}

	if externalLinkCount != 0 {
		t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
	}

	// Verify total count
	expectedTotal := len(results.Successes) + len(results.Errors)
	if results.Total != expectedTotal {
		t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
	}
}

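// TestTestsiteWithExternalLinks runs the scraper with WithInternalOnly(false)
// and checks that external links are identified; it only logs (rather than
// fails) when no external results are present, since tests may run offline.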
func TestTestsiteWithExternalLinks(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with external links allowed
	s := New(
		WithConcurrency(2),
		WithDepth(1), // Lower depth for external links test
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(false), // Allow external links
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Check for external links - at least one should be identified now that
	// external scanning is enabled
	foundExternalLinks := false
	brokenExternalLinks := false

	for _, result := range results.Successes {
		if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
			foundExternalLinks = true
			break
		}
	}

	for _, result := range results.Errors {
		if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
			brokenExternalLinks = true
			break
		}
	}

	// External URLs may be unreachable while tests run, so we don't assert on
	// them - we only log whether they were identified
	if !foundExternalLinks {
		t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
	}

	if !brokenExternalLinks {
		t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
	}
}