This commit is contained in:
commit
0ef15167d5
28 changed files with 2789 additions and 0 deletions
206
pkg/scraper/testsite_test.go
Normal file
206
pkg/scraper/testsite_test.go
Normal file
|
@ -0,0 +1,206 @@
|
|||
package scraper
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"git.nakama.town/fmartingr/dharma/pkg/testutil"
|
||||
)
|
||||
|
||||
func TestTestsiteIntegration(t *testing.T) {
|
||||
// Skip if running short tests
|
||||
if testing.Short() {
|
||||
t.Skip("Skipping integration test in short mode")
|
||||
}
|
||||
|
||||
// Start the testsite server
|
||||
serverURL, cleanup, err := testutil.StartTestsiteServer()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to start test server: %v", err)
|
||||
}
|
||||
defer cleanup()
|
||||
|
||||
// Create a new scraper with default settings
|
||||
s := New(
|
||||
WithConcurrency(2),
|
||||
WithDepth(3),
|
||||
WithTimeout(5),
|
||||
WithVerbose(false),
|
||||
WithInternalOnly(true),
|
||||
)
|
||||
|
||||
// Run the scraper
|
||||
results, err := s.Scan(serverURL)
|
||||
if err != nil {
|
||||
t.Fatalf("Scraper.Scan failed: %v", err)
|
||||
}
|
||||
|
||||
// Verify we have results
|
||||
if results == nil {
|
||||
t.Fatal("Expected results but got nil")
|
||||
}
|
||||
|
||||
// Check that we have the correct base URL
|
||||
if results.BaseURL != serverURL {
|
||||
t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
|
||||
}
|
||||
|
||||
// Test cases for specific URLs that should be found
|
||||
expectedURLs := map[string]bool{
|
||||
"/found.html": true,
|
||||
"/not_found.html": false,
|
||||
"/rel/index.html": true,
|
||||
"/rel/relfound.html": true,
|
||||
"/static/style.css": true,
|
||||
"/static/script.js": true,
|
||||
}
|
||||
|
||||
// Check for expected URLs in the results
|
||||
for urlPath, shouldExist := range expectedURLs {
|
||||
fullURL := serverURL + urlPath
|
||||
found := false
|
||||
|
||||
// Look in both successes and errors
|
||||
for _, result := range results.Successes {
|
||||
if result.URL == fullURL {
|
||||
found = true
|
||||
if !shouldExist {
|
||||
t.Errorf("URL %s should not exist but was found in successes", urlPath)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found && shouldExist {
|
||||
// If not found in successes, check if it's in errors
|
||||
for _, result := range results.Errors {
|
||||
if result.URL == fullURL {
|
||||
found = true
|
||||
t.Errorf("URL %s should exist but was found in errors", urlPath)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
t.Errorf("Expected URL %s was not found in results", urlPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check that not_found.html is in errors
|
||||
notFoundURL := serverURL + "/not_found.html"
|
||||
foundInErrors := false
|
||||
for _, result := range results.Errors {
|
||||
if result.URL == notFoundURL {
|
||||
foundInErrors = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !foundInErrors {
|
||||
t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
|
||||
}
|
||||
|
||||
// Verify relative links in the rel directory
|
||||
relNotFoundURL := serverURL + "/rel/rel_not_found.html"
|
||||
foundRelNotFound := false
|
||||
for _, result := range results.Errors {
|
||||
if result.URL == relNotFoundURL {
|
||||
foundRelNotFound = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !foundRelNotFound {
|
||||
t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
|
||||
}
|
||||
|
||||
// Check for missing image
|
||||
missingImageURL := serverURL + "/rel/image-404.jpg"
|
||||
foundMissingImage := false
|
||||
for _, result := range results.Errors {
|
||||
if result.URL == missingImageURL {
|
||||
foundMissingImage = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !foundMissingImage {
|
||||
t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
|
||||
}
|
||||
|
||||
// Check for external links
|
||||
externalLinkCount := 0
|
||||
for _, result := range results.Successes {
|
||||
if strings.Contains(result.URL, "fmartingr.com") {
|
||||
externalLinkCount++
|
||||
}
|
||||
}
|
||||
|
||||
if externalLinkCount != 0 {
|
||||
t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
|
||||
}
|
||||
|
||||
// Verify total count
|
||||
expectedTotal := len(results.Successes) + len(results.Errors)
|
||||
if results.Total != expectedTotal {
|
||||
t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTestsiteWithExternalLinks(t *testing.T) {
|
||||
// Skip if running short tests
|
||||
if testing.Short() {
|
||||
t.Skip("Skipping integration test in short mode")
|
||||
}
|
||||
|
||||
// Start the testsite server
|
||||
serverURL, cleanup, err := testutil.StartTestsiteServer()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to start test server: %v", err)
|
||||
}
|
||||
defer cleanup()
|
||||
|
||||
// Create a new scraper with external links allowed
|
||||
s := New(
|
||||
WithConcurrency(2),
|
||||
WithDepth(1), // Lower depth for external links test
|
||||
WithTimeout(5),
|
||||
WithVerbose(false),
|
||||
WithInternalOnly(false), // Allow external links
|
||||
)
|
||||
|
||||
// Run the scraper
|
||||
results, err := s.Scan(serverURL)
|
||||
if err != nil {
|
||||
t.Fatalf("Scraper.Scan failed: %v", err)
|
||||
}
|
||||
|
||||
// Check for external links - we should find at least one external link
|
||||
foundExternalLinks := false
|
||||
brokenExternalLinks := false
|
||||
|
||||
for _, result := range results.Successes {
|
||||
if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
|
||||
foundExternalLinks = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
for _, result := range results.Errors {
|
||||
if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
|
||||
brokenExternalLinks = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// We don't actually hit external URLs in tests, so we can't assert on them,
|
||||
// but we can check that they're properly identified
|
||||
if !foundExternalLinks {
|
||||
t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
|
||||
}
|
||||
|
||||
if !brokenExternalLinks {
|
||||
t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue