package scraper

import (
	"strings"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/testutil"
)

// TestTestsiteIntegration scans the local testsite server with
// WithInternalOnly(true) and checks which URLs land in results.Successes and
// which land in results.Errors. It is skipped in short mode.
func TestTestsiteIntegration(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a scraper that only follows internal links
	s := New(
		WithConcurrency(2),
		WithDepth(3),
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(true),
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Verify we have results
	if results == nil {
		t.Fatal("Expected results but got nil")
	}

	// Check that we have the correct base URL
	if results.BaseURL != serverURL {
		t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
	}

	// inSuccesses reports whether target was recorded as a successful result.
	inSuccesses := func(target string) bool {
		for _, result := range results.Successes {
			if result.URL == target {
				return true
			}
		}
		return false
	}

	// inErrors reports whether target was recorded as a failed result.
	inErrors := func(target string) bool {
		for _, result := range results.Errors {
			if result.URL == target {
				return true
			}
		}
		return false
	}

	// Test cases for specific URLs. A slice (rather than a map) keeps the
	// order of any failure output stable between runs.
	expectedURLs := []struct {
		path        string
		shouldExist bool
	}{
		{"/found.html", true},
		{"/not_found.html", false},
		{"/rel/index.html", true},
		{"/rel/relfound.html", true},
		{"/static/style.css", true},
		{"/static/script.js", true},
	}

	// Check for expected URLs in the results
	for _, tc := range expectedURLs {
		fullURL := serverURL + tc.path
		switch {
		case inSuccesses(fullURL):
			if !tc.shouldExist {
				t.Errorf("URL %s should not exist but was found in successes", tc.path)
			}
		case !tc.shouldExist:
			// Absent from successes, which is all that is required here;
			// presence in errors is asserted separately below.
		case inErrors(fullURL):
			t.Errorf("URL %s should exist but was found in errors", tc.path)
		default:
			t.Errorf("Expected URL %s was not found in results", tc.path)
		}
	}

	// Broken links on the testsite that must be reported as errors: a missing
	// page at the root, a missing relative page, and a missing image.
	expectedErrorPaths := []string{
		"/not_found.html",
		"/rel/rel_not_found.html",
		"/rel/image-404.jpg",
	}
	for _, path := range expectedErrorPaths {
		target := serverURL + path
		if !inErrors(target) {
			t.Errorf("Expected %s to be in errors but it wasn't", target)
		}
	}

	// Check for external links. Only successful results are inspected here.
	externalLinkCount := 0
	for _, result := range results.Successes {
		if strings.Contains(result.URL, "fmartingr.com") {
			externalLinkCount++
		}
	}
	if externalLinkCount != 0 {
		t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
	}

	// Verify total count
	expectedTotal := len(results.Successes) + len(results.Errors)
	if results.Total != expectedTotal {
		t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
	}
}

// TestTestsiteWithExternalLinks scans the local testsite server with
// WithInternalOnly(false) and looks for results flagged as external. The
// external checks only log, they never fail the test. It is skipped in short
// mode.
func TestTestsiteWithExternalLinks(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with external links allowed
	s := New(
		WithConcurrency(2),
		WithDepth(1), // Lower depth for external links test
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(false), // Allow external links
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Guard against a nil result so the loops below cannot dereference nil.
	if results == nil {
		t.Fatal("Expected results but got nil")
	}

	// Check for external links - we should find at least one external link
	foundExternalLinks := false
	for _, result := range results.Successes {
		if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
			foundExternalLinks = true
			break
		}
	}

	brokenExternalLinks := false
	for _, result := range results.Errors {
		if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
			brokenExternalLinks = true
			break
		}
	}

	// We don't actually hit external URLs in tests, so we can't assert on them,
	// but we can check that they're properly identified
	if !foundExternalLinks {
		t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
	}
	if !brokenExternalLinks {
		t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
	}
}