initial release

Felipe M 2025-05-04 10:40:26 +02:00
commit 0ef15167d5
Signed by: fmartingr
GPG key ID: CCFBC5637D4000A8
28 changed files with 2789 additions and 0 deletions

669
pkg/scraper/scraper.go Normal file

@@ -0,0 +1,669 @@
package scraper
import (
"bufio"
"fmt"
"net/http"
"net/url"
"regexp"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
)
var (
urlRegexp = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
)
// Result represents a URL check result
type Result struct {
URL string `json:"url"`
SourceURL string `json:"source_url,omitempty"`
Status int `json:"status"`
Error string `json:"error,omitempty"`
Type string `json:"type"` // link, image, script, stylesheet, css-url, css-import, external-link, html
IsExternal bool `json:"is_external"`
}
// Results is a collection of Result
type Results struct {
BaseURL string `json:"base_url"`
Errors []Result `json:"errors"`
Successes []Result `json:"successes"`
Total int `json:"total"`
}
// QueueItem represents a URL to be processed along with its source
type QueueItem struct {
URL string
SourceURL string
Depth int
}
// Scraper handles website crawling and link checking
type Scraper struct {
client *http.Client
concurrency int
depth int
verbose bool
internalOnly bool
visitedURLs map[string]bool // URLs visited for crawling
checkedURLs map[string]Result // URLs already checked to avoid duplicate requests
mu sync.Mutex
}
// Option is a functional option that configures a Scraper
type Option func(*Scraper)
// WithConcurrency sets the concurrency level
func WithConcurrency(concurrency int) Option {
return func(s *Scraper) {
s.concurrency = concurrency
}
}
// WithDepth sets the maximum crawling depth
func WithDepth(depth int) Option {
return func(s *Scraper) {
s.depth = depth
}
}
// WithTimeout sets the timeout for HTTP requests
func WithTimeout(timeoutSec int) Option {
return func(s *Scraper) {
s.client.Timeout = time.Duration(timeoutSec) * time.Second
}
}
// WithVerbose enables verbose output
func WithVerbose(verbose bool) Option {
return func(s *Scraper) {
s.verbose = verbose
}
}
// WithInternalOnly sets whether to only check internal links
func WithInternalOnly(internalOnly bool) Option {
return func(s *Scraper) {
s.internalOnly = internalOnly
}
}
// New creates a new Scraper with the given options
func New(options ...Option) *Scraper {
s := &Scraper{
client: &http.Client{
Timeout: 10 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
},
concurrency: 10,
depth: 3,
visitedURLs: make(map[string]bool),
checkedURLs: make(map[string]Result),
}
for _, option := range options {
option(s)
}
return s
}
// Scan starts the website crawling process
func (s *Scraper) Scan(baseURL string) (*Results, error) {
parsedURL, err := url.Parse(baseURL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
}
// Ensure the base URL has a scheme
if parsedURL.Scheme == "" {
parsedURL.Scheme = "https"
baseURL = parsedURL.String()
}
// Store the base hostname for distinguishing internal vs external links
baseHostname := parsedURL.Hostname()
results := &Results{
BaseURL: baseURL,
}
// Create a waitgroup to track active workers
var wg sync.WaitGroup
// Create a channel to communicate URLs to process
queue := make(chan QueueItem, 1000)
// Create a channel to track active URL processing
activeCount := make(chan int, 1)
activeCount <- 1 // Start with 1 active URL (the base URL)
// Start worker pool
for range s.concurrency {
wg.Add(1)
go func() {
defer wg.Done()
for item := range queue {
s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, queue, results, activeCount)
}
}()
}
// Seed the queue with the base URL; it is its own source
queue <- QueueItem{
URL: baseURL,
SourceURL: baseURL,
Depth: 0,
}
// Monitor active count - when it reaches 0, we're done
go func() {
for {
count := <-activeCount
if count <= 0 {
close(queue)
return
}
activeCount <- count
}
}()
// Wait for workers to finish
wg.Wait()
results.Total = len(results.Errors) + len(results.Successes)
return results, nil
}
// processURL processes a single URL
func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) {
// Decrement active count when done
defer func() {
count := <-activeCount
activeCount <- count - 1
}()
// Check if we've already visited this URL (for crawling) or exceeded max depth
s.mu.Lock()
if s.visitedURLs[currentURL] || depth > s.depth {
s.mu.Unlock()
return
}
s.visitedURLs[currentURL] = true
// If we've already checked this URL (e.g. as a linked resource), reuse the
// stored result, attributed to the current source URL
if result, exists := s.checkedURLs[currentURL]; exists {
result.SourceURL = sourceURL
s.mu.Unlock()
s.addResult(results, result)
// The page is still fetched below so its HTML/CSS can be parsed for
// further links, but only if the earlier check succeeded
if result.Error != "" || result.Status >= 400 {
return
}
} else {
s.mu.Unlock()
}
if s.verbose {
fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL)
}
// Parse the current URL
currentParsed, err := url.Parse(currentURL)
if err != nil {
result := Result{
URL: currentURL,
SourceURL: sourceURL,
Error: fmt.Sprintf("Invalid URL: %v", err),
Type: "link",
IsExternal: false,
}
s.mu.Lock()
s.checkedURLs[currentURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
// Determine if the URL is internal or external
isExternal := currentParsed.Hostname() != baseHostname && currentParsed.Hostname() != ""
// Skip external links processing if internalOnly is set
if isExternal && s.internalOnly {
return
}
// Process external links differently from internal links
if isExternal {
s.checkExternalURL(currentURL, sourceURL, results)
return
}
// Internal URL, check and crawl
resp, err := s.client.Get(currentURL)
if err != nil {
result := Result{
URL: currentURL,
SourceURL: sourceURL,
Error: err.Error(),
Type: "link",
IsExternal: isExternal,
}
s.mu.Lock()
s.checkedURLs[currentURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
defer resp.Body.Close()
// Add the result
var result Result
if resp.StatusCode >= 400 {
result = Result{
URL: currentURL,
SourceURL: sourceURL,
Status: resp.StatusCode,
Error: fmt.Sprintf("HTTP Error: %s", resp.Status),
Type: "link",
IsExternal: isExternal,
}
s.mu.Lock()
s.checkedURLs[currentURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
} else {
result = Result{
URL: currentURL,
SourceURL: sourceURL,
Status: resp.StatusCode,
Type: "link",
IsExternal: isExternal,
}
s.mu.Lock()
s.checkedURLs[currentURL] = result
s.mu.Unlock()
s.addResult(results, result)
}
// Only parse HTML and CSS from internal links
contentType := resp.Header.Get("Content-Type")
if strings.Contains(contentType, "text/html") {
// Derive this site's base URL from the current URL's scheme and host
baseURL := ""
if currentParsed.Scheme != "" && currentParsed.Host != "" {
baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
}
foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results)
// Add all found URLs to the queue and increment active count
if len(foundURLs) > 0 {
count := <-activeCount
count += len(foundURLs)
activeCount <- count
for _, url := range foundURLs {
queue <- QueueItem{
URL: url,
SourceURL: currentURL, // The source URL is the current page we're processing
Depth: depth + 1,
}
}
}
} else if strings.Contains(contentType, "text/css") {
// Derive this site's base URL from the current URL's scheme and host
baseURL := ""
if currentParsed.Scheme != "" && currentParsed.Host != "" {
baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
}
s.parseCSS(currentURL, resp, baseURL, baseHostname, results)
}
}
// parseHTML extracts links and other resources from HTML
func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string {
foundURLs := []string{}
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
s.addResult(results, Result{
URL: sourceURL,
SourceURL: sourceURL, // Use self as source for error
Error: fmt.Sprintf("Failed to parse HTML: %v", err),
Status: resp.StatusCode,
Type: "html",
IsExternal: false,
})
return foundURLs
}
// Process links (a href)
doc.Find("a").Each(func(i int, sel *goquery.Selection) {
if href, exists := sel.Attr("href"); exists {
targetURL := s.resolveURL(href, sourceURL)
if targetURL != "" {
linkParsed, err := url.Parse(targetURL)
if err == nil {
isExternal := linkParsed.Hostname() != baseHostname && linkParsed.Hostname() != ""
// Only add internal links to the crawl queue
if !isExternal {
foundURLs = append(foundURLs, targetURL)
} else if !s.internalOnly {
// Check external links only if internalOnly is false
s.checkExternalURL(targetURL, sourceURL, results)
}
}
}
}
})
// Process images
doc.Find("img").Each(func(i int, sel *goquery.Selection) {
if src, exists := sel.Attr("src"); exists {
targetURL := s.resolveURL(src, sourceURL)
if targetURL != "" {
s.checkResource(targetURL, sourceURL, "image", baseHostname, results)
}
}
})
// Process stylesheets
doc.Find("link[rel='stylesheet']").Each(func(i int, sel *goquery.Selection) {
if href, exists := sel.Attr("href"); exists {
targetURL := s.resolveURL(href, sourceURL)
if targetURL != "" {
s.checkResource(targetURL, sourceURL, "stylesheet", baseHostname, results)
}
}
})
// Process scripts
doc.Find("script").Each(func(i int, sel *goquery.Selection) {
if src, exists := sel.Attr("src"); exists {
targetURL := s.resolveURL(src, sourceURL)
if targetURL != "" {
s.checkResource(targetURL, sourceURL, "script", baseHostname, results)
}
}
})
return foundURLs
}
// parseCSS extracts URLs from CSS content
func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) {
// Simple regex-based parsing for CSS imports and url() references
// This is a simplified approach; a proper CSS parser would be better
// for production use
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
line := scanner.Text()
// Look for url() references
urlMatches := urlRegexp.FindAllStringSubmatch(line, -1)
for _, match := range urlMatches {
if len(match) > 1 {
// Remove quotes if present
urlValue := strings.Trim(match[1], "'\"")
targetURL := s.resolveURL(urlValue, sourceURL)
if targetURL != "" {
s.checkResource(targetURL, sourceURL, "css-url", baseHostname, results)
}
}
}
// Look for @import statements
importMatches := importRegexp.FindAllStringSubmatch(line, -1)
for _, match := range importMatches {
if len(match) > 1 {
// Remove quotes if present
urlValue := strings.Trim(match[1], "'\"")
targetURL := s.resolveURL(urlValue, sourceURL)
if targetURL != "" {
s.checkResource(targetURL, sourceURL, "css-import", baseHostname, results)
}
}
}
}
}
// resolveURL resolves a relative URL against a base URL
func (s *Scraper) resolveURL(href, sourceURL string) string {
// Skip empty URLs, anchors, and javascript
if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
return ""
}
// Skip non-HTTP protocols like mailto:, tel:, etc.
if strings.HasPrefix(href, "mailto:") ||
strings.HasPrefix(href, "tel:") ||
strings.HasPrefix(href, "sms:") ||
strings.HasPrefix(href, "ftp:") ||
strings.HasPrefix(href, "file:") {
return ""
}
sourceParsed, err := url.Parse(sourceURL)
if err != nil {
return ""
}
targetParsed, err := url.Parse(href)
if err != nil {
return ""
}
// If the scheme is not HTTP/HTTPS, skip it
if targetParsed.Scheme != "" &&
targetParsed.Scheme != "http" &&
targetParsed.Scheme != "https" {
return ""
}
resolvedURL := sourceParsed.ResolveReference(targetParsed).String()
return resolvedURL
}
// checkExternalURL performs a HEAD request to check external URLs
func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) {
// Skip external links if internalOnly is set
if s.internalOnly {
return
}
// Check if URL was already checked
s.mu.Lock()
if result, exists := s.checkedURLs[targetURL]; exists {
// Add the existing result with the current source URL
result.SourceURL = sourceURL
s.mu.Unlock()
s.addResult(results, result)
return
}
s.mu.Unlock()
req, err := http.NewRequest("HEAD", targetURL, nil)
if err != nil {
result := Result{
URL: targetURL,
SourceURL: sourceURL,
Error: fmt.Sprintf("Invalid URL: %v", err),
Type: "external-link",
IsExternal: true,
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
resp, err := s.client.Do(req)
if err != nil {
result := Result{
URL: targetURL,
SourceURL: sourceURL,
Error: err.Error(),
Type: "external-link",
IsExternal: true,
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
defer resp.Body.Close()
var result Result
if resp.StatusCode >= 400 {
result = Result{
URL: targetURL,
SourceURL: sourceURL,
Status: resp.StatusCode,
Error: fmt.Sprintf("HTTP Error: %s", resp.Status),
Type: "external-link",
IsExternal: true,
}
} else {
result = Result{
URL: targetURL,
SourceURL: sourceURL,
Status: resp.StatusCode,
Type: "external-link",
IsExternal: true,
}
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
}
// checkResource checks if a resource URL is accessible
func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) {
// Parse the target URL to determine if it's internal or external
targetParsed, err := url.Parse(targetURL)
if err != nil {
result := Result{
URL: targetURL,
SourceURL: sourceURL,
Error: fmt.Sprintf("Invalid URL: %v", err),
Type: resourceType,
IsExternal: false,
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
isExternal := targetParsed.Hostname() != baseHostname && targetParsed.Hostname() != ""
// Skip external resources if internalOnly is set
if isExternal && s.internalOnly {
return
}
// Check if URL was already checked
s.mu.Lock()
if result, exists := s.checkedURLs[targetURL]; exists {
// Update with current source and type if needed
result.SourceURL = sourceURL
result.Type = resourceType
s.mu.Unlock()
s.addResult(results, result)
return
}
s.mu.Unlock()
req, err := http.NewRequest("HEAD", targetURL, nil)
if err != nil {
result := Result{
URL: targetURL,
SourceURL: sourceURL,
Error: fmt.Sprintf("Invalid URL: %v", err),
Type: resourceType,
IsExternal: isExternal,
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
resp, err := s.client.Do(req)
if err != nil {
result := Result{
URL: targetURL,
SourceURL: sourceURL,
Error: err.Error(),
Type: resourceType,
IsExternal: isExternal,
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
return
}
defer resp.Body.Close()
var result Result
if resp.StatusCode >= 400 {
result = Result{
URL: targetURL,
SourceURL: sourceURL,
Status: resp.StatusCode,
Error: fmt.Sprintf("HTTP Error: %s", resp.Status),
Type: resourceType,
IsExternal: isExternal,
}
} else {
result = Result{
URL: targetURL,
SourceURL: sourceURL,
Status: resp.StatusCode,
Type: resourceType,
IsExternal: isExternal,
}
}
s.mu.Lock()
s.checkedURLs[targetURL] = result
s.mu.Unlock()
s.addResult(results, result)
}
// addResult adds a result to the appropriate list
func (s *Scraper) addResult(results *Results, result Result) {
s.mu.Lock()
defer s.mu.Unlock()
if result.Error != "" {
results.Errors = append(results.Errors, result)
} else {
results.Successes = append(results.Successes, result)
}
}
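
Taken together, the file above exposes a small public surface: construct a Scraper with New and the With* options, call Scan on a base URL, then inspect the returned Results. The following caller is only an illustrative sketch, not part of this commit; the import path is assumed from the module path that appears in the tests below.

package main

import (
	"fmt"
	"log"

	"git.nakama.town/fmartingr/dharma/pkg/scraper"
)

func main() {
	// Crawl up to three levels deep with ten workers, internal links only.
	s := scraper.New(
		scraper.WithConcurrency(10),
		scraper.WithDepth(3),
		scraper.WithTimeout(15),
		scraper.WithInternalOnly(true),
	)

	results, err := s.Scan("https://example.org")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("checked %d URLs, %d broken\n", results.Total, len(results.Errors))
	for _, r := range results.Errors {
		// Each Result carries the failing URL, the page that linked to it,
		// the HTTP status (if any), the resource type and the error text.
		fmt.Printf("%s (from %s): %d %s %s\n", r.URL, r.SourceURL, r.Status, r.Type, r.Error)
	}
}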

163
pkg/scraper/scraper_test.go Normal file

@@ -0,0 +1,163 @@
package scraper
import (
"net/http"
"net/http/httptest"
"net/url"
"testing"
"time"
)
func TestResolveURL(t *testing.T) {
s := &Scraper{}
tests := []struct {
name string
href string
sourceURL string
baseURL string
wantResult string
}{
{
name: "Absolute URL",
href: "https://example.com/page.html",
sourceURL: "https://example.org/index.html",
baseURL: "https://example.org/",
wantResult: "https://example.com/page.html",
},
{
name: "Relative URL",
href: "page.html",
sourceURL: "https://example.org/index.html",
baseURL: "https://example.org/",
wantResult: "https://example.org/page.html",
},
{
name: "Anchor link",
href: "#section",
sourceURL: "https://example.org/index.html",
baseURL: "https://example.org/",
wantResult: "",
},
{
name: "JavaScript link",
href: "javascript:void(0)",
sourceURL: "https://example.org/index.html",
baseURL: "https://example.org/",
wantResult: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := s.resolveURL(tt.href, tt.sourceURL)
if got != tt.wantResult {
t.Errorf("resolveURL() = %v, want %v", got, tt.wantResult)
}
})
}
}
func TestNew(t *testing.T) {
s := New(
WithConcurrency(20),
WithDepth(5),
WithTimeout(30),
WithVerbose(true),
)
if s.concurrency != 20 {
t.Errorf("Expected concurrency to be 20, got %d", s.concurrency)
}
if s.depth != 5 {
t.Errorf("Expected depth to be 5, got %d", s.depth)
}
if s.client.Timeout != 30*time.Second {
t.Errorf("Expected timeout to be 30s, got %v", s.client.Timeout)
}
if !s.verbose {
t.Errorf("Expected verbose to be true")
}
}
func TestAddResult(t *testing.T) {
s := &Scraper{}
results := &Results{}
// Add an error result
errorResult := Result{
URL: "https://example.com/error",
Error: "Test error",
Type: "link",
}
s.addResult(results, errorResult)
if len(results.Errors) != 1 {
t.Errorf("Expected 1 error, got %d", len(results.Errors))
}
// Add a success result
successResult := Result{
URL: "https://example.com/success",
Status: 200,
Type: "link",
}
s.addResult(results, successResult)
if len(results.Successes) != 1 {
t.Errorf("Expected 1 success, got %d", len(results.Successes))
}
}
func TestProcessURL(t *testing.T) {
// Create a test server
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
if _, err := w.Write([]byte(`
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="/style.css">
<script src="/script.js"></script>
</head>
<body>
<a href="/page1.html">Page 1</a>
<a href="https://example.com">External</a>
<img src="/image.jpg">
</body>
</html>
`)); err != nil {
t.Fatalf("Failed to write response: %v", err)
}
}))
defer server.Close()
s := New(WithDepth(1), WithConcurrency(1))
results := &Results{}
// Create a channel for QueueItems instead of strings
queue := make(chan QueueItem, 10)
// Create active count channel
activeCount := make(chan int, 1)
activeCount <- 1 // Start with one active URL
// Parse the server URL to get the hostname
serverURL, _ := url.Parse(server.URL)
baseHostname := serverURL.Hostname()
// Process the URL with the updated signature
s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)
// Check that we found at least one success (the main page)
if len(results.Successes) < 1 {
t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
}
// Check that we queued some URLs for processing
if len(queue) < 1 {
t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
}
}
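
The unit tests above pin down the defaults and the options shipped in this commit. Because every knob on the Scraper is a functional Option (a func(*Scraper)), new behaviour can be added without touching New. Purely to illustrate how that pattern extends — the following option does not exist anywhere in this commit — here is an in-package sketch that would make the redirect limit configurable instead of the hard-coded 10:

package scraper

import (
	"fmt"
	"net/http"
)

// WithMaxRedirects is a hypothetical option, shown only to illustrate the
// functional-options pattern; it is not part of this commit. It replaces the
// default CheckRedirect policy (stop after 10 hops) with a configurable limit.
func WithMaxRedirects(limit int) Option {
	return func(s *Scraper) {
		s.client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
			if len(via) >= limit {
				return fmt.Errorf("too many redirects")
			}
			return nil
		}
	}
}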


@@ -0,0 +1,206 @@
package scraper
import (
"strings"
"testing"
"git.nakama.town/fmartingr/dharma/pkg/testutil"
)
func TestTestsiteIntegration(t *testing.T) {
// Skip if running short tests
if testing.Short() {
t.Skip("Skipping integration test in short mode")
}
// Start the testsite server
serverURL, cleanup, err := testutil.StartTestsiteServer()
if err != nil {
t.Fatalf("Failed to start test server: %v", err)
}
defer cleanup()
// Create a new scraper with default settings
s := New(
WithConcurrency(2),
WithDepth(3),
WithTimeout(5),
WithVerbose(false),
WithInternalOnly(true),
)
// Run the scraper
results, err := s.Scan(serverURL)
if err != nil {
t.Fatalf("Scraper.Scan failed: %v", err)
}
// Verify we have results
if results == nil {
t.Fatal("Expected results but got nil")
}
// Check that we have the correct base URL
if results.BaseURL != serverURL {
t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
}
// Specific URLs to look for, and whether each should resolve successfully
expectedURLs := map[string]bool{
"/found.html": true,
"/not_found.html": false,
"/rel/index.html": true,
"/rel/relfound.html": true,
"/static/style.css": true,
"/static/script.js": true,
}
// Check for expected URLs in the results
for urlPath, shouldExist := range expectedURLs {
fullURL := serverURL + urlPath
found := false
// Look in both successes and errors
for _, result := range results.Successes {
if result.URL == fullURL {
found = true
if !shouldExist {
t.Errorf("URL %s should not exist but was found in successes", urlPath)
}
break
}
}
if !found && shouldExist {
// If not found in successes, check if it's in errors
for _, result := range results.Errors {
if result.URL == fullURL {
found = true
t.Errorf("URL %s should exist but was found in errors", urlPath)
break
}
}
if !found {
t.Errorf("Expected URL %s was not found in results", urlPath)
}
}
}
// Check that not_found.html is in errors
notFoundURL := serverURL + "/not_found.html"
foundInErrors := false
for _, result := range results.Errors {
if result.URL == notFoundURL {
foundInErrors = true
break
}
}
if !foundInErrors {
t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
}
// Verify relative links in the rel directory
relNotFoundURL := serverURL + "/rel/rel_not_found.html"
foundRelNotFound := false
for _, result := range results.Errors {
if result.URL == relNotFoundURL {
foundRelNotFound = true
break
}
}
if !foundRelNotFound {
t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
}
// Check for missing image
missingImageURL := serverURL + "/rel/image-404.jpg"
foundMissingImage := false
for _, result := range results.Errors {
if result.URL == missingImageURL {
foundMissingImage = true
break
}
}
if !foundMissingImage {
t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
}
// Check for external links
externalLinkCount := 0
for _, result := range results.Successes {
if strings.Contains(result.URL, "fmartingr.com") {
externalLinkCount++
}
}
if externalLinkCount != 0 {
t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
}
// Verify total count
expectedTotal := len(results.Successes) + len(results.Errors)
if results.Total != expectedTotal {
t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
}
}
func TestTestsiteWithExternalLinks(t *testing.T) {
// Skip if running short tests
if testing.Short() {
t.Skip("Skipping integration test in short mode")
}
// Start the testsite server
serverURL, cleanup, err := testutil.StartTestsiteServer()
if err != nil {
t.Fatalf("Failed to start test server: %v", err)
}
defer cleanup()
// Create a new scraper with external links allowed
s := New(
WithConcurrency(2),
WithDepth(1), // Lower depth for external links test
WithTimeout(5),
WithVerbose(false),
WithInternalOnly(false), // Allow external links
)
// Run the scraper
results, err := s.Scan(serverURL)
if err != nil {
t.Fatalf("Scraper.Scan failed: %v", err)
}
// Check for external links - we should find at least one external link
foundExternalLinks := false
brokenExternalLinks := false
for _, result := range results.Successes {
if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
foundExternalLinks = true
break
}
}
for _, result := range results.Errors {
if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
brokenExternalLinks = true
break
}
}
// External requests may be blocked in CI or isolated test environments, so
// these checks only log instead of failing the test
if !foundExternalLinks {
t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
}
if !brokenExternalLinks {
t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
}
}
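
Both integration tests depend on testutil.StartTestsiteServer, which lives in pkg/testutil and is not shown in this section. From its usage its shape is (string, func(), error): a server URL, a cleanup function and an error. A minimal sketch of such a helper, assuming it serves a static testsite fixture directory over an httptest server (the directory name and layout are assumptions, not taken from this diff):

package testutil

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
)

// StartTestsiteServer starts a throwaway HTTP server for the testsite fixture
// and returns its URL, a cleanup function, and an error if the fixture is
// missing. This is a sketch; the real helper in pkg/testutil may differ.
func StartTestsiteServer() (string, func(), error) {
	const dir = "testsite" // assumed location of the static fixture site
	if _, err := os.Stat(dir); err != nil {
		return "", nil, fmt.Errorf("testsite fixture not found: %w", err)
	}
	server := httptest.NewServer(http.FileServer(http.Dir(dir)))
	return server.URL, server.Close, nil
}

Because both tests call testing.Short(), running go test -short ./... skips them, keeping the fast unit-test path free of real HTTP traffic.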