commit 0ef15167d5
28 changed files with 2789 additions and 0 deletions
669  pkg/scraper/scraper.go  (new normal file)

@@ -0,0 +1,669 @@
package scraper

import (
	"bufio"
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
)

var (
	urlRegexp    = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
	importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
)

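// The regexes above drive the CSS scanning in parseCSS. For example,
// `background: url("img/bg.png")` yields the capture "img/bg.png" from
// urlRegexp, and `@import "reset.css";` yields "reset.css" from importRegexp.
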
// Result represents a URL check result
type Result struct {
	URL        string `json:"url"`
	SourceURL  string `json:"source_url,omitempty"`
	Status     int    `json:"status"`
	Error      string `json:"error,omitempty"`
	Type       string `json:"type"` // link, image, script, stylesheet, css-import
	IsExternal bool   `json:"is_external"`
}

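// For illustration (the URLs are made up), a failed check marshals to JSON as:
//
//	{"url":"https://example.com/missing","source_url":"https://example.com/","status":404,"error":"HTTP Error: 404 Not Found","type":"link","is_external":false}
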
// Results is a collection of Result
type Results struct {
	BaseURL   string   `json:"base_url"`
	Errors    []Result `json:"errors"`
	Successes []Result `json:"successes"`
	Total     int      `json:"total"`
}

// QueueItem represents a URL to be processed along with its source
type QueueItem struct {
	URL       string
	SourceURL string
	Depth     int
}

// Scraper handles website crawling and link checking
type Scraper struct {
	client       *http.Client
	concurrency  int
	depth        int
	verbose      bool
	internalOnly bool
	visitedURLs  map[string]bool   // URLs visited for crawling
	checkedURLs  map[string]Result // URLs already checked to avoid duplicate requests
	mu           sync.Mutex
}

// Option is a functional option for the Scraper
type Option func(*Scraper)

// WithConcurrency sets the concurrency level
func WithConcurrency(concurrency int) Option {
	return func(s *Scraper) {
		s.concurrency = concurrency
	}
}

// WithDepth sets the maximum crawling depth
func WithDepth(depth int) Option {
	return func(s *Scraper) {
		s.depth = depth
	}
}

// WithTimeout sets the timeout for HTTP requests
func WithTimeout(timeoutSec int) Option {
	return func(s *Scraper) {
		s.client.Timeout = time.Duration(timeoutSec) * time.Second
	}
}

// WithVerbose enables verbose output
func WithVerbose(verbose bool) Option {
	return func(s *Scraper) {
		s.verbose = verbose
	}
}

// WithInternalOnly sets whether to only check internal links
func WithInternalOnly(internalOnly bool) Option {
	return func(s *Scraper) {
		s.internalOnly = internalOnly
	}
}

// New creates a new Scraper with the given options
func New(options ...Option) *Scraper {
	s := &Scraper{
		client: &http.Client{
			Timeout: 10 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		concurrency: 10,
		depth:       3,
		visitedURLs: make(map[string]bool),
		checkedURLs: make(map[string]Result),
	}

	for _, option := range options {
		option(s)
	}

	return s
}

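// Example usage, as a minimal sketch (the option values are illustrative and
// this assumes the package is imported under its package name, scraper):
//
//	s := scraper.New(
//		scraper.WithConcurrency(20),
//		scraper.WithDepth(2),
//		scraper.WithTimeout(15),
//	)
//	results, err := s.Scan("https://example.com")
//	if err != nil {
//		// handle the error (e.g. an unparseable base URL)
//	}
//	fmt.Printf("checked %d URLs, %d broken\n", results.Total, len(results.Errors))
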
// Scan starts the website crawling process
func (s *Scraper) Scan(baseURL string) (*Results, error) {
	parsedURL, err := url.Parse(baseURL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	// Ensure the base URL has a scheme
	if parsedURL.Scheme == "" {
		parsedURL.Scheme = "https"
		baseURL = parsedURL.String()
	}

	// Store the base hostname for distinguishing internal vs external links
	baseHostname := parsedURL.Hostname()

	results := &Results{
		BaseURL: baseURL,
	}

	// Create a waitgroup to track active workers
	var wg sync.WaitGroup

	// Create a channel to communicate URLs to process
	queue := make(chan QueueItem, 1000)

	// Create a channel to track active URL processing
	activeCount := make(chan int, 1)
	activeCount <- 1 // Start with 1 active URL (the base URL)

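	// Accounting note: activeCount carries the number of URLs that are queued
	// or in flight. Each processURL call decrements it by one when it returns
	// and, for an HTML page, first adds one per link it enqueues, so the count
	// drains to zero only once every reachable URL has been handled.
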
	// Start worker pool
	for range s.concurrency {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range queue {
				s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, queue, results, activeCount)
			}
		}()
	}

	// Initial URL to process - the source is itself for the initial URL
	queue <- QueueItem{
		URL:       baseURL,
		SourceURL: baseURL,
		Depth:     0,
	}

	// Monitor active count - when it reaches 0, we're done
	go func() {
		for {
			count := <-activeCount
			if count <= 0 {
				close(queue)
				return
			}
			activeCount <- count
		}
	}()

	// Wait for workers to finish
	wg.Wait()

	results.Total = len(results.Errors) + len(results.Successes)
	return results, nil
}

// processURL processes a single URL
func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) {
	// Decrement active count when done
	defer func() {
		count := <-activeCount
		activeCount <- count - 1
	}()

	// Check if we've already visited this URL (for crawling) or exceeded max depth
	s.mu.Lock()
	if s.visitedURLs[currentURL] || depth > s.depth {
		s.mu.Unlock()
		return
	}
	s.visitedURLs[currentURL] = true

	// If we've already checked this URL's status, reuse the result
	if result, exists := s.checkedURLs[currentURL]; exists {
		// Always use the provided source URL
		result.SourceURL = sourceURL
		s.mu.Unlock()
		s.addResult(results, result)

		// We still need to parse HTML/CSS content even if we've checked the URL before
		// But only if it was successful
		if result.Error == "" && result.Status < 400 {
			// Continue with content parsing...
		} else {
			return
		}
	} else {
		s.mu.Unlock()
	}

	if s.verbose {
		fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL)
	}

	// Parse the current URL
	currentParsed, err := url.Parse(currentURL)
	if err != nil {
		result := Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       "link",
			IsExternal: false,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	// Determine if the URL is internal or external
	isExternal := currentParsed.Hostname() != baseHostname && currentParsed.Hostname() != ""

	// Skip external links processing if internalOnly is set
	if isExternal && s.internalOnly {
		return
	}

	// Process external links differently from internal links
	if isExternal {
		s.checkExternalURL(currentURL, sourceURL, results)
		return
	}

	// Internal URL, check and crawl
	resp, err := s.client.Get(currentURL)
	if err != nil {
		result := Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Error:      err.Error(),
			Type:       "link",
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	defer resp.Body.Close()

	// Add the result
	var result Result
	if resp.StatusCode >= 400 {
		result = Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
			Type:       "link",
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	} else {
		result = Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Type:       "link",
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
	}

	// Only parse HTML and CSS from internal links
	contentType := resp.Header.Get("Content-Type")
	if strings.Contains(contentType, "text/html") {
		// Use the base hostname to create a base URL for this site
		baseURL := ""
		if currentParsed.Scheme != "" && currentParsed.Host != "" {
			baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
		}

		foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results)
		// Add all found URLs to the queue and increment active count
		if len(foundURLs) > 0 {
			count := <-activeCount
			count += len(foundURLs)
			activeCount <- count

			for _, url := range foundURLs {
				queue <- QueueItem{
					URL:       url,
					SourceURL: currentURL, // The source URL is the current page we're processing
					Depth:     depth + 1,
				}
			}
		}
	} else if strings.Contains(contentType, "text/css") {
		// Use the base hostname to create a base URL for this site
		baseURL := ""
		if currentParsed.Scheme != "" && currentParsed.Host != "" {
			baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
		}

		s.parseCSS(currentURL, resp, baseURL, baseHostname, results)
	}
}

// parseHTML extracts links and other resources from HTML
func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string {
	foundURLs := []string{}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		s.addResult(results, Result{
			URL:        sourceURL,
			SourceURL:  sourceURL, // Use self as source for error
			Error:      fmt.Sprintf("Failed to parse HTML: %v", err),
			Status:     resp.StatusCode,
			Type:       "html",
			IsExternal: false,
		})
		return foundURLs
	}

	// Process links (a href)
	doc.Find("a").Each(func(i int, sel *goquery.Selection) {
		if href, exists := sel.Attr("href"); exists {
			targetURL := s.resolveURL(href, sourceURL)
			if targetURL != "" {
				linkParsed, err := url.Parse(targetURL)
				if err == nil {
					isExternal := linkParsed.Hostname() != baseHostname && linkParsed.Hostname() != ""

					// Only add internal links to the crawl queue
					if !isExternal {
						foundURLs = append(foundURLs, targetURL)
					} else if !s.internalOnly {
						// Check external links only if internalOnly is false
						s.checkExternalURL(targetURL, sourceURL, results)
					}
				}
			}
		}
	})

	// Process images
	doc.Find("img").Each(func(i int, sel *goquery.Selection) {
		if src, exists := sel.Attr("src"); exists {
			targetURL := s.resolveURL(src, sourceURL)
			if targetURL != "" {
				s.checkResource(targetURL, sourceURL, "image", baseHostname, results)
			}
		}
	})

	// Process stylesheets
	doc.Find("link[rel='stylesheet']").Each(func(i int, sel *goquery.Selection) {
		if href, exists := sel.Attr("href"); exists {
			targetURL := s.resolveURL(href, sourceURL)
			if targetURL != "" {
				s.checkResource(targetURL, sourceURL, "stylesheet", baseHostname, results)
			}
		}
	})

	// Process scripts
	doc.Find("script").Each(func(i int, sel *goquery.Selection) {
		if src, exists := sel.Attr("src"); exists {
			targetURL := s.resolveURL(src, sourceURL)
			if targetURL != "" {
				s.checkResource(targetURL, sourceURL, "script", baseHostname, results)
			}
		}
	})

	return foundURLs
}

// parseCSS extracts URLs from CSS content
func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) {
	// Simple regex-based parsing for CSS imports and url() references.
	// This is a simplified approach; a proper CSS parser would be better
	// for production use.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()

		// Look for url() references
		urlMatches := urlRegexp.FindAllStringSubmatch(line, -1)
		for _, match := range urlMatches {
			if len(match) > 1 {
				// Remove quotes if present
				urlValue := strings.Trim(match[1], "'\"")
				targetURL := s.resolveURL(urlValue, sourceURL)
				if targetURL != "" {
					s.checkResource(targetURL, sourceURL, "css-url", baseHostname, results)
				}
			}
		}

		// Look for @import statements
		importMatches := importRegexp.FindAllStringSubmatch(line, -1)
		for _, match := range importMatches {
			if len(match) > 1 {
				// Remove quotes if present
				urlValue := strings.Trim(match[1], "'\"")
				targetURL := s.resolveURL(urlValue, sourceURL)
				if targetURL != "" {
					s.checkResource(targetURL, sourceURL, "css-import", baseHostname, results)
				}
			}
		}
	}
}

// resolveURL resolves a relative URL against a base URL
func (s *Scraper) resolveURL(href, sourceURL string) string {
	// Skip empty URLs, anchors, and javascript
	if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
		return ""
	}

	// Skip non-HTTP protocols like mailto:, tel:, etc.
	if strings.HasPrefix(href, "mailto:") ||
		strings.HasPrefix(href, "tel:") ||
		strings.HasPrefix(href, "sms:") ||
		strings.HasPrefix(href, "ftp:") ||
		strings.HasPrefix(href, "file:") {
		return ""
	}

	sourceParsed, err := url.Parse(sourceURL)
	if err != nil {
		return ""
	}

	targetParsed, err := url.Parse(href)
	if err != nil {
		return ""
	}

	// If the scheme is not HTTP/HTTPS, skip it
	if targetParsed.Scheme != "" &&
		targetParsed.Scheme != "http" &&
		targetParsed.Scheme != "https" {
		return ""
	}

	resolvedURL := sourceParsed.ResolveReference(targetParsed).String()
	return resolvedURL
}

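// For illustration, resolveURL("/about", "https://example.com/blog/post")
// returns "https://example.com/about", and resolveURL("../img.png",
// "https://example.com/blog/post") returns "https://example.com/img.png",
// following net/url's ResolveReference semantics.
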
// checkExternalURL performs a HEAD request to check external URLs
func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) {
	// Skip external links if internalOnly is set
	if s.internalOnly {
		return
	}

	// Check if URL was already checked
	s.mu.Lock()
	if result, exists := s.checkedURLs[targetURL]; exists {
		// Add the existing result with the current source URL
		result.SourceURL = sourceURL
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	s.mu.Unlock()

	req, err := http.NewRequest("HEAD", targetURL, nil)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       "external-link",
			IsExternal: true,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	resp, err := s.client.Do(req)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      err.Error(),
			Type:       "external-link",
			IsExternal: true,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	defer resp.Body.Close()

	var result Result
	if resp.StatusCode >= 400 {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
			Type:       "external-link",
			IsExternal: true,
		}
	} else {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Type:       "external-link",
			IsExternal: true,
		}
	}

	s.mu.Lock()
	s.checkedURLs[targetURL] = result
	s.mu.Unlock()
	s.addResult(results, result)
}

// checkResource checks if a resource URL is accessible
func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) {
	// Parse the target URL to determine if it's internal or external
	targetParsed, err := url.Parse(targetURL)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       resourceType,
			IsExternal: false,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	isExternal := targetParsed.Hostname() != baseHostname && targetParsed.Hostname() != ""

	// Skip external resources if internalOnly is set
	if isExternal && s.internalOnly {
		return
	}

	// Check if URL was already checked
	s.mu.Lock()
	if result, exists := s.checkedURLs[targetURL]; exists {
		// Update with current source and type if needed
		result.SourceURL = sourceURL
		result.Type = resourceType
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	s.mu.Unlock()

	req, err := http.NewRequest("HEAD", targetURL, nil)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       resourceType,
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	resp, err := s.client.Do(req)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      err.Error(),
			Type:       resourceType,
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	defer resp.Body.Close()

	var result Result
	if resp.StatusCode >= 400 {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
			Type:       resourceType,
			IsExternal: isExternal,
		}
	} else {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Type:       resourceType,
			IsExternal: isExternal,
		}
	}

	s.mu.Lock()
	s.checkedURLs[targetURL] = result
	s.mu.Unlock()
	s.addResult(results, result)
}

// addResult adds a result to the appropriate list
func (s *Scraper) addResult(results *Results, result Result) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if result.Error != "" {
		results.Errors = append(results.Errors, result)
	} else {
		results.Successes = append(results.Successes, result)
	}
}