// dharma/pkg/scraper/scraper.go

package scraper

import (
	"bufio"
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// Patterns used by parseCSS to pull references out of stylesheet text.
//
// urlRegexp matches a CSS url(...) reference whose value may be wrapped in
// single or double quotes; capture group 1 is the value between the
// parentheses without the quotes.
//
// importRegexp matches an @import statement whose target is given as a quoted
// string; capture group 1 is the quoted target.
var (
	urlRegexp    = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
	importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
)

// Result describes the outcome of checking a single URL.
//
// URL is the address that was checked and SourceURL is the page or stylesheet
// on which it was found. Status is the HTTP status code of the response; it
// stays 0 when no response was received (for example an unparsable URL or a
// transport error). Error is empty for a successful check and holds a
// human-readable message otherwise; addResult uses it to decide whether the
// Result is filed under Results.Errors or Results.Successes. IsExternal
// reports whether the URL's hostname differs from the hostname being scanned.
type Result struct {
	URL        string `json:"url"`
	SourceURL  string `json:"source_url,omitempty"`
	Status     int    `json:"status"`
	Error      string `json:"error,omitempty"`
	Type       string `json:"type"` // values set in this file: link, external-link, image, script, stylesheet, css-url, css-import, html
	IsExternal bool   `json:"is_external"`
}

// Results aggregates the outcome of one Scan.
//
// BaseURL is the URL the scan started from, including the "https" scheme that
// Scan fills in when the caller omitted one. Errors holds every Result whose
// Error field is non-empty and Successes holds all others. Total is
// len(Errors) + len(Successes); Scan sets it after all workers have finished.
type Results struct {
	BaseURL   string   `json:"base_url"`
	Errors    []Result `json:"errors"`
	Successes []Result `json:"successes"`
	Total     int      `json:"total"`
}

// QueueItem is one unit of crawl work handed to the worker pool.
//
// URL is the address to process and SourceURL is the page that linked to it
// (Scan uses the base URL as its own source for the first item). Depth is the
// link distance from the base URL: 0 for the base URL itself and the parent's
// depth plus one for every link discovered on a page.
type QueueItem struct {
	URL       string
	SourceURL string
	Depth     int
}

// Scraper handles website crawling and link checking.
//
// client performs every HTTP request. concurrency is the number of worker
// goroutines Scan starts and depth is the largest QueueItem.Depth that is
// still processed. verbose enables a progress line per processed URL and
// internalOnly suppresses all checks of URLs on other hosts.
//
// mu guards visitedURLs and checkedURLs; addResult also holds it while
// appending to a Results value. Both maps live on the Scraper rather than on
// a single Scan call, so their contents carry over if Scan is called again on
// the same Scraper.
type Scraper struct {
	client       *http.Client
	concurrency  int
	depth        int
	verbose      bool
	internalOnly bool
	visitedURLs  map[string]bool   // URLs visited for crawling
	checkedURLs  map[string]Result // URLs already checked to avoid duplicate requests
	mu           sync.Mutex
}

// Option configures a Scraper. New applies the given options, in order, to a
// Scraper that has already been populated with its default settings.
type Option func(*Scraper)

// WithConcurrency sets the number of worker goroutines used by Scan.
//
// Values below 1 are raised to 1: Scan starts exactly this many workers, so a
// zero or negative value would leave the queue without any consumer and the
// scan would return an empty report without checking anything.
func WithConcurrency(concurrency int) Option {
	return func(s *Scraper) {
		if concurrency < 1 {
			concurrency = 1
		}
		s.concurrency = concurrency
	}
}

// WithDepth returns an Option that stores depth as the Scraper's maximum
// crawl depth. The base URL has depth 0; queued URLs deeper than this limit
// are dropped without being checked.
func WithDepth(depth int) Option {
	apply := func(sc *Scraper) {
		sc.depth = depth
	}
	return apply
}

// WithTimeout returns an Option that sets the HTTP client's overall request
// timeout to timeoutSec seconds.
func WithTimeout(timeoutSec int) Option {
	apply := func(sc *Scraper) {
		seconds := time.Duration(timeoutSec)
		sc.client.Timeout = seconds * time.Second
	}
	return apply
}

// WithVerbose returns an Option that switches per-URL progress output on or
// off.
func WithVerbose(verbose bool) Option {
	apply := func(sc *Scraper) {
		sc.verbose = verbose
	}
	return apply
}

// WithInternalOnly returns an Option that controls whether URLs on hosts
// other than the scanned one are skipped (true) or checked (false).
func WithInternalOnly(internalOnly bool) Option {
	apply := func(sc *Scraper) {
		sc.internalOnly = internalOnly
	}
	return apply
}

// New builds a Scraper with its defaults (10 second request timeout, at most
// 10 redirects per request, 10 workers, crawl depth 3) and then applies the
// supplied options in the order given, so later options win over earlier ones.
func New(options ...Option) *Scraper {
	// Abort a request once it has already followed ten redirects.
	limitRedirects := func(_ *http.Request, via []*http.Request) error {
		if len(via) < 10 {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	scraper := new(Scraper)
	scraper.client = &http.Client{
		Timeout:       10 * time.Second,
		CheckRedirect: limitRedirects,
	}
	scraper.concurrency = 10
	scraper.depth = 3
	scraper.visitedURLs = map[string]bool{}
	scraper.checkedURLs = map[string]Result{}

	for i := range options {
		options[i](scraper)
	}
	return scraper
}

// Scan crawls the site rooted at baseURL and returns the status of every link
// and resource that was checked.
//
// A baseURL without a scheme is treated as HTTPS. The hostname of the
// (completed) base URL decides which URLs count as internal. Scan blocks until
// every queued URL has been processed and only returns an error when baseURL
// itself cannot be parsed; failures of individual URLs are reported in
// Results.Errors.
func (s *Scraper) Scan(baseURL string) (*Results, error) {
	parsedURL, err := url.Parse(baseURL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	// Ensure the base URL has a scheme.
	if parsedURL.Scheme == "" {
		parsedURL.Scheme = "https"
		baseURL = parsedURL.String()

		// Parsing "example.com" without a scheme stores the host in Path, so
		// Hostname() would be empty and every page of the site would be
		// classified as external. Re-parse the completed URL to populate Host.
		parsedURL, err = url.Parse(baseURL)
		if err != nil {
			return nil, fmt.Errorf("invalid URL: %w", err)
		}
	}

	// The base hostname distinguishes internal from external links.
	baseHostname := parsedURL.Hostname()

	results := &Results{
		BaseURL: baseURL,
	}

	// Without at least one worker nothing would ever drain the queue.
	workers := s.concurrency
	if workers < 1 {
		workers = 1
	}

	// Workers both consume items and produce new ones. A bounded channel
	// between them deadlocks as soon as every worker is blocked on a send, so
	// submissions go through a dispatcher that buffers them in a slice of
	// unlimited length and hands them out on the work channel.
	submit := make(chan QueueItem)
	work := make(chan QueueItem)
	go func() {
		defer close(work)
		var pending []QueueItem
		for {
			if len(pending) == 0 {
				item, ok := <-submit
				if !ok {
					return
				}
				pending = append(pending, item)
				continue
			}
			select {
			case item, ok := <-submit:
				if !ok {
					return
				}
				pending = append(pending, item)
			case work <- pending[0]:
				pending = pending[1:]
			}
		}
	}()

	// activeCount is a one-slot channel holding the number of URLs that are
	// queued or currently being processed. processURL adds the links it is
	// about to enqueue and subtracts one when it returns.
	activeCount := make(chan int, 1)
	activeCount <- 1 // the base URL

	var (
		wg          sync.WaitGroup
		closeSubmit sync.Once
	)
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range work {
				s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, submit, results, activeCount)

				// The counter only reaches zero when nothing is queued and no
				// processURL call is running, so nobody can still be sending:
				// closing submit stops the dispatcher, which closes work and
				// lets every worker leave its loop.
				remaining := <-activeCount
				activeCount <- remaining
				if remaining <= 0 {
					closeSubmit.Do(func() { close(submit) })
				}
			}
		}()
	}

	// Initial URL to process - the source is itself for the initial URL.
	submit <- QueueItem{
		URL:       baseURL,
		SourceURL: baseURL,
		Depth:     0,
	}

	wg.Wait()

	results.Total = len(results.Errors) + len(results.Successes)
	return results, nil
}

// processURL checks one queued URL and, when it turns out to be an internal
// HTML page, enqueues the internal links found on it.
//
// currentURL is the URL to check and sourceURL the page that referenced it.
// baseHostname is the hostname of the site being scanned and depth the link
// distance of currentURL from the base URL. New work is sent on queue and
// outcomes are appended to results.
//
// activeCount is a one-slot channel holding the number of URLs that are
// queued or in flight: the value is taken out, adjusted and put back. Every
// call subtracts one on return, and adds the number of links it is about to
// enqueue before sending them.
//
// URLs that were already visited or exceed the configured depth are dropped
// silently. External URLs are only status-checked (or skipped entirely when
// internalOnly is set); internal URLs are fetched with GET, and HTML or CSS
// responses are parsed for further references.
func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) {
	// Decrement active count when done.
	defer func() {
		count := <-activeCount
		activeCount <- count - 1
	}()

	// Claim the URL for crawling and look up any earlier status check under a
	// single lock acquisition so no other worker can claim it in between.
	s.mu.Lock()
	if s.visitedURLs[currentURL] || depth > s.depth {
		s.mu.Unlock()
		return
	}
	s.visitedURLs[currentURL] = true
	cached, alreadyChecked := s.checkedURLs[currentURL]
	s.mu.Unlock()

	// A URL that already failed a check (for example as an image reference)
	// is reported again for this source without another request. A cached
	// success is deliberately not reported here: the URL still has to be
	// fetched for parsing, and that fetch records the result exactly once.
	if alreadyChecked && (cached.Error != "" || cached.Status >= 400) {
		cached.SourceURL = sourceURL
		s.addResult(results, cached)
		return
	}

	if s.verbose {
		fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL)
	}

	// record caches the outcome for later lookups and adds it to the report.
	record := func(r Result) {
		s.mu.Lock()
		s.checkedURLs[currentURL] = r
		s.mu.Unlock()
		s.addResult(results, r)
	}

	result := Result{
		URL:       currentURL,
		SourceURL: sourceURL,
		Type:      "link",
	}

	currentParsed, err := url.Parse(currentURL)
	if err != nil {
		result.Error = fmt.Sprintf("Invalid URL: %v", err)
		record(result)
		return
	}

	// A URL without a hostname is relative and therefore internal.
	host := currentParsed.Hostname()
	if host != "" && host != baseHostname {
		// External URLs are never crawled, only status-checked.
		if !s.internalOnly {
			s.checkExternalURL(currentURL, sourceURL, results)
		}
		return
	}

	// Internal URL: fetch it so the body can be parsed.
	resp, err := s.client.Get(currentURL)
	if err != nil {
		result.Error = err.Error()
		record(result)
		return
	}
	defer resp.Body.Close()

	result.Status = resp.StatusCode
	if resp.StatusCode >= 400 {
		result.Error = fmt.Sprintf("HTTP Error: %s", resp.Status)
	}
	record(result)
	if result.Error != "" {
		return
	}

	// Scheme and host of the fetched page, passed on to the parsers.
	baseURL := ""
	if currentParsed.Scheme != "" && currentParsed.Host != "" {
		baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
	}

	contentType := resp.Header.Get("Content-Type")
	switch {
	case strings.Contains(contentType, "text/html"):
		foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results)

		// Links one level beyond the depth limit would be dropped as soon as
		// they are dequeued, so do not enqueue them in the first place.
		if len(foundURLs) == 0 || depth+1 > s.depth {
			return
		}

		// Account for the new items before they become visible to workers so
		// the counter cannot reach zero while they are still pending.
		count := <-activeCount
		activeCount <- count + len(foundURLs)

		for _, found := range foundURLs {
			queue <- QueueItem{
				URL:       found,
				SourceURL: currentURL, // the page being processed is the source
				Depth:     depth + 1,
			}
		}
	case strings.Contains(contentType, "text/css"):
		s.parseCSS(currentURL, resp, baseURL, baseHostname, results)
	}
}

// parseHTML reads the HTML document in resp and checks everything it
// references on behalf of sourceURL.
//
// Internal anchor targets are returned so the caller can queue them for
// crawling. External anchor targets are status-checked immediately unless
// internalOnly is set. Images, stylesheets and scripts are checked
// immediately as resources. If the document cannot be parsed, an error
// Result of type "html" is recorded and an empty slice is returned.
func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string {
	internalLinks := make([]string, 0)

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		parseFailure := Result{
			URL:        sourceURL,
			SourceURL:  sourceURL, // the page is its own source for this error
			Error:      fmt.Sprintf("Failed to parse HTML: %v", err),
			Status:     resp.StatusCode,
			Type:       "html",
			IsExternal: false,
		}
		s.addResult(results, parseFailure)
		return internalLinks
	}

	// Anchors: internal targets are collected, external ones checked in place.
	doc.Find("a").Each(func(_ int, anchor *goquery.Selection) {
		href, ok := anchor.Attr("href")
		if !ok {
			return
		}
		target := s.resolveURL(href, sourceURL)
		if target == "" {
			return
		}
		parsed, parseErr := url.Parse(target)
		if parseErr != nil {
			return
		}

		host := parsed.Hostname()
		switch {
		case host == "" || host == baseHostname:
			internalLinks = append(internalLinks, target)
		case !s.internalOnly:
			s.checkExternalURL(target, sourceURL, results)
		}
	})

	// Sub-resources, processed in this order: images, stylesheets, scripts.
	resources := []struct {
		selector string
		attr     string
		kind     string
	}{
		{"img", "src", "image"},
		{"link[rel='stylesheet']", "href", "stylesheet"},
		{"script", "src", "script"},
	}
	for _, res := range resources {
		doc.Find(res.selector).Each(func(_ int, node *goquery.Selection) {
			ref, ok := node.Attr(res.attr)
			if !ok {
				return
			}
			if target := s.resolveURL(ref, sourceURL); target != "" {
				s.checkResource(target, sourceURL, res.kind, baseHostname, results)
			}
		})
	}

	return internalLinks
}

// parseCSS scans the stylesheet in resp line by line and checks every
// url(...) reference (type "css-url") and quoted @import target (type
// "css-import") it finds, resolving each against sourceURL.
//
// This is regex-based and therefore approximate: references split across
// lines are not recognised. If the body cannot be read completely, an error
// Result of type "css" is recorded for sourceURL, mirroring what parseHTML
// does for unparsable documents.
func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) {
	// Minified stylesheets are often a single line far longer than the
	// scanner's default 64 KiB token limit; with the default the scan would
	// stop at that line and every reference in the file would be missed.
	const maxLineBytes = 16 << 20

	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	// check resolves and verifies the first capture group of every match.
	check := func(matches [][]string, kind string) {
		for _, match := range matches {
			if len(match) < 2 {
				continue
			}
			// Drop stray quotes and the whitespace CSS allows inside url( ).
			ref := strings.Trim(match[1], "'\" \t")
			if target := s.resolveURL(ref, sourceURL); target != "" {
				s.checkResource(target, sourceURL, kind, baseHostname, results)
			}
		}
	}

	for scanner.Scan() {
		line := scanner.Text()
		check(urlRegexp.FindAllStringSubmatch(line, -1), "css-url")
		check(importRegexp.FindAllStringSubmatch(line, -1), "css-import")
	}

	// Report a truncated scan instead of silently returning partial results.
	if err := scanner.Err(); err != nil {
		s.addResult(results, Result{
			URL:        sourceURL,
			SourceURL:  sourceURL, // the stylesheet is its own source for this error
			Error:      fmt.Sprintf("Failed to parse CSS: %v", err),
			Status:     resp.StatusCode,
			Type:       "css",
			IsExternal: false,
		})
	}
}

// resolveURL turns a reference found in a document into an absolute URL.
//
// href is the raw attribute or CSS value and sourceURL the address of the
// document it appeared in. Surrounding whitespace is ignored. The result has
// its fragment removed, because the fragment is never sent to the server and
// would otherwise make "page#a" and "page#b" look like different URLs.
//
// The empty string is returned when there is nothing to check: an empty or
// fragment-only reference, a reference with a scheme other than http or https
// (javascript:, mailto:, tel:, data:, ...), or a href or sourceURL that cannot
// be parsed.
func (s *Scraper) resolveURL(href, sourceURL string) string {
	href = strings.TrimSpace(href)
	if href == "" || strings.HasPrefix(href, "#") {
		return ""
	}

	sourceParsed, err := url.Parse(sourceURL)
	if err != nil {
		return ""
	}

	targetParsed, err := url.Parse(href)
	if err != nil {
		return ""
	}

	// url.Parse lower-cases the scheme, so this one check covers every
	// non-HTTP scheme regardless of how it was capitalised in the document.
	switch targetParsed.Scheme {
	case "", "http", "https":
	default:
		return ""
	}

	resolved := sourceParsed.ResolveReference(targetParsed)
	resolved.Fragment = ""
	resolved.RawFragment = ""
	return resolved.String()
}

// checkExternalURL records whether the external link targetURL, found on
// sourceURL, is reachable. It does nothing when internalOnly is set.
//
// Each URL is requested at most once per cache entry: a previously stored
// outcome is reported again with the current source and the type
// "external-link". Otherwise the URL is probed with HEAD; servers that answer
// 405 Method Not Allowed or 501 Not Implemented are retried with GET so that
// a missing HEAD implementation is not reported as a broken link. A transport
// error or a final status of 400 or above yields an error Result.
func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) {
	if s.internalOnly {
		return
	}

	s.mu.Lock()
	cached, found := s.checkedURLs[targetURL]
	s.mu.Unlock()
	if found {
		// The cached entry may stem from a resource check (for example an
		// image); report it under the source and type of this reference.
		cached.SourceURL = sourceURL
		cached.Type = "external-link"
		s.addResult(results, cached)
		return
	}

	// probe issues one request and returns the response status, or a
	// non-empty failure message when no response was obtained.
	probe := func(method string) (code int, status, failure string) {
		req, err := http.NewRequest(method, targetURL, nil)
		if err != nil {
			return 0, "", fmt.Sprintf("Invalid URL: %v", err)
		}
		resp, err := s.client.Do(req)
		if err != nil {
			return 0, "", err.Error()
		}
		defer resp.Body.Close()
		return resp.StatusCode, resp.Status, ""
	}

	code, status, failure := probe(http.MethodHead)
	if failure == "" && (code == http.StatusMethodNotAllowed || code == http.StatusNotImplemented) {
		code, status, failure = probe(http.MethodGet)
	}

	result := Result{
		URL:        targetURL,
		SourceURL:  sourceURL,
		Status:     code,
		Type:       "external-link",
		IsExternal: true,
	}
	switch {
	case failure != "":
		result.Error = failure
	case code >= 400:
		result.Error = fmt.Sprintf("HTTP Error: %s", status)
	}

	s.mu.Lock()
	s.checkedURLs[targetURL] = result
	s.mu.Unlock()
	s.addResult(results, result)
}

// checkResource records whether the resource targetURL (an image, script,
// stylesheet or CSS reference of kind resourceType), found on sourceURL, is
// reachable.
//
// The resource counts as external when its hostname is non-empty and differs
// from baseHostname; external resources are skipped when internalOnly is set.
// A previously stored outcome for the same URL is reported again with the
// current source and type instead of issuing another request. Otherwise the
// URL is probed with HEAD; servers that answer 405 Method Not Allowed or 501
// Not Implemented are retried with GET so that a missing HEAD implementation
// is not reported as a broken resource. A transport error or a final status
// of 400 or above yields an error Result.
func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) {
	// store caches the outcome for later lookups and adds it to the report.
	store := func(r Result) {
		s.mu.Lock()
		s.checkedURLs[targetURL] = r
		s.mu.Unlock()
		s.addResult(results, r)
	}

	result := Result{
		URL:       targetURL,
		SourceURL: sourceURL,
		Type:      resourceType,
	}

	targetParsed, err := url.Parse(targetURL)
	if err != nil {
		result.Error = fmt.Sprintf("Invalid URL: %v", err)
		store(result)
		return
	}

	host := targetParsed.Hostname()
	result.IsExternal = host != "" && host != baseHostname
	if result.IsExternal && s.internalOnly {
		return
	}

	s.mu.Lock()
	cached, found := s.checkedURLs[targetURL]
	s.mu.Unlock()
	if found {
		// Report the known outcome under this reference's source and type.
		cached.SourceURL = sourceURL
		cached.Type = resourceType
		s.addResult(results, cached)
		return
	}

	// probe issues one request and returns the response status, or a
	// non-empty failure message when no response was obtained.
	probe := func(method string) (code int, status, failure string) {
		req, err := http.NewRequest(method, targetURL, nil)
		if err != nil {
			return 0, "", fmt.Sprintf("Invalid URL: %v", err)
		}
		resp, err := s.client.Do(req)
		if err != nil {
			return 0, "", err.Error()
		}
		defer resp.Body.Close()
		return resp.StatusCode, resp.Status, ""
	}

	code, status, failure := probe(http.MethodHead)
	if failure == "" && (code == http.StatusMethodNotAllowed || code == http.StatusNotImplemented) {
		code, status, failure = probe(http.MethodGet)
	}

	result.Status = code
	switch {
	case failure != "":
		result.Error = failure
	case code >= 400:
		result.Error = fmt.Sprintf("HTTP Error: %s", status)
	}
	store(result)
}

// addResult files result under results.Errors when its Error field is set and
// under results.Successes otherwise. The append happens while holding s.mu so
// that concurrent workers can report into the same Results value.
func (s *Scraper) addResult(results *Results, result Result) {
	s.mu.Lock()
	defer s.mu.Unlock()

	bucket := &results.Successes
	if result.Error != "" {
		bucket = &results.Errors
	}
	*bucket = append(*bucket, result)
}