package scraper import ( "bufio" "fmt" "net/http" "net/url" "regexp" "strings" "sync" "time" "github.com/PuerkitoBio/goquery" ) var ( urlRegexp = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`) importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`) ) // Result represents a URL check result type Result struct { URL string `json:"url"` SourceURL string `json:"source_url,omitempty"` Status int `json:"status"` Error string `json:"error,omitempty"` Type string `json:"type"` // link, image, script, stylesheet, css-import IsExternal bool `json:"is_external"` } // Results is a collection of Result type Results struct { BaseURL string `json:"base_url"` Errors []Result `json:"errors"` Successes []Result `json:"successes"` Total int `json:"total"` } // QueueItem represents a URL to be processed along with its source type QueueItem struct { URL string SourceURL string Depth int } // Scraper handles website crawling and link checking type Scraper struct { client *http.Client concurrency int depth int verbose bool internalOnly bool visitedURLs map[string]bool // URLs visited for crawling checkedURLs map[string]Result // URLs already checked to avoid duplicate requests mu sync.Mutex } // Option is a function option for the Scraper type Option func(*Scraper) // WithConcurrency sets the concurrency level func WithConcurrency(concurrency int) Option { return func(s *Scraper) { s.concurrency = concurrency } } // WithDepth sets the maximum crawling depth func WithDepth(depth int) Option { return func(s *Scraper) { s.depth = depth } } // WithTimeout sets the timeout for HTTP requests func WithTimeout(timeoutSec int) Option { return func(s *Scraper) { s.client.Timeout = time.Duration(timeoutSec) * time.Second } } // WithVerbose enables verbose output func WithVerbose(verbose bool) Option { return func(s *Scraper) { s.verbose = verbose } } // WithInternalOnly sets whether to only check internal links func WithInternalOnly(internalOnly bool) Option { return func(s *Scraper) { s.internalOnly = internalOnly } } // New creates a new Scraper with the given options func New(options ...Option) *Scraper { s := &Scraper{ client: &http.Client{ Timeout: 10 * time.Second, CheckRedirect: func(req *http.Request, via []*http.Request) error { if len(via) >= 10 { return fmt.Errorf("too many redirects") } return nil }, }, concurrency: 10, depth: 3, visitedURLs: make(map[string]bool), checkedURLs: make(map[string]Result), } for _, option := range options { option(s) } return s } // Scan starts the website crawling process func (s *Scraper) Scan(baseURL string) (*Results, error) { parsedURL, err := url.Parse(baseURL) if err != nil { return nil, fmt.Errorf("invalid URL: %w", err) } // Ensure the base URL has a scheme if parsedURL.Scheme == "" { parsedURL.Scheme = "https" baseURL = parsedURL.String() } // Store the base hostname for distinguishing internal vs external links baseHostname := parsedURL.Hostname() results := &Results{ BaseURL: baseURL, } // Create a waitgroup to track active workers var wg sync.WaitGroup // Create a channel to communicate URLs to process queue := make(chan QueueItem, 1000) // Create a channel to track active URL processing activeCount := make(chan int, 1) activeCount <- 1 // Start with 1 active URL (the base URL) // Start worker pool for range s.concurrency { wg.Add(1) go func() { defer wg.Done() for item := range queue { s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, queue, results, activeCount) } }() } // Initial URL to process - the source is itself for the initial URL queue <- QueueItem{ URL: baseURL, SourceURL: baseURL, Depth: 0, } // Monitor active count - when it reaches 0, we're done go func() { for { count := <-activeCount if count <= 0 { close(queue) return } activeCount <- count } }() // Wait for workers to finish wg.Wait() results.Total = len(results.Errors) + len(results.Successes) return results, nil } // processURL processes a single URL func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) { // Decrement active count when done defer func() { count := <-activeCount activeCount <- count - 1 }() // Check if we've already visited this URL (for crawling) or exceeded max depth s.mu.Lock() if s.visitedURLs[currentURL] || depth > s.depth { s.mu.Unlock() return } s.visitedURLs[currentURL] = true // If we've already checked this URL's status, reuse the result if result, exists := s.checkedURLs[currentURL]; exists { // Always use the provided source URL result.SourceURL = sourceURL s.mu.Unlock() s.addResult(results, result) // We still need to parse HTML/CSS content even if we've checked the URL before // But only if it was successful if result.Error == "" && result.Status < 400 { // Continue with content parsing... } else { return } } else { s.mu.Unlock() } if s.verbose { fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL) } // Parse the current URL currentParsed, err := url.Parse(currentURL) if err != nil { result := Result{ URL: currentURL, SourceURL: sourceURL, Error: fmt.Sprintf("Invalid URL: %v", err), Type: "link", IsExternal: false, } s.mu.Lock() s.checkedURLs[currentURL] = result s.mu.Unlock() s.addResult(results, result) return } // Determine if the URL is internal or external isExternal := currentParsed.Hostname() != baseHostname && currentParsed.Hostname() != "" // Skip external links processing if internalOnly is set if isExternal && s.internalOnly { return } // Process external links differently from internal links if isExternal { s.checkExternalURL(currentURL, sourceURL, results) return } // Internal URL, check and crawl resp, err := s.client.Get(currentURL) if err != nil { result := Result{ URL: currentURL, SourceURL: sourceURL, Error: err.Error(), Type: "link", IsExternal: isExternal, } s.mu.Lock() s.checkedURLs[currentURL] = result s.mu.Unlock() s.addResult(results, result) return } defer resp.Body.Close() // Add the result var result Result if resp.StatusCode >= 400 { result = Result{ URL: currentURL, SourceURL: sourceURL, Status: resp.StatusCode, Error: fmt.Sprintf("HTTP Error: %s", resp.Status), Type: "link", IsExternal: isExternal, } s.mu.Lock() s.checkedURLs[currentURL] = result s.mu.Unlock() s.addResult(results, result) return } else { result = Result{ URL: currentURL, SourceURL: sourceURL, Status: resp.StatusCode, Type: "link", IsExternal: isExternal, } s.mu.Lock() s.checkedURLs[currentURL] = result s.mu.Unlock() s.addResult(results, result) } // Only parse HTML and CSS from internal links contentType := resp.Header.Get("Content-Type") if strings.Contains(contentType, "text/html") { // Use the base hostname to create a base URL for this site baseURL := "" if currentParsed.Scheme != "" && currentParsed.Host != "" { baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host) } foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results) // Add all found URLs to the queue and increment active count if len(foundURLs) > 0 { count := <-activeCount count += len(foundURLs) activeCount <- count for _, url := range foundURLs { queue <- QueueItem{ URL: url, SourceURL: currentURL, // The source URL is the current page we're processing Depth: depth + 1, } } } } else if strings.Contains(contentType, "text/css") { // Use the base hostname to create a base URL for this site baseURL := "" if currentParsed.Scheme != "" && currentParsed.Host != "" { baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host) } s.parseCSS(currentURL, resp, baseURL, baseHostname, results) } } // parseHTML extracts links and other resources from HTML func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string { foundURLs := []string{} doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { s.addResult(results, Result{ URL: sourceURL, SourceURL: sourceURL, // Use self as source for error Error: fmt.Sprintf("Failed to parse HTML: %v", err), Status: resp.StatusCode, Type: "html", IsExternal: false, }) return foundURLs } // Process links (a href) doc.Find("a").Each(func(i int, sel *goquery.Selection) { if href, exists := sel.Attr("href"); exists { targetURL := s.resolveURL(href, sourceURL) if targetURL != "" { linkParsed, err := url.Parse(targetURL) if err == nil { isExternal := linkParsed.Hostname() != baseHostname && linkParsed.Hostname() != "" // Only add internal links to the crawl queue if !isExternal { foundURLs = append(foundURLs, targetURL) } else if !s.internalOnly { // Check external links only if internalOnly is false s.checkExternalURL(targetURL, sourceURL, results) } } } } }) // Process images doc.Find("img").Each(func(i int, sel *goquery.Selection) { if src, exists := sel.Attr("src"); exists { targetURL := s.resolveURL(src, sourceURL) if targetURL != "" { s.checkResource(targetURL, sourceURL, "image", baseHostname, results) } } }) // Process stylesheets doc.Find("link[rel='stylesheet']").Each(func(i int, sel *goquery.Selection) { if href, exists := sel.Attr("href"); exists { targetURL := s.resolveURL(href, sourceURL) if targetURL != "" { s.checkResource(targetURL, sourceURL, "stylesheet", baseHostname, results) } } }) // Process scripts doc.Find("script").Each(func(i int, sel *goquery.Selection) { if src, exists := sel.Attr("src"); exists { targetURL := s.resolveURL(src, sourceURL) if targetURL != "" { s.checkResource(targetURL, sourceURL, "script", baseHostname, results) } } }) return foundURLs } // parseCSS extracts URLs from CSS content func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) { // Simple regex-based parsing for CSS imports and url() references // This is a simplified approach; a proper CSS parser would be better // for production use scanner := bufio.NewScanner(resp.Body) for scanner.Scan() { line := scanner.Text() // Look for url() references urlMatches := urlRegexp.FindAllStringSubmatch(line, -1) for _, match := range urlMatches { if len(match) > 1 { // Remove quotes if present urlValue := strings.Trim(match[1], "'\"") targetURL := s.resolveURL(urlValue, sourceURL) if targetURL != "" { s.checkResource(targetURL, sourceURL, "css-url", baseHostname, results) } } } // Look for @import statements importMatches := importRegexp.FindAllStringSubmatch(line, -1) for _, match := range importMatches { if len(match) > 1 { // Remove quotes if present urlValue := strings.Trim(match[1], "'\"") targetURL := s.resolveURL(urlValue, sourceURL) if targetURL != "" { s.checkResource(targetURL, sourceURL, "css-import", baseHostname, results) } } } } } // resolveURL resolves a relative URL against a base URL func (s *Scraper) resolveURL(href, sourceURL string) string { // Skip empty URLs, anchors, and javascript if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") { return "" } // Skip non-HTTP protocols like mailto:, tel:, etc. if strings.HasPrefix(href, "mailto:") || strings.HasPrefix(href, "tel:") || strings.HasPrefix(href, "sms:") || strings.HasPrefix(href, "ftp:") || strings.HasPrefix(href, "file:") { return "" } sourceParsed, err := url.Parse(sourceURL) if err != nil { return "" } targetParsed, err := url.Parse(href) if err != nil { return "" } // If the scheme is not HTTP/HTTPS, skip it if targetParsed.Scheme != "" && targetParsed.Scheme != "http" && targetParsed.Scheme != "https" { return "" } resolvedURL := sourceParsed.ResolveReference(targetParsed).String() return resolvedURL } // checkExternalURL performs a HEAD request to check external URLs func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) { // Skip external links if internalOnly is set if s.internalOnly { return } // Check if URL was already checked s.mu.Lock() if result, exists := s.checkedURLs[targetURL]; exists { // Add the existing result with the current source URL result.SourceURL = sourceURL s.mu.Unlock() s.addResult(results, result) return } s.mu.Unlock() req, err := http.NewRequest("HEAD", targetURL, nil) if err != nil { result := Result{ URL: targetURL, SourceURL: sourceURL, Error: fmt.Sprintf("Invalid URL: %v", err), Type: "external-link", IsExternal: true, } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) return } resp, err := s.client.Do(req) if err != nil { result := Result{ URL: targetURL, SourceURL: sourceURL, Error: err.Error(), Type: "external-link", IsExternal: true, } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) return } defer resp.Body.Close() var result Result if resp.StatusCode >= 400 { result = Result{ URL: targetURL, SourceURL: sourceURL, Status: resp.StatusCode, Error: fmt.Sprintf("HTTP Error: %s", resp.Status), Type: "external-link", IsExternal: true, } } else { result = Result{ URL: targetURL, SourceURL: sourceURL, Status: resp.StatusCode, Type: "external-link", IsExternal: true, } } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) } // checkResource checks if a resource URL is accessible func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) { // Parse the target URL to determine if it's internal or external targetParsed, err := url.Parse(targetURL) if err != nil { result := Result{ URL: targetURL, SourceURL: sourceURL, Error: fmt.Sprintf("Invalid URL: %v", err), Type: resourceType, IsExternal: false, } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) return } isExternal := targetParsed.Hostname() != baseHostname && targetParsed.Hostname() != "" // Skip external resources if internalOnly is set if isExternal && s.internalOnly { return } // Check if URL was already checked s.mu.Lock() if result, exists := s.checkedURLs[targetURL]; exists { // Update with current source and type if needed result.SourceURL = sourceURL result.Type = resourceType s.mu.Unlock() s.addResult(results, result) return } s.mu.Unlock() req, err := http.NewRequest("HEAD", targetURL, nil) if err != nil { result := Result{ URL: targetURL, SourceURL: sourceURL, Error: fmt.Sprintf("Invalid URL: %v", err), Type: resourceType, IsExternal: isExternal, } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) return } resp, err := s.client.Do(req) if err != nil { result := Result{ URL: targetURL, SourceURL: sourceURL, Error: err.Error(), Type: resourceType, IsExternal: isExternal, } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) return } defer resp.Body.Close() var result Result if resp.StatusCode >= 400 { result = Result{ URL: targetURL, SourceURL: sourceURL, Status: resp.StatusCode, Error: fmt.Sprintf("HTTP Error: %s", resp.Status), Type: resourceType, IsExternal: isExternal, } } else { result = Result{ URL: targetURL, SourceURL: sourceURL, Status: resp.StatusCode, Type: resourceType, IsExternal: isExternal, } } s.mu.Lock() s.checkedURLs[targetURL] = result s.mu.Unlock() s.addResult(results, result) } // addResult adds a result to the appropriate list func (s *Scraper) addResult(results *Results, result Result) { s.mu.Lock() defer s.mu.Unlock() if result.Error != "" { results.Errors = append(results.Errors, result) } else { results.Successes = append(results.Successes, result) } }