commit 0ef15167d5
28 changed files with 2789 additions and 0 deletions

pkg/cli/root.go (Normal file, 77 lines)
@@ -0,0 +1,77 @@
package cli

import (
	"fmt"
	"os"

	"git.nakama.town/fmartingr/dharma/pkg/reporter"
	"git.nakama.town/fmartingr/dharma/pkg/scraper"
	"github.com/spf13/cobra"
)

var (
	format       string
	concurrency  int
	depth        int
	timeout      int
	verbose      bool
	internalOnly bool
)

// rootCmd represents the base command when called without any subcommands
var rootCmd = &cobra.Command{
	Use:   "dharma [URL]",
	Short: "Scrape websites and check for broken links and references",
	Long: `Dharma is a website link checker tool that crawls a website to find broken links,
images, CSS references, and more. It generates a report of all issues found.`,
	Args: cobra.ExactArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		url := args[0]

		// Only print status message when using pretty format
		if format == "pretty" {
			fmt.Printf("Scanning website: %s\n", url)
		} else {
			// Force verbose off for non-pretty formats
			verbose = false
		}

		// Create a new scraper
		s := scraper.New(
			scraper.WithConcurrency(concurrency),
			scraper.WithDepth(depth),
			scraper.WithTimeout(timeout),
			scraper.WithVerbose(verbose),
			scraper.WithInternalOnly(internalOnly),
		)

		// Run the scraper
		results, err := s.Scan(url)
		if err != nil {
			return err
		}

		// Generate report
		r, err := reporter.New(format)
		if err != nil {
			return err
		}

		return r.Generate(results, os.Stdout)
	},
}

// Execute adds all child commands to the root command and sets flags appropriately.
// This is called by main.main(). It only needs to happen once to the rootCmd.
func Execute() error {
	return rootCmd.Execute()
}

func init() {
	rootCmd.Flags().StringVarP(&format, "format", "f", "pretty", "Output format (pretty, json, csv)")
	rootCmd.Flags().IntVarP(&concurrency, "concurrency", "c", 10, "Number of concurrent requests")
	rootCmd.Flags().IntVarP(&depth, "depth", "d", 3, "Maximum depth to crawl")
	rootCmd.Flags().IntVarP(&timeout, "timeout", "t", 10, "Timeout in seconds for each request")
	rootCmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output")
	rootCmd.Flags().BoolVarP(&internalOnly, "internal-only", "i", false, "Only check internal links (same hostname)")
}
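
The Execute comment above references main.main(); the following is a minimal sketch of how a main package might wire the CLI up. main.go is not part of this excerpt, so the file below is illustrative only (the import path follows the module path used in the other imports):

package main

import (
	"os"

	"git.nakama.town/fmartingr/dharma/pkg/cli"
)

func main() {
	// cobra prints the error returned from RunE by default; here we only map it
	// to a non-zero exit code.
	if err := cli.Execute(); err != nil {
		os.Exit(1)
	}
}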

pkg/cli/testsite_test.go (Normal file, 196 lines)
@@ -0,0 +1,196 @@
package cli

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/reporter"
	"git.nakama.town/fmartingr/dharma/pkg/scraper"
	"git.nakama.town/fmartingr/dharma/pkg/testutil"
	"github.com/spf13/cobra"
)

func TestIntegrationTestsite(t *testing.T) {
	// Skip this test if running in CI environment or if it's a short test run
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Test cases for CLI invocation with different flags
	testCases := []struct {
		name       string
		args       []string
		wantOutput bool
		wantErrors bool
	}{
		{
			name:       "Basic scan",
			args:       []string{serverURL},
			wantOutput: true,
			wantErrors: true,
		},
		{
			name:       "JSON output",
			args:       []string{"--format", "json", serverURL},
			wantOutput: true,
			wantErrors: true,
		},
		{
			name:       "Internal links only",
			args:       []string{"--internal-only", serverURL},
			wantOutput: true,
			wantErrors: true,
		},
		{
			name:       "Custom depth",
			args:       []string{"--depth", "1", serverURL},
			wantOutput: true,
			wantErrors: true,
		},
		{
			name:       "Custom concurrency",
			args:       []string{"--concurrency", "1", serverURL},
			wantOutput: true,
			wantErrors: true,
		},
		{
			name:       "Custom timeout",
			args:       []string{"--timeout", "5", serverURL},
			wantOutput: true,
			wantErrors: true,
		},
		{
			name:       "CSV output",
			args:       []string{"--format", "csv", serverURL},
			wantOutput: true,
			wantErrors: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Temporarily replace stdout to capture output
			oldStdout := os.Stdout
			r, w, _ := os.Pipe()
			os.Stdout = w

			// Save and restore rootCmd so state doesn't leak between test cases
			origRootCmd := rootCmd
			defer func() {
				rootCmd = origRootCmd
				os.Stdout = oldStdout
			}()

			// Reset rootCmd for this test
			rootCmd = createRootCmd()

			// Set the command line arguments
			os.Args = append([]string{"dharma"}, tc.args...)

			// Execute the CLI
			err := Execute()
			if err != nil {
				t.Fatalf("Failed to execute command: %v", err)
			}

			// Restore stdout and read the output
			w.Close()
			var buf bytes.Buffer
			_, err = buf.ReadFrom(r)
			if err != nil {
				t.Fatalf("Failed to read output: %v", err)
			}
			output := buf.String()

			// Check if we got any output
			if tc.wantOutput && output == "" {
				t.Errorf("Expected output but got none")
			}

			// Check if errors were reported for known broken links
			if tc.wantErrors {
				if !strings.Contains(output, "not_found.html") {
					t.Errorf("Expected not_found.html to be reported as broken in output")
				}
			}

			// Specific checks for different formats
			if strings.Contains(tc.name, "JSON") {
				if !strings.Contains(output, `"url":`) {
					t.Errorf("Expected JSON output with 'url' field")
				}
			} else if strings.Contains(tc.name, "CSV") {
				if !strings.Contains(output, "Status,Type,URL,Source URL,Error") {
					t.Errorf("Expected CSV header in output")
				}
			}
		})
	}
}

// createRootCmd returns a fresh instance of the root command
func createRootCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "dharma [URL]",
		Short: "Scrape websites and check for broken links and references",
		Long:  `Dharma is a website link checker tool that crawls a website to find broken links, images, CSS references, and more. It generates a report of all issues found.`,
		Args:  cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			url := args[0]

			// Only print status message when using pretty format
			if format == "pretty" {
				if !strings.HasPrefix(url, "file://") {
					fmt.Printf("Scanning website: %s\n", url)
				} else {
					fmt.Printf("Scanning local directory: %s\n", strings.TrimPrefix(url, "file://"))
				}
			} else {
				// Force verbose off for non-pretty formats
				verbose = false
			}

			// Create a new scraper
			s := scraper.New(
				scraper.WithConcurrency(concurrency),
				scraper.WithDepth(depth),
				scraper.WithTimeout(timeout),
				scraper.WithVerbose(verbose),
				scraper.WithInternalOnly(internalOnly),
			)

			// Run the scraper
			results, err := s.Scan(url)
			if err != nil {
				return err
			}

			// Generate report
			r, err := reporter.New(format)
			if err != nil {
				return err
			}

			return r.Generate(results, os.Stdout)
		},
	}

	cmd.Flags().StringVarP(&format, "format", "f", "pretty", "Output format (pretty, json, csv)")
	cmd.Flags().IntVarP(&concurrency, "concurrency", "c", 10, "Number of concurrent requests")
	cmd.Flags().IntVarP(&depth, "depth", "d", 3, "Maximum depth to crawl")
	cmd.Flags().IntVarP(&timeout, "timeout", "t", 10, "Timeout in seconds for each request")
	cmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output")
	cmd.Flags().BoolVarP(&internalOnly, "internal-only", "i", false, "Only check internal links (same hostname)")

	return cmd
}

pkg/reporter/reporter.go (Normal file, 192 lines)
@@ -0,0 +1,192 @@
package reporter

import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"io"
	"strings"
	"time"

	"git.nakama.town/fmartingr/dharma/pkg/scraper"
	"github.com/fatih/color"
)

// Reporter is an interface for report generators
type Reporter interface {
	Generate(results *scraper.Results, writer io.Writer) error
}

// New creates a new reporter based on the format
func New(format string) (Reporter, error) {
	switch strings.ToLower(format) {
	case "pretty":
		return &PrettyReporter{}, nil
	case "json":
		return &JSONReporter{}, nil
	case "csv":
		return &CSVReporter{}, nil
	default:
		return nil, fmt.Errorf("unsupported format: %s", format)
	}
}

// PrettyReporter generates a human-readable report for the terminal
type PrettyReporter struct{}

// Generate generates a pretty report
func (r *PrettyReporter) Generate(results *scraper.Results, writer io.Writer) error {
	red := color.New(color.FgRed).SprintFunc()
	green := color.New(color.FgGreen).SprintFunc()
	yellow := color.New(color.FgYellow).SprintFunc()
	blue := color.New(color.FgBlue).SprintFunc()
	cyan := color.New(color.FgCyan).SprintFunc()

	// Count internal vs external links
	countInternalSuccess := 0
	countInternalErrors := 0
	countExternalSuccess := 0
	countExternalErrors := 0

	for _, result := range results.Successes {
		if result.IsExternal {
			countExternalSuccess++
		} else {
			countInternalSuccess++
		}
	}

	for _, result := range results.Errors {
		if result.IsExternal {
			countExternalErrors++
		} else {
			countInternalErrors++
		}
	}

	fmt.Fprintf(writer, "Website scan report for: %s\n", blue(results.BaseURL))
	fmt.Fprintf(writer, "Scanned at: %s\n", time.Now().Format(time.RFC1123))
	fmt.Fprintf(writer, "Total resources checked: %d\n", results.Total)
	fmt.Fprintf(writer, "Success: %s, Errors: %s\n",
		green(len(results.Successes)),
		red(len(results.Errors)))
	fmt.Fprintf(writer, "Internal links: %s success, %s errors\n",
		green(countInternalSuccess),
		red(countInternalErrors))
	fmt.Fprintf(writer, "External links: %s success, %s errors\n\n",
		green(countExternalSuccess),
		red(countExternalErrors))

	if len(results.Errors) == 0 {
		fmt.Fprintf(writer, "%s No errors found!\n", green("✓"))
		return nil
	}

	// Group errors by internal/external
	internalErrors := []scraper.Result{}
	externalErrors := []scraper.Result{}

	for _, result := range results.Errors {
		if result.IsExternal {
			externalErrors = append(externalErrors, result)
		} else {
			internalErrors = append(internalErrors, result)
		}
	}

	// Print internal errors first if we have any
	if len(internalErrors) > 0 {
		fmt.Fprintln(writer, "Errors found:")

		for _, result := range internalErrors {
			status := fmt.Sprintf("%d", result.Status)
			if result.Status == 0 {
				status = "ERR"
			}

			fmt.Fprintf(writer, "%-6s (%-10s) %s [from: %s]\n",
				red(status),
				yellow(result.Type),
				result.URL,
				result.SourceURL,
			)
		}
	}

	// Print external errors if we have any
	if len(externalErrors) > 0 {
		if len(internalErrors) > 0 {
			fmt.Fprintln(writer, "")
		}
		fmt.Fprintln(writer, "External Errors:")
		fmt.Fprintln(writer, strings.Repeat("-", 80))
		fmt.Fprintf(writer, "%-6s | %-10s | %s | %s\n", "Status", "Type", "URL", "Source")
		fmt.Fprintln(writer, strings.Repeat("-", 80))

		for _, result := range externalErrors {
			status := fmt.Sprintf("%d", result.Status)
			if result.Status == 0 {
				status = "ERR"
			}

			fmt.Fprintf(writer, "%-6s | %-10s | %s | %s\n",
				red(status),
				cyan(result.Type),
				result.URL,
				result.SourceURL,
			)
		}
	}

	return nil
}

// JSONReporter generates a JSON report
type JSONReporter struct{}

// Generate generates a JSON report
func (r *JSONReporter) Generate(results *scraper.Results, writer io.Writer) error {
	return json.NewEncoder(writer).Encode(results)
}

// CSVReporter generates a CSV report
type CSVReporter struct{}

// Generate generates a CSV report
func (r *CSVReporter) Generate(results *scraper.Results, writer io.Writer) error {
	csvWriter := csv.NewWriter(writer)
	defer csvWriter.Flush()

	// Write header
	if err := csvWriter.Write([]string{"Status", "Type", "URL", "Source URL", "Error"}); err != nil {
		return err
	}

	// Write errors
	for _, result := range results.Errors {
		status := fmt.Sprintf("%d", result.Status)
		if result.Status == 0 {
			status = "ERROR"
		}

		if err := csvWriter.Write([]string{
			status,
			result.Type,
			result.URL,
			result.SourceURL,
			result.Error,
		}); err != nil {
			return err
		}
	}

	return nil
}

// truncate shortens a string to maxLen characters, appending "..." when it is cut
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	return s[:maxLen-3] + "..."
}
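
A minimal sketch of driving the Reporter interface above directly, with an illustrative *scraper.Results value standing in for a real scan:

package main

import (
	"log"
	"os"

	"git.nakama.town/fmartingr/dharma/pkg/reporter"
	"git.nakama.town/fmartingr/dharma/pkg/scraper"
)

func main() {
	// Illustrative results; in practice they come from (*scraper.Scraper).Scan.
	results := &scraper.Results{
		BaseURL: "https://example.com",
		Errors: []scraper.Result{
			{URL: "https://example.com/missing", SourceURL: "https://example.com", Status: 404, Error: "HTTP Error: 404 Not Found", Type: "link"},
		},
		Total: 1,
	}

	// New returns an error for anything other than pretty, json, or csv.
	r, err := reporter.New("json")
	if err != nil {
		log.Fatal(err)
	}
	if err := r.Generate(results, os.Stdout); err != nil {
		log.Fatal(err)
	}
}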

pkg/reporter/reporter_test.go (Normal file, 206 lines)
@@ -0,0 +1,206 @@
package reporter

import (
	"bytes"
	"encoding/json"
	"strings"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/scraper"
)

func TestNew(t *testing.T) {
	tests := []struct {
		name    string
		format  string
		wantErr bool
	}{
		{
			name:    "Pretty format",
			format:  "pretty",
			wantErr: false,
		},
		{
			name:    "JSON format",
			format:  "json",
			wantErr: false,
		},
		{
			name:    "CSV format",
			format:  "csv",
			wantErr: false,
		},
		{
			name:    "Unsupported format",
			format:  "xml",
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := New(tt.format)
			if (err != nil) != tt.wantErr {
				t.Errorf("New() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr && got == nil {
				t.Errorf("New() = nil, want non-nil")
			}
		})
	}
}

func TestJSONReporter_Generate(t *testing.T) {
	// Create test results
	results := &scraper.Results{
		BaseURL: "https://example.com",
		Errors: []scraper.Result{
			{
				URL:       "https://example.com/error",
				SourceURL: "https://example.com",
				Status:    404,
				Error:     "HTTP Error: 404 Not Found",
				Type:      "link",
			},
		},
		Successes: []scraper.Result{
			{
				URL:       "https://example.com/success",
				SourceURL: "https://example.com",
				Status:    200,
				Type:      "link",
			},
		},
		Total: 2,
	}

	// Create reporter and buffer
	reporter := &JSONReporter{}
	buf := &bytes.Buffer{}

	// Generate report
	if err := reporter.Generate(results, buf); err != nil {
		t.Fatalf("Generate() error = %v", err)
	}

	// Parse output
	var output scraper.Results
	if err := json.Unmarshal(buf.Bytes(), &output); err != nil {
		t.Fatalf("Failed to parse JSON: %v", err)
	}

	// Verify output
	if output.BaseURL != results.BaseURL {
		t.Errorf("BaseURL = %v, want %v", output.BaseURL, results.BaseURL)
	}
	if len(output.Errors) != len(results.Errors) {
		t.Errorf("Errors count = %v, want %v", len(output.Errors), len(results.Errors))
	}
	if len(output.Successes) != len(results.Successes) {
		t.Errorf("Successes count = %v, want %v", len(output.Successes), len(results.Successes))
	}
	if output.Total != results.Total {
		t.Errorf("Total = %v, want %v", output.Total, results.Total)
	}
}

func TestCSVReporter_Generate(t *testing.T) {
	// Create test results
	results := &scraper.Results{
		BaseURL: "https://example.com",
		Errors: []scraper.Result{
			{
				URL:       "https://example.com/error",
				SourceURL: "https://example.com",
				Status:    404,
				Error:     "HTTP Error: 404 Not Found",
				Type:      "link",
			},
		},
		Successes: []scraper.Result{},
		Total:     1,
	}

	// Create reporter and buffer
	reporter := &CSVReporter{}
	buf := &bytes.Buffer{}

	// Generate report
	if err := reporter.Generate(results, buf); err != nil {
		t.Fatalf("Generate() error = %v", err)
	}

	// Verify output
	lines := strings.Split(strings.TrimSpace(buf.String()), "\n")
	if len(lines) != 2 { // Header + 1 error
		t.Errorf("Expected 2 lines, got %d", len(lines))
	}

	// Check header
	expectedHeader := "Status,Type,URL,Source URL,Error"
	if lines[0] != expectedHeader {
		t.Errorf("Header = %v, want %v", lines[0], expectedHeader)
	}
}

func TestPrettyReporter_Generate(t *testing.T) {
	// Test with errors
	results := &scraper.Results{
		BaseURL: "https://example.com",
		Errors: []scraper.Result{
			{
				URL:       "https://example.com/error",
				SourceURL: "https://example.com",
				Status:    404,
				Error:     "HTTP Error: 404 Not Found",
				Type:      "link",
			},
		},
		Successes: []scraper.Result{
			{
				URL:       "https://example.com/success",
				SourceURL: "https://example.com",
				Status:    200,
				Type:      "link",
			},
		},
		Total: 2,
	}

	// Create reporter and buffer
	reporter := &PrettyReporter{}
	buf := &bytes.Buffer{}

	// Generate report
	if err := reporter.Generate(results, buf); err != nil {
		t.Fatalf("Generate() error = %v", err)
	}

	// Check that output contains key sections
	output := buf.String()
	if !strings.Contains(output, "Website scan report for") {
		t.Error("Output doesn't contain report title")
	}
	// PrettyReporter prints internal errors under the "Errors found:" heading
	if !strings.Contains(output, "Errors found:") {
		t.Error("Output doesn't contain errors section")
	}

	// Test with no errors
	results = &scraper.Results{
		BaseURL:   "https://example.com",
		Errors:    []scraper.Result{},
		Successes: []scraper.Result{},
		Total:     0,
	}

	buf = &bytes.Buffer{}
	if err := reporter.Generate(results, buf); err != nil {
		t.Fatalf("Generate() error = %v", err)
	}

	output = buf.String()
	if !strings.Contains(output, "No errors found") {
		t.Error("Output doesn't contain 'No errors found' message")
	}
}

pkg/scraper/scraper.go (Normal file, 669 lines)
@@ -0,0 +1,669 @@
package scraper

import (
	"bufio"
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
)

var (
	urlRegexp    = regexp.MustCompile(`url\(['"]?([^'")]+)['"]?\)`)
	importRegexp = regexp.MustCompile(`@import\s+['"]([^'"]+)['"]`)
)

// Result represents a URL check result
type Result struct {
	URL        string `json:"url"`
	SourceURL  string `json:"source_url,omitempty"`
	Status     int    `json:"status"`
	Error      string `json:"error,omitempty"`
	Type       string `json:"type"` // link, image, script, stylesheet, css-import
	IsExternal bool   `json:"is_external"`
}

// Results is a collection of Result
type Results struct {
	BaseURL   string   `json:"base_url"`
	Errors    []Result `json:"errors"`
	Successes []Result `json:"successes"`
	Total     int      `json:"total"`
}

// QueueItem represents a URL to be processed along with its source
type QueueItem struct {
	URL       string
	SourceURL string
	Depth     int
}

// Scraper handles website crawling and link checking
type Scraper struct {
	client       *http.Client
	concurrency  int
	depth        int
	verbose      bool
	internalOnly bool
	visitedURLs  map[string]bool   // URLs visited for crawling
	checkedURLs  map[string]Result // URLs already checked to avoid duplicate requests
	mu           sync.Mutex
}

// Option is a function option for the Scraper
type Option func(*Scraper)

// WithConcurrency sets the concurrency level
func WithConcurrency(concurrency int) Option {
	return func(s *Scraper) {
		s.concurrency = concurrency
	}
}

// WithDepth sets the maximum crawling depth
func WithDepth(depth int) Option {
	return func(s *Scraper) {
		s.depth = depth
	}
}

// WithTimeout sets the timeout for HTTP requests
func WithTimeout(timeoutSec int) Option {
	return func(s *Scraper) {
		s.client.Timeout = time.Duration(timeoutSec) * time.Second
	}
}

// WithVerbose enables verbose output
func WithVerbose(verbose bool) Option {
	return func(s *Scraper) {
		s.verbose = verbose
	}
}

// WithInternalOnly sets whether to only check internal links
func WithInternalOnly(internalOnly bool) Option {
	return func(s *Scraper) {
		s.internalOnly = internalOnly
	}
}

// New creates a new Scraper with the given options
func New(options ...Option) *Scraper {
	s := &Scraper{
		client: &http.Client{
			Timeout: 10 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		concurrency: 10,
		depth:       3,
		visitedURLs: make(map[string]bool),
		checkedURLs: make(map[string]Result),
	}

	for _, option := range options {
		option(s)
	}

	return s
}

// Scan starts the website crawling process
func (s *Scraper) Scan(baseURL string) (*Results, error) {
	parsedURL, err := url.Parse(baseURL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	// Ensure the base URL has a scheme
	if parsedURL.Scheme == "" {
		parsedURL.Scheme = "https"
		baseURL = parsedURL.String()
	}

	// Store the base hostname for distinguishing internal vs external links
	baseHostname := parsedURL.Hostname()

	results := &Results{
		BaseURL: baseURL,
	}

	// Create a waitgroup to track active workers
	var wg sync.WaitGroup

	// Create a channel to communicate URLs to process
	queue := make(chan QueueItem, 1000)

	// Create a channel to track active URL processing
	activeCount := make(chan int, 1)
	activeCount <- 1 // Start with 1 active URL (the base URL)

	// Start worker pool
	for range s.concurrency {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range queue {
				s.processURL(item.URL, item.SourceURL, baseHostname, item.Depth, queue, results, activeCount)
			}
		}()
	}

	// Initial URL to process - the source is itself for the initial URL
	queue <- QueueItem{
		URL:       baseURL,
		SourceURL: baseURL,
		Depth:     0,
	}

	// Monitor active count - when it reaches 0, we're done
	go func() {
		for {
			count := <-activeCount
			if count <= 0 {
				close(queue)
				return
			}
			activeCount <- count
		}
	}()

	// Wait for workers to finish
	wg.Wait()

	results.Total = len(results.Errors) + len(results.Successes)
	return results, nil
}

// processURL processes a single URL
func (s *Scraper) processURL(currentURL, sourceURL string, baseHostname string, depth int, queue chan<- QueueItem, results *Results, activeCount chan int) {
	// Decrement active count when done
	defer func() {
		count := <-activeCount
		activeCount <- count - 1
	}()

	// Check if we've already visited this URL (for crawling) or exceeded max depth
	s.mu.Lock()
	if s.visitedURLs[currentURL] || depth > s.depth {
		s.mu.Unlock()
		return
	}
	s.visitedURLs[currentURL] = true

	// If we've already checked this URL's status, reuse the result
	if result, exists := s.checkedURLs[currentURL]; exists {
		// Always use the provided source URL
		result.SourceURL = sourceURL
		s.mu.Unlock()
		s.addResult(results, result)

		// We still need to parse HTML/CSS content even if we've checked the URL before
		// But only if it was successful
		if result.Error == "" && result.Status < 400 {
			// Continue with content parsing...
		} else {
			return
		}
	} else {
		s.mu.Unlock()
	}

	if s.verbose {
		fmt.Printf("Checking: %s (depth: %d) [source: %s]\n", currentURL, depth, sourceURL)
	}

	// Parse the current URL
	currentParsed, err := url.Parse(currentURL)
	if err != nil {
		result := Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       "link",
			IsExternal: false,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	// Determine if the URL is internal or external
	isExternal := currentParsed.Hostname() != baseHostname && currentParsed.Hostname() != ""

	// Skip external links processing if internalOnly is set
	if isExternal && s.internalOnly {
		return
	}

	// Process external links differently from internal links
	if isExternal {
		s.checkExternalURL(currentURL, sourceURL, results)
		return
	}

	// Internal URL, check and crawl
	resp, err := s.client.Get(currentURL)
	if err != nil {
		result := Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Error:      err.Error(),
			Type:       "link",
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	defer resp.Body.Close()

	// Add the result
	var result Result
	if resp.StatusCode >= 400 {
		result = Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
			Type:       "link",
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	} else {
		result = Result{
			URL:        currentURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Type:       "link",
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[currentURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
	}

	// Only parse HTML and CSS from internal links
	contentType := resp.Header.Get("Content-Type")
	if strings.Contains(contentType, "text/html") {
		// Use the base hostname to create a base URL for this site
		baseURL := ""
		if currentParsed.Scheme != "" && currentParsed.Host != "" {
			baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
		}

		foundURLs := s.parseHTML(currentURL, resp, baseURL, baseHostname, depth+1, results)
		// Add all found URLs to the queue and increment active count
		if len(foundURLs) > 0 {
			count := <-activeCount
			count += len(foundURLs)
			activeCount <- count

			for _, url := range foundURLs {
				queue <- QueueItem{
					URL:       url,
					SourceURL: currentURL, // The source URL is the current page we're processing
					Depth:     depth + 1,
				}
			}
		}
	} else if strings.Contains(contentType, "text/css") {
		// Use the base hostname to create a base URL for this site
		baseURL := ""
		if currentParsed.Scheme != "" && currentParsed.Host != "" {
			baseURL = fmt.Sprintf("%s://%s", currentParsed.Scheme, currentParsed.Host)
		}

		s.parseCSS(currentURL, resp, baseURL, baseHostname, results)
	}
}

// parseHTML extracts links and other resources from HTML
func (s *Scraper) parseHTML(sourceURL string, resp *http.Response, baseURL, baseHostname string, _ int, results *Results) []string {
	foundURLs := []string{}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		s.addResult(results, Result{
			URL:        sourceURL,
			SourceURL:  sourceURL, // Use self as source for error
			Error:      fmt.Sprintf("Failed to parse HTML: %v", err),
			Status:     resp.StatusCode,
			Type:       "html",
			IsExternal: false,
		})
		return foundURLs
	}

	// Process links (a href)
	doc.Find("a").Each(func(i int, sel *goquery.Selection) {
		if href, exists := sel.Attr("href"); exists {
			targetURL := s.resolveURL(href, sourceURL)
			if targetURL != "" {
				linkParsed, err := url.Parse(targetURL)
				if err == nil {
					isExternal := linkParsed.Hostname() != baseHostname && linkParsed.Hostname() != ""

					// Only add internal links to the crawl queue
					if !isExternal {
						foundURLs = append(foundURLs, targetURL)
					} else if !s.internalOnly {
						// Check external links only if internalOnly is false
						s.checkExternalURL(targetURL, sourceURL, results)
					}
				}
			}
		}
	})

	// Process images
	doc.Find("img").Each(func(i int, sel *goquery.Selection) {
		if src, exists := sel.Attr("src"); exists {
			targetURL := s.resolveURL(src, sourceURL)
			if targetURL != "" {
				s.checkResource(targetURL, sourceURL, "image", baseHostname, results)
			}
		}
	})

	// Process stylesheets
	doc.Find("link[rel='stylesheet']").Each(func(i int, sel *goquery.Selection) {
		if href, exists := sel.Attr("href"); exists {
			targetURL := s.resolveURL(href, sourceURL)
			if targetURL != "" {
				s.checkResource(targetURL, sourceURL, "stylesheet", baseHostname, results)
			}
		}
	})

	// Process scripts
	doc.Find("script").Each(func(i int, sel *goquery.Selection) {
		if src, exists := sel.Attr("src"); exists {
			targetURL := s.resolveURL(src, sourceURL)
			if targetURL != "" {
				s.checkResource(targetURL, sourceURL, "script", baseHostname, results)
			}
		}
	})

	return foundURLs
}

// parseCSS extracts URLs from CSS content
func (s *Scraper) parseCSS(sourceURL string, resp *http.Response, baseURL, baseHostname string, results *Results) {
	// Simple regex-based parsing for CSS imports and url() references
	// This is a simplified approach; a proper CSS parser would be better
	// for production use
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()

		// Look for url() references
		urlMatches := urlRegexp.FindAllStringSubmatch(line, -1)
		for _, match := range urlMatches {
			if len(match) > 1 {
				// Remove quotes if present
				urlValue := strings.Trim(match[1], "'\"")
				targetURL := s.resolveURL(urlValue, sourceURL)
				if targetURL != "" {
					s.checkResource(targetURL, sourceURL, "css-url", baseHostname, results)
				}
			}
		}

		// Look for @import statements
		importMatches := importRegexp.FindAllStringSubmatch(line, -1)
		for _, match := range importMatches {
			if len(match) > 1 {
				// Remove quotes if present
				urlValue := strings.Trim(match[1], "'\"")
				targetURL := s.resolveURL(urlValue, sourceURL)
				if targetURL != "" {
					s.checkResource(targetURL, sourceURL, "css-import", baseHostname, results)
				}
			}
		}
	}
}

// resolveURL resolves a relative URL against a base URL
func (s *Scraper) resolveURL(href, sourceURL string) string {
	// Skip empty URLs, anchors, and javascript
	if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
		return ""
	}

	// Skip non-HTTP protocols like mailto:, tel:, etc.
	if strings.HasPrefix(href, "mailto:") ||
		strings.HasPrefix(href, "tel:") ||
		strings.HasPrefix(href, "sms:") ||
		strings.HasPrefix(href, "ftp:") ||
		strings.HasPrefix(href, "file:") {
		return ""
	}

	sourceParsed, err := url.Parse(sourceURL)
	if err != nil {
		return ""
	}

	targetParsed, err := url.Parse(href)
	if err != nil {
		return ""
	}

	// If the scheme is not HTTP/HTTPS, skip it
	if targetParsed.Scheme != "" &&
		targetParsed.Scheme != "http" &&
		targetParsed.Scheme != "https" {
		return ""
	}

	resolvedURL := sourceParsed.ResolveReference(targetParsed).String()
	return resolvedURL
}

// checkExternalURL performs a HEAD request to check external URLs
func (s *Scraper) checkExternalURL(targetURL, sourceURL string, results *Results) {
	// Skip external links if internalOnly is set
	if s.internalOnly {
		return
	}

	// Check if URL was already checked
	s.mu.Lock()
	if result, exists := s.checkedURLs[targetURL]; exists {
		// Add the existing result with the current source URL
		result.SourceURL = sourceURL
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	s.mu.Unlock()

	req, err := http.NewRequest("HEAD", targetURL, nil)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       "external-link",
			IsExternal: true,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	resp, err := s.client.Do(req)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      err.Error(),
			Type:       "external-link",
			IsExternal: true,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	defer resp.Body.Close()

	var result Result
	if resp.StatusCode >= 400 {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
			Type:       "external-link",
			IsExternal: true,
		}
	} else {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Type:       "external-link",
			IsExternal: true,
		}
	}

	s.mu.Lock()
	s.checkedURLs[targetURL] = result
	s.mu.Unlock()
	s.addResult(results, result)
}

// checkResource checks if a resource URL is accessible
func (s *Scraper) checkResource(targetURL, sourceURL, resourceType, baseHostname string, results *Results) {
	// Parse the target URL to determine if it's internal or external
	targetParsed, err := url.Parse(targetURL)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       resourceType,
			IsExternal: false,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	isExternal := targetParsed.Hostname() != baseHostname && targetParsed.Hostname() != ""

	// Skip external resources if internalOnly is set
	if isExternal && s.internalOnly {
		return
	}

	// Check if URL was already checked
	s.mu.Lock()
	if result, exists := s.checkedURLs[targetURL]; exists {
		// Update with current source and type if needed
		result.SourceURL = sourceURL
		result.Type = resourceType
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	s.mu.Unlock()

	req, err := http.NewRequest("HEAD", targetURL, nil)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      fmt.Sprintf("Invalid URL: %v", err),
			Type:       resourceType,
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}

	resp, err := s.client.Do(req)
	if err != nil {
		result := Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Error:      err.Error(),
			Type:       resourceType,
			IsExternal: isExternal,
		}
		s.mu.Lock()
		s.checkedURLs[targetURL] = result
		s.mu.Unlock()
		s.addResult(results, result)
		return
	}
	defer resp.Body.Close()

	var result Result
	if resp.StatusCode >= 400 {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Error:      fmt.Sprintf("HTTP Error: %s", resp.Status),
			Type:       resourceType,
			IsExternal: isExternal,
		}
	} else {
		result = Result{
			URL:        targetURL,
			SourceURL:  sourceURL,
			Status:     resp.StatusCode,
			Type:       resourceType,
			IsExternal: isExternal,
		}
	}

	s.mu.Lock()
	s.checkedURLs[targetURL] = result
	s.mu.Unlock()
	s.addResult(results, result)
}

// addResult adds a result to the appropriate list
func (s *Scraper) addResult(results *Results, result Result) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if result.Error != "" {
		results.Errors = append(results.Errors, result)
	} else {
		results.Successes = append(results.Successes, result)
	}
}
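
A minimal usage sketch of the scraper API above, using the same functional options the CLI passes in pkg/cli/root.go; the target URL is a placeholder:

package main

import (
	"fmt"
	"log"

	"git.nakama.town/fmartingr/dharma/pkg/scraper"
)

func main() {
	s := scraper.New(
		scraper.WithConcurrency(5),
		scraper.WithDepth(2),
		scraper.WithTimeout(15), // seconds, as in WithTimeout above
		scraper.WithInternalOnly(true),
	)

	// Scan crawls the site up to the configured depth and aggregates successes and errors.
	results, err := s.Scan("https://example.com")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("checked %d resources, %d broken\n", results.Total, len(results.Errors))
	for _, r := range results.Errors {
		fmt.Printf("%s (%s) from %s: %s\n", r.URL, r.Type, r.SourceURL, r.Error)
	}
}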

pkg/scraper/scraper_test.go (Normal file, 163 lines)
@@ -0,0 +1,163 @@
package scraper

import (
	"net/http"
	"net/http/httptest"
	"net/url"
	"testing"
	"time"
)

func TestResolveURL(t *testing.T) {
	s := &Scraper{}

	tests := []struct {
		name       string
		href       string
		sourceURL  string
		baseURL    string
		wantResult string
	}{
		{
			name:       "Absolute URL",
			href:       "https://example.com/page.html",
			sourceURL:  "https://example.org/index.html",
			baseURL:    "https://example.org/",
			wantResult: "https://example.com/page.html",
		},
		{
			name:       "Relative URL",
			href:       "page.html",
			sourceURL:  "https://example.org/index.html",
			baseURL:    "https://example.org/",
			wantResult: "https://example.org/page.html",
		},
		{
			name:       "Anchor link",
			href:       "#section",
			sourceURL:  "https://example.org/index.html",
			baseURL:    "https://example.org/",
			wantResult: "",
		},
		{
			name:       "JavaScript link",
			href:       "javascript:void(0)",
			sourceURL:  "https://example.org/index.html",
			baseURL:    "https://example.org/",
			wantResult: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := s.resolveURL(tt.href, tt.sourceURL)
			if got != tt.wantResult {
				t.Errorf("resolveURL() = %v, want %v", got, tt.wantResult)
			}
		})
	}
}

func TestNew(t *testing.T) {
	s := New(
		WithConcurrency(20),
		WithDepth(5),
		WithTimeout(30),
		WithVerbose(true),
	)

	if s.concurrency != 20 {
		t.Errorf("Expected concurrency to be 20, got %d", s.concurrency)
	}

	if s.depth != 5 {
		t.Errorf("Expected depth to be 5, got %d", s.depth)
	}

	if s.client.Timeout != 30*time.Second {
		t.Errorf("Expected timeout to be 30s, got %v", s.client.Timeout)
	}

	if !s.verbose {
		t.Errorf("Expected verbose to be true")
	}
}

func TestAddResult(t *testing.T) {
	s := &Scraper{}
	results := &Results{}

	// Add an error result
	errorResult := Result{
		URL:   "https://example.com/error",
		Error: "Test error",
		Type:  "link",
	}
	s.addResult(results, errorResult)

	if len(results.Errors) != 1 {
		t.Errorf("Expected 1 error, got %d", len(results.Errors))
	}

	// Add a success result
	successResult := Result{
		URL:    "https://example.com/success",
		Status: 200,
		Type:   "link",
	}
	s.addResult(results, successResult)

	if len(results.Successes) != 1 {
		t.Errorf("Expected 1 success, got %d", len(results.Successes))
	}
}

func TestProcessURL(t *testing.T) {
	// Create a test server
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		if _, err := w.Write([]byte(`
			<!DOCTYPE html>
			<html>
			<head>
				<link rel="stylesheet" href="/style.css">
				<script src="/script.js"></script>
			</head>
			<body>
				<a href="/page1.html">Page 1</a>
				<a href="https://example.com">External</a>
				<img src="/image.jpg">
			</body>
			</html>
		`)); err != nil {
			t.Fatalf("Failed to write response: %v", err)
		}
	}))
	defer server.Close()

	s := New(WithDepth(1), WithConcurrency(1))
	results := &Results{}

	// Create a channel for QueueItems instead of strings
	queue := make(chan QueueItem, 10)

	// Create active count channel
	activeCount := make(chan int, 1)
	activeCount <- 1 // Start with one active URL

	// Parse the server URL to get the hostname
	serverURL, _ := url.Parse(server.URL)
	baseHostname := serverURL.Hostname()

	// Process the URL with the updated signature
	s.processURL(server.URL, server.URL, baseHostname, 0, queue, results, activeCount)

	// Check that we found at least one success (the main page)
	if len(results.Successes) < 1 {
		t.Errorf("Expected at least 1 success, got %d", len(results.Successes))
	}

	// Check that we queued some URLs for processing
	if len(queue) < 1 {
		t.Errorf("Expected at least 1 URL in queue, got %d", len(queue))
	}
}

pkg/scraper/testsite_test.go (Normal file, 206 lines)
@@ -0,0 +1,206 @@
package scraper

import (
	"strings"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/testutil"
)

func TestTestsiteIntegration(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with default settings
	s := New(
		WithConcurrency(2),
		WithDepth(3),
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(true),
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Verify we have results
	if results == nil {
		t.Fatal("Expected results but got nil")
	}

	// Check that we have the correct base URL
	if results.BaseURL != serverURL {
		t.Errorf("Expected BaseURL to be %s, got %s", serverURL, results.BaseURL)
	}

	// Test cases for specific URLs that should be found
	expectedURLs := map[string]bool{
		"/found.html":        true,
		"/not_found.html":    false,
		"/rel/index.html":    true,
		"/rel/relfound.html": true,
		"/static/style.css":  true,
		"/static/script.js":  true,
	}

	// Check for expected URLs in the results
	for urlPath, shouldExist := range expectedURLs {
		fullURL := serverURL + urlPath
		found := false

		// Look in both successes and errors
		for _, result := range results.Successes {
			if result.URL == fullURL {
				found = true
				if !shouldExist {
					t.Errorf("URL %s should not exist but was found in successes", urlPath)
				}
				break
			}
		}

		if !found && shouldExist {
			// If not found in successes, check if it's in errors
			for _, result := range results.Errors {
				if result.URL == fullURL {
					found = true
					t.Errorf("URL %s should exist but was found in errors", urlPath)
					break
				}
			}

			if !found {
				t.Errorf("Expected URL %s was not found in results", urlPath)
			}
		}
	}

	// Check that not_found.html is in errors
	notFoundURL := serverURL + "/not_found.html"
	foundInErrors := false
	for _, result := range results.Errors {
		if result.URL == notFoundURL {
			foundInErrors = true
			break
		}
	}

	if !foundInErrors {
		t.Errorf("Expected %s to be in errors but it wasn't", notFoundURL)
	}

	// Verify relative links in the rel directory
	relNotFoundURL := serverURL + "/rel/rel_not_found.html"
	foundRelNotFound := false
	for _, result := range results.Errors {
		if result.URL == relNotFoundURL {
			foundRelNotFound = true
			break
		}
	}

	if !foundRelNotFound {
		t.Errorf("Expected %s to be in errors but it wasn't", relNotFoundURL)
	}

	// Check for missing image
	missingImageURL := serverURL + "/rel/image-404.jpg"
	foundMissingImage := false
	for _, result := range results.Errors {
		if result.URL == missingImageURL {
			foundMissingImage = true
			break
		}
	}

	if !foundMissingImage {
		t.Errorf("Expected %s to be in errors but it wasn't", missingImageURL)
	}

	// Check for external links
	externalLinkCount := 0
	for _, result := range results.Successes {
		if strings.Contains(result.URL, "fmartingr.com") {
			externalLinkCount++
		}
	}

	if externalLinkCount != 0 {
		t.Errorf("Found %d external links but should be 0 with internalOnly=true", externalLinkCount)
	}

	// Verify total count
	expectedTotal := len(results.Successes) + len(results.Errors)
	if results.Total != expectedTotal {
		t.Errorf("Expected Total to be %d, got %d", expectedTotal, results.Total)
	}
}

func TestTestsiteWithExternalLinks(t *testing.T) {
	// Skip if running short tests
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}

	// Start the testsite server
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	// Create a new scraper with external links allowed
	s := New(
		WithConcurrency(2),
		WithDepth(1), // Lower depth for external links test
		WithTimeout(5),
		WithVerbose(false),
		WithInternalOnly(false), // Allow external links
	)

	// Run the scraper
	results, err := s.Scan(serverURL)
	if err != nil {
		t.Fatalf("Scraper.Scan failed: %v", err)
	}

	// Check for external links - we should find at least one external link
	foundExternalLinks := false
	brokenExternalLinks := false

	for _, result := range results.Successes {
		if result.IsExternal && strings.Contains(result.URL, "fmartingr.com") {
			foundExternalLinks = true
			break
		}
	}

	for _, result := range results.Errors {
		if result.IsExternal && strings.Contains(result.URL, "e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi") {
			brokenExternalLinks = true
			break
		}
	}

	// We don't actually hit external URLs in tests, so we can't assert on them,
	// but we can check that they're properly identified
	if !foundExternalLinks {
		t.Log("No successful external links found in test - this is expected in CI/isolated test environments")
	}

	if !brokenExternalLinks {
		t.Log("No broken external links found in test - this is expected in CI/isolated test environments")
	}
}

pkg/testutil/testserver.go (Normal file, 79 lines)
@@ -0,0 +1,79 @@
package testutil

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
)

// StartTestsiteServer starts an HTTP server for the testsite directory
// and returns the server URL and a cleanup function
func StartTestsiteServer() (string, func(), error) {
	// Determine the absolute path to the testsite directory
	wd, err := os.Getwd()
	if err != nil {
		return "", nil, fmt.Errorf("failed to get working directory: %v", err)
	}

	// Navigate up to find the project root
	// This assumes the function is called from somewhere within the project
	projectRoot := wd
	for {
		if _, err := os.Stat(filepath.Join(projectRoot, "testsite")); err == nil {
			break // Found the testsite directory
		}
		parent := filepath.Dir(projectRoot)
		if parent == projectRoot {
			return "", nil, fmt.Errorf("testsite directory not found")
		}
		projectRoot = parent
	}

	testsitePath := filepath.Join(projectRoot, "testsite")
	if _, err := os.Stat(testsitePath); err != nil {
		return "", nil, fmt.Errorf("testsite directory not found: %v", err)
	}

	// Create a file server for the testsite directory
	fileServer := http.FileServer(http.Dir(testsitePath))

	// Create a test server to serve the testsite files
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fileServer.ServeHTTP(w, r)
	}))

	// Return the server URL and a cleanup function
	return server.URL, server.Close, nil
}

// GetTestsitePath returns the absolute path to the testsite directory
func GetTestsitePath() (string, error) {
	// Determine the absolute path to the testsite directory
	wd, err := os.Getwd()
	if err != nil {
		return "", fmt.Errorf("failed to get working directory: %v", err)
	}

	// Navigate up to find the project root
	// This assumes the function is called from somewhere within the project
	projectRoot := wd
	for {
		if _, err := os.Stat(filepath.Join(projectRoot, "testsite")); err == nil {
			break // Found the testsite directory
		}
		parent := filepath.Dir(projectRoot)
		if parent == projectRoot {
			return "", fmt.Errorf("testsite directory not found")
		}
		projectRoot = parent
	}

	testsitePath := filepath.Join(projectRoot, "testsite")
	if _, err := os.Stat(testsitePath); err != nil {
		return "", fmt.Errorf("testsite directory not found: %v", err)
	}

	return testsitePath, nil
}
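
A sketch of how another test package could drive this helper; the package and test names are hypothetical, and /found.html mirrors a fixture path referenced by the scraper tests above:

package integration_test

import (
	"net/http"
	"testing"

	"git.nakama.town/fmartingr/dharma/pkg/testutil"
)

func TestTestsiteServerSketch(t *testing.T) {
	// StartTestsiteServer serves the repository's testsite directory over HTTP;
	// cleanup shuts the underlying httptest server down.
	serverURL, cleanup, err := testutil.StartTestsiteServer()
	if err != nil {
		t.Fatalf("Failed to start test server: %v", err)
	}
	defer cleanup()

	resp, err := http.Get(serverURL + "/found.html")
	if err != nil {
		t.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("expected 200 OK, got %d", resp.StatusCode)
	}
}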