diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..fa6f24d --- /dev/null +++ b/go.mod @@ -0,0 +1,22 @@ +module github.com/fmartingr/dharma + +go 1.17 + +require github.com/gocolly/colly v1.2.0 + +require ( + github.com/PuerkitoBio/goquery v1.8.0 // indirect + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.2.4 // indirect + github.com/antchfx/xmlquery v1.3.9 // indirect + github.com/antchfx/xpath v1.2.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect + github.com/golang/protobuf v1.3.1 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect + golang.org/x/text v0.3.7 // indirect + google.golang.org/appengine v1.6.7 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..9333b51 --- /dev/null +++ b/go.sum @@ -0,0 +1,57 @@ +github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= +github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494= +github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc= +github.com/antchfx/xmlquery v1.3.9 h1:Y+zyMdiUZ4fasTQTkDb3DflOXP7+obcYEh80SISBmnQ= +github.com/antchfx/xmlquery v1.3.9/go.mod h1:wojC/BxjEkjJt6dPiAqUzoXO5nIMWtxHS8PD8TmN4ks= +github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8= +github.com/antchfx/xpath v1.2.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk= +golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= diff --git a/main.go b/main.go new file mode 100644 index 0000000..d23ba50 --- /dev/null +++ b/main.go @@ -0,0 +1,157 @@ +package main + +import ( + "log" + "net/url" + "strings" + "sync" + "time" + + "github.com/gocolly/colly" +) + +// https://git.inter-media.net/gocolly/colly/commit/5cdc2aa8d8d430faee9bb88b9746545cba315a77 +// RemoveAsciiTabAndNewlines removes the corresponding characters +// according to step 3 of https://url.spec.whatwg.org/#concept-basic-url-parser. +// Although step 2 says "validation error", this is not a hard error, +// and browsers do in fact just silently remove those. +// +// This function is mostly used internally, but it's exported for extra +// convenience. +func RemoveAsciiTabAndNewlines(s string) string { + return strings.Map(func(r rune) rune { + switch r { + case '\t', '\n', '\r': + return -1 + default: + return r + } + }, s) +} + +func parseAttr(e *colly.HTMLElement, attr string) (*url.URL, error) { + href := RemoveAsciiTabAndNewlines(e.Attr(attr)) + if !strings.HasPrefix(href, "http") { + href = e.Request.AbsoluteURL(href) + } + u, err := url.Parse(href) + if err != nil { + log.Printf("in: %s", e.Request.URL.String()) + log.Printf("error parsing %s: %s", attr, err) + return nil, err + } + + return u, nil +} + +type References struct { + urls map[string]map[string]*url.URL + urlMu sync.RWMutex +} + +func (r *References) From(refLink string) (result []*url.URL) { + r.urlMu.RLock() + defer r.urlMu.RUnlock() + + ref, exists := r.urls[refLink] + if !exists { + return + } + + for _, v := range ref { + result = append(result, v) + } + + return +} + +func (r *References) Register(base *url.URL, refLink *url.URL) error { + r.urlMu.Lock() + defer r.urlMu.Unlock() + + ref, exists := r.urls[refLink.String()] + if !exists { + r.urls[refLink.String()] = make(map[string]*url.URL) + r.urls[refLink.String()][base.String()] = base + } else { + ref[base.String()] = base + } + return nil +} + +func NewReferences() *References { + return &References{ + urls: make(map[string]map[string]*url.URL), + urlMu: sync.RWMutex{}, + } +} + +const hostname = "fmartingr.com" + +func main() { + errors := make(map[string]int) + errorsMu := sync.RWMutex{} + references := NewReferences() + + c := colly.NewCollector( + // colly.AllowedDomains(hostname), + // colly.Async(true), + ) + + c.OnHTML("[href]", func(e *colly.HTMLElement) { + if e.Request.URL.Host != hostname { + return + } + + href, err := parseAttr(e, "href") + if err != nil { + return + } + + references.Register(e.Request.URL, href) + + e.Request.Visit(e.Attr("href")) + }) + + c.OnHTML("[src]", func(e *colly.HTMLElement) { + if e.Request.URL.Host != hostname { + return + } + + href, err := parseAttr(e, "src") + if err != nil { + return + } + references.Register(e.Request.URL, href) + + e.Request.Visit(e.Attr("src")) + }) + + c.OnResponse(func(r *colly.Response) { + // log.Printf("Visited %s: %d", r.Request.URL, r.StatusCode) + }) + + c.OnError(func(r *colly.Response, e error) { + log.Printf("%d error %s: %s ", r.StatusCode, r.Request.URL.String(), e) + errorsMu.Lock() + errors[r.Request.URL.String()] = r.StatusCode + errorsMu.Unlock() + }) + + c.Limit(&colly.LimitRule{Parallelism: 2}) + c.SetRequestTimeout(10 * time.Second) + + c.Visit("http://" + hostname) + + c.Wait() + + for errUrl, statusCode := range errors { + log.Printf("[%d] %s", statusCode, errUrl) + + parsedURL, _ := url.Parse(errUrl) + log.Println(" Found in:") + for _, r := range references.From(parsedURL.String()) { + log.Printf(" - %s", r) + } + } +} diff --git a/testsite/found.html b/testsite/found.html new file mode 100644 index 0000000..4d3bb1d --- /dev/null +++ b/testsite/found.html @@ -0,0 +1 @@ +found diff --git a/testsite/index.html b/testsite/index.html new file mode 100644 index 0000000..4cf9abe --- /dev/null +++ b/testsite/index.html @@ -0,0 +1,17 @@ + + + + + + + + Document + + + + Found.html
+ Found.html
+ + + +