wip
This commit is contained in:
parent
9197dd2371
commit
6a2ce2daeb
5 changed files with 254 additions and 0 deletions
157
main.go
Normal file
157
main.go
Normal file
|
@ -0,0 +1,157 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// RemoveAsciiTabAndNewlines returns s with every ASCII tab, line feed and
// carriage return dropped, following step 3 of
// https://url.spec.whatwg.org/#concept-basic-url-parser.
// Step 2 of that algorithm calls these characters a "validation error", but
// that is not a hard error: browsers just silently remove them.
//
// The function is mostly used internally; it is exported for extra
// convenience.
//
// See https://git.inter-media.net/gocolly/colly/commit/5cdc2aa8d8d430faee9bb88b9746545cba315a77
func RemoveAsciiTabAndNewlines(s string) string {
	dropControl := func(r rune) rune {
		if r == '\t' || r == '\n' || r == '\r' {
			// A negative rune tells strings.Map to omit the character.
			return -1
		}
		return r
	}
	return strings.Map(dropControl, s)
}
|
||||
|
||||
func parseAttr(e *colly.HTMLElement, attr string) (*url.URL, error) {
|
||||
href := RemoveAsciiTabAndNewlines(e.Attr(attr))
|
||||
if !strings.HasPrefix(href, "http") {
|
||||
href = e.Request.AbsoluteURL(href)
|
||||
}
|
||||
u, err := url.Parse(href)
|
||||
if err != nil {
|
||||
log.Printf("in: %s", e.Request.URL.String())
|
||||
log.Printf("error parsing %s: %s", attr, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return u, nil
|
||||
}
|
||||
|
||||
// References records, for every link target, the set of pages that link to
// it. Both levels are keyed by the URL's string form, so registering the same
// (page, target) pair more than once keeps a single entry.
//
// All methods take urlMu, so a References may be shared between goroutines.
// The zero value is ready to use; NewReferences is kept for convenience.
type References struct {
	// urls maps target URL -> referring page URL -> referring page.
	urls map[string]map[string]*url.URL
	// urlMu guards urls.
	urlMu sync.RWMutex
}

// From returns the pages registered as linking to refLink, in no particular
// order. It returns nil when nothing has been registered for refLink.
func (r *References) From(refLink string) (result []*url.URL) {
	r.urlMu.RLock()
	defer r.urlMu.RUnlock()

	// Indexing a nil or missing map entry yields a nil map, which has length 0.
	referrers := r.urls[refLink]
	if len(referrers) == 0 {
		return nil
	}

	result = make([]*url.URL, 0, len(referrers))
	for _, page := range referrers {
		result = append(result, page)
	}
	return result
}

// Register records that the page base contains a link to refLink. Both
// arguments must be non-nil. The returned error is currently always nil.
func (r *References) Register(base *url.URL, refLink *url.URL) error {
	// Build the keys before taking the lock to keep the critical section short.
	target, page := refLink.String(), base.String()

	r.urlMu.Lock()
	defer r.urlMu.Unlock()

	// Lazily create the outer map so a zero-value References does not panic.
	if r.urls == nil {
		r.urls = make(map[string]map[string]*url.URL)
	}

	referrers, ok := r.urls[target]
	if !ok {
		referrers = make(map[string]*url.URL)
		r.urls[target] = referrers
	}
	referrers[page] = base

	return nil
}

// NewReferences returns an empty References ready for use.
func NewReferences() *References {
	return &References{
		urls: make(map[string]map[string]*url.URL),
	}
}
|
||||
|
||||
// hostname is the host the crawl starts from; only pages served from exactly
// this host have their href/src attributes followed.
const hostname = "fmartingr.com"
|
||||
|
||||
func main() {
|
||||
errors := make(map[string]int)
|
||||
errorsMu := sync.RWMutex{}
|
||||
references := NewReferences()
|
||||
|
||||
c := colly.NewCollector(
|
||||
// colly.AllowedDomains(hostname),
|
||||
// colly.Async(true),
|
||||
)
|
||||
|
||||
c.OnHTML("[href]", func(e *colly.HTMLElement) {
|
||||
if e.Request.URL.Host != hostname {
|
||||
return
|
||||
}
|
||||
|
||||
href, err := parseAttr(e, "href")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
references.Register(e.Request.URL, href)
|
||||
|
||||
e.Request.Visit(e.Attr("href"))
|
||||
})
|
||||
|
||||
c.OnHTML("[src]", func(e *colly.HTMLElement) {
|
||||
if e.Request.URL.Host != hostname {
|
||||
return
|
||||
}
|
||||
|
||||
href, err := parseAttr(e, "src")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
references.Register(e.Request.URL, href)
|
||||
|
||||
e.Request.Visit(e.Attr("src"))
|
||||
})
|
||||
|
||||
c.OnResponse(func(r *colly.Response) {
|
||||
// log.Printf("Visited %s: %d", r.Request.URL, r.StatusCode)
|
||||
})
|
||||
|
||||
c.OnError(func(r *colly.Response, e error) {
|
||||
log.Printf("%d error %s: %s ", r.StatusCode, r.Request.URL.String(), e)
|
||||
errorsMu.Lock()
|
||||
errors[r.Request.URL.String()] = r.StatusCode
|
||||
errorsMu.Unlock()
|
||||
})
|
||||
|
||||
c.Limit(&colly.LimitRule{Parallelism: 2})
|
||||
c.SetRequestTimeout(10 * time.Second)
|
||||
|
||||
c.Visit("http://" + hostname)
|
||||
|
||||
c.Wait()
|
||||
|
||||
for errUrl, statusCode := range errors {
|
||||
log.Printf("[%d] %s", statusCode, errUrl)
|
||||
|
||||
parsedURL, _ := url.Parse(errUrl)
|
||||
log.Println(" Found in:")
|
||||
for _, r := range references.From(parsedURL.String()) {
|
||||
log.Printf(" - %s", r)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue