-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparsing.go
92 lines (75 loc) · 1.53 KB
/
parsing.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package kraaler
import (
"bytes"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
)
// mimeIsHTML reports whether mime names the text/html media type.
//
// The check is a prefix match, so values carrying parameters such as
// "text/html; charset=utf-8" are accepted. The comparison is
// case-insensitive because media type and subtype tokens are
// case-insensitive ("Text/HTML" is the same type as "text/html").
func mimeIsHTML(mime string) bool {
	const htmlType = "text/html"
	if len(mime) < len(htmlType) {
		return false
	}
	return strings.EqualFold(mime[:len(htmlType)], htmlType)
}
// matcherByRegexp compiles s and every pattern in strs and returns a
// predicate that reports whether its argument matches at least one of them.
//
// Patterns are compiled in argument order; the first compilation error is
// returned as-is together with a nil predicate.
func matcherByRegexp(s string, strs ...string) (func(string) bool, error) {
	patterns := make([]string, 0, len(strs)+1)
	patterns = append(patterns, s)
	patterns = append(patterns, strs...)

	compiled := make([]*regexp.Regexp, 0, len(patterns))
	for _, pattern := range patterns {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return nil, err
		}
		compiled = append(compiled, re)
	}

	matchesAny := func(input string) bool {
		for i := range compiled {
			if compiled[i].MatchString(input) {
				return true
			}
		}
		return false
	}
	return matchesAny, nil
}
func RetrieveLinks(host *url.URL, body []byte) ([]*url.URL, error) {
kind := http.DetectContentType(body)
m, err := matcherByRegexp("^/[a-zA-Z]+", "^http://", "^https://")
if err != nil {
return nil, err
}
urls := map[string]struct{}{}
switch {
case mimeIsHTML(kind):
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil {
return nil, err
}
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
href, ok := s.Attr("href")
if !ok {
return
}
if m(href) {
urls[href] = struct{}{}
}
})
}
var res []*url.URL
for u, _ := range urls {
link, err := url.Parse(u)
if err != nil {
continue
}
if link.Host == "" {
// cannot replace source with anything meaningful
if host.Host == "" {
continue
}
link.Host = host.Host
link.Scheme = host.Scheme
}
res = append(res, link)
}
return res, nil
}