-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathcollectlinks.go
74 lines (68 loc) · 1.6 KB
/
collectlinks.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Package collectlinks performs the extraordinarily simple operation of parsing a given piece of HTML
// and providing you with the href of every hyperlink it finds.
package collectlinks
import (
"io"
"strconv"
"strings"
"golang.org/x/net/html"
)
// All reads HTML from httpBody (for example, the Body of the response
// returned by http.Get) and returns the "href" attribute values of the
// anchor (<a>) start tags it finds, in the order they are encountered.
//
// Each value is passed through trimHash, which drops everything from the
// first '#' onward, and is then added to the result only if it is not
// already there. A link that consists only of a fragment, such as "#top",
// is therefore reported as the empty string.
//
// Tokenizing stops at the first html.ErrorToken and the links gathered up to
// that point are returned; the underlying error is not reported to the
// caller. The returned slice is never nil.
//
// All does not close the reader passed to it.
func All(httpBody io.Reader) []string {
	links := []string{}
	page := html.NewTokenizer(httpBody)
	for {
		tokenType := page.Next()
		if tokenType == html.ErrorToken {
			return links
		}
		if tokenType != html.StartTagToken {
			// Only start tags can be anchors; skip building a Token for
			// text, comments, end tags and the like.
			continue
		}
		token := page.Token()
		if token.DataAtom.String() != "a" {
			continue
		}
		for _, attr := range token.Attr {
			if attr.Key != "href" {
				continue
			}
			// Offer only the link just found. Everything collected earlier
			// is already in links, so re-submitting it on every anchor
			// would only repeat membership checks without changing the
			// result.
			resolv(&links, []string{trimHash(attr.Val)})
		}
	}
}
// trimHash returns l cut off just before its first '#', which removes the
// fragment part of a link. A string that contains no '#' is returned
// unchanged.
func trimHash(l string) string {
	if !strings.Contains(l, "#") {
		return l
	}
	cut := 0
	for pos, r := range l {
		// QuoteRune renders a rune as a single-quoted Go literal, so a hash
		// sign comes back as the three-character string "'#'".
		if strconv.QuoteRune(r) != "'#'" {
			continue
		}
		cut = pos
		break
	}
	return l[:cut]
}
// check reports whether s is present in sl. It returns false for an empty
// or nil slice.
func check(sl []string, s string) bool {
	for i := range sl {
		if sl[i] == s {
			return true
		}
	}
	return false
}
// resolv appends to *sl, in order, each element of ml that *sl does not
// already contain. Membership is re-evaluated against the growing slice, so
// a value repeated within ml is added at most once.
func resolv(sl *[]string, ml []string) {
	for i := range ml {
		candidate := ml[i]
		if check(*sl, candidate) {
			continue
		}
		*sl = append(*sl, candidate)
	}
}