-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchecker.go
174 lines (139 loc) · 3.51 KB
/
checker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
package linkschecker
import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	parser "github.com/Eslam-Nawara/linkschecker/pkg/tomlparser"
	"golang.org/x/net/html"
)
// mapChanels bundles the two channels used to talk to the goroutine that
// owns the visited-links map (see manageLinksMap). A lookup is a
// request/reply pair: send on visitLink, then receive on linkState.
type mapChanels struct {
// visitLink carries a link to look up and record as visited.
visitLink chan string
// linkState carries the reply: true when that link had been seen before.
linkState chan bool
}
// CheckLinksInFile asks the tomlparser package for the links listed in
// configFile and checks them concurrently, blocking until the whole crawl
// has finished.
//
// It returns the parser's error unchanged when the links cannot be loaded
// and nil otherwise; links that fail validation are reported by the crawl
// itself rather than through the return value.
func CheckLinksInFile(configFile string) error {
	links, err := parser.LinksFromConfig(configFile)
	if err != nil {
		return err
	}

	// done receives a single value once the root crawl, and therefore every
	// crawl it spawned, has completed.
	done := make(chan bool)
	visited := manageLinksMap()

	// The root list has no parent page, hence the empty parent argument.
	go visited.checkArrayOfLinks(links, "", done)
	<-done

	return nil
}
// manageLinksMap starts the goroutine that owns the set of already-visited
// links and returns the channels used to query it. Every read and write of
// the map happens inside that one goroutine, so crawlers never touch the map
// directly.
//
// Protocol: send a link on visitLink, then receive on linkState. The reply
// is true when the link had been seen before; in either case the link is
// recorded as visited afterwards.
//
// The goroutine loops forever; nothing in this function stops it.
func manageLinksMap() mapChanels {
	channels := mapChanels{
		visitLink: make(chan string),
		linkState: make(chan bool),
	}
	seen := make(map[string]bool)

	go func() {
		for {
			link := <-channels.visitLink
			wasSeen := seen[link]
			// Marking unconditionally is equivalent to marking only unseen
			// links: an already-seen entry is simply set to true again.
			seen[link] = true
			channels.linkState <- wasSeen
		}
	}()

	return channels
}
// checkArrayOfLinks checks every not-yet-visited link in links and recurses
// into the pages that turn out to be reachable.
//
// parent is the page the links were found on ("" for the root list). For
// each link the candidates are tried in this order:
//
//  1. the link appended to parent's hostname, i.e. treated as relative;
//  2. the link exactly as written.
//
// A page reached through (1) is always crawled further. A page reached
// through (2) is crawled further only when it is on parent's host or there
// is no parent; links on other hosts are validated but not followed. A link
// for which no candidate validates is printed to standard output.
//
// Each crawled page is handled in its own goroutine. The method waits for
// all of them and then sends true on parentChan to signal completion.
func (mp mapChanels) checkArrayOfLinks(links []string, parent string, parentChan chan bool) {
	pending := 0
	childChan := make(chan bool)

	for _, link := range links {
		// Ask the map goroutine whether this link was already handled; the
		// request also marks it as visited.
		mp.visitLink <- link
		if seenBefore := <-mp.linkState; seenBefore {
			continue
		}

		relative := fmt.Sprintf("%s/%s", getHostname(parent), strings.Trim(link, "/"))

		// Pick the URL to crawl, or report/skip the link and move on.
		var target string
		switch {
		case validateLink(relative):
			target = relative
		case getHostname(link) == getHostname(parent) || parent == "":
			if !validateLink(link) {
				fmt.Println(link)
				continue
			}
			target = link
		default:
			// Link on another host: check it, but do not follow it.
			if !validateLink(link) {
				fmt.Println(link)
			}
			continue
		}

		innerLinks := visitLinkAndExtractLinks(target)
		pending++
		go mp.checkArrayOfLinks(innerLinks, target, childChan)
	}

	// Wait for every child crawl before reporting completion upwards.
	for ; pending > 0; pending-- {
		<-childChan
	}
	parentChan <- true
}
// validateLink reports whether link answers with a status code in the range
// 200-399.
//
// A scheme is added to link when it has none (see ensureScheme). A HEAD
// request is tried first; when it errors or returns a status outside that
// range, a GET request is tried as a fallback.
//
// Each request is bounded by a timeout so that a single unresponsive server
// cannot stall the whole crawl: http.Head and http.Get go through
// http.DefaultClient, which has no timeout at all.
func validateLink(link string) bool {
	const requestTimeout = 15 * time.Second

	// A client with a nil Transport uses http.DefaultTransport, so connection
	// reuse is unaffected by building the client here.
	client := &http.Client{Timeout: requestTimeout}
	link = ensureScheme(link)

	// healthy issues one request with the given method and checks its status.
	healthy := func(request func(string) (*http.Response, error)) bool {
		resp, err := request(link)
		if err != nil {
			return false
		}
		defer resp.Body.Close()

		return resp.StatusCode >= 200 && resp.StatusCode < 400
	}

	return healthy(client.Head) || healthy(client.Get)
}
// visitLinkAndExtractLinks downloads the page at link and returns the href
// values of the anchors found in it (see extractLinksFromIOReader). A scheme
// is added to link when it has none. It returns nil when the request fails.
//
// The request, including reading the body, is bounded by a timeout so that a
// slow or hanging server cannot block the crawl indefinitely; http.Get would
// use http.DefaultClient, which has no timeout.
func visitLinkAndExtractLinks(link string) []string {
	const requestTimeout = 15 * time.Second

	// A client with a nil Transport uses http.DefaultTransport, so connection
	// reuse is unaffected by building the client here.
	client := &http.Client{Timeout: requestTimeout}

	resp, err := client.Get(ensureScheme(link))
	if err != nil {
		return nil
	}
	defer resp.Body.Close()

	return extractLinksFromIOReader(resp.Body)
}
// ensureScheme returns link with an explicit scheme so it can be passed to
// the HTTP client and to url.Parse.
//
//   - A link that already starts with "http://" or "https://" is returned
//     unchanged. The comparison ignores case, because URL schemes are
//     case-insensitive ("HTTP://host" is valid as written).
//   - A protocol-relative link ("//host/path") gets "https:" prepended.
//   - Anything else gets "https://" prepended.
func ensureScheme(link string) string {
	hasPrefixFold := func(prefix string) bool {
		return len(link) >= len(prefix) && strings.EqualFold(link[:len(prefix)], prefix)
	}

	switch {
	case hasPrefixFold("http://"), hasPrefixFold("https://"):
		return link
	case strings.HasPrefix(link, "//"):
		// Already has the "//" authority marker; only the scheme is missing.
		return "https:" + link
	default:
		return "https://" + link
	}
}
// extractLinksFromIOReader tokenizes the HTML read from body and returns the
// value of every href attribute found on an <a> tag, in document order.
//
// Values are returned exactly as written in the page, so they may be
// relative or empty. Both ordinary opening tags and self-closing ones
// (<a href="..."/>) are inspected; closing tags are skipped because the
// href lives on the opening tag. Tokenizing stops at the first ErrorToken,
// which the tokenizer also uses to signal the end of the input.
//
// The function only reads from body; closing it is left to the caller.
func extractLinksFromIOReader(body io.ReadCloser) []string {
	var links []string

	tokenizer := html.NewTokenizer(body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if token.Data != "a" {
				continue
			}
			for _, attr := range token.Attr {
				if attr.Key == "href" {
					links = append(links, attr.Val)
				}
			}
		}
	}
}
// getHostname returns the hostname of link as reported by url.URL.Hostname,
// or "" when link cannot be parsed as a URL. A scheme is added first when
// link has none (see ensureScheme), so bare "host/path" strings are
// accepted too.
func getHostname(link string) string {
	parsed, err := url.Parse(ensureScheme(link))
	if err != nil {
		return ""
	}

	host := parsed.Hostname()
	return strings.Trim(host, "/")
}