-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.go
192 lines (169 loc) · 4.57 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
// "THE BEER-WARE LICENSE" (Revision 42):
// <[email protected]> wrote this file. As long as you retain this notice
// you can do whatever you want with this stuff. If we meet some day, and you
// think this stuff is worth it, you can buy me a beer in return.
// Tobias Rehbein
// grawler is a gopherspace crawler written in the Go programming language.
//
// By default it starts crawling the gopherspace starting from
// gopher.floodgap.com and maps relations between servers.
//
// It generates a graph description suitable for postprocessing by the graphviz
// visualization toolkit.
//
// There are some commandline flags with sensible defaults available. Try the
// -h flag to get a list of these flags.
package main
import (
"flag"
"fmt"
"log"
"os"
"runtime"
"strings"
"sync"
"time"
"github.com/blabber/grawler/internal/grawler"
)
// blacklist lists selector fragments that are excluded from crawling: a
// selector containing any of these substrings is skipped. Selectors like these
// tend to belong to "interactive" games, which yield endless crawls.
var blacklist = []string{".run*", ".cgi?"}
// crawledJob is the message sent over the done channel in the function main
// once a crawler goroutine has finished. It pairs the job that was handled
// with the crawlerID of the crawler that handled it, so main can mark the job
// as finished and return the crawler to the pool of idle crawlers.
type crawledJob struct {
// crawlerID identifies the crawler that finished the job.
crawlerID int
// job is the finished job. main sends a nil job when the coordinator had
// nothing queued for the crawler.
job *grawler.Resource
}
// mustCreateFile creates the file called name via os.Create and returns the
// resulting handle. It panics with the underlying error if the file cannot be
// created.
func mustCreateFile(name string) *os.File {
	file, createErr := os.Create(name)
	if createErr == nil {
		return file
	}
	panic(createErr)
}
func main() {
// Parse flags
flagBootstrap := flag.String("bootstrap", "gopher.floodgap.com", "the first server to crawl")
flagPort := flag.String("port", "70", "the listening port of the first server to crawl")
flagCrawlers := flag.Int("crawlers", runtime.NumCPU(), "the number of crawlers to run concurrently")
flagDotfile := flag.String("dotfile", "grawler.dot", "the output file")
flagLogfile := flag.String("logfile", "", "the log file (empty for stderr)")
flagItemsLogfile := flag.String("ilogfile", "", "the log file for items (\"-\" for stdout), empty to disable item logging")
flag.Parse()
// Setup logging
if *flagLogfile != "" {
log.SetOutput(mustCreateFile(*flagLogfile))
}
// Setup item log
var itemActions []grawler.ItemActionFunc
if *flagItemsLogfile != "" {
f := os.Stdout
if *flagItemsLogfile != "-" {
f = mustCreateFile(*flagItemsLogfile)
}
var mtx sync.Mutex
itemActions = append(itemActions, func(r grawler.Resource) {
mtx.Lock()
defer mtx.Unlock()
s, err := r.TryString()
if err != nil {
log.Printf("[ia] ERR: %v", err)
return
}
fmt.Fprintf(f, "%s\n", s)
})
}
// Create Coordinator
coord := grawler.NewCoordinator()
// Initialize Grapher
grapher, err := grawler.NewGrapher(mustCreateFile(*flagDotfile))
if err != nil {
panic(err)
}
defer func() {
err := grapher.Close()
if err != nil {
panic(err)
}
}()
// Create channels and seed crawlers.
done := make(chan *crawledJob)
findings := make(chan *grawler.CrawlFinding)
idleCrawlers := make(chan int, *flagCrawlers)
for i := 0; i < *flagCrawlers; i++ {
idleCrawlers <- i + 1
}
ticks := time.Tick(time.Minute)
// Bootstrap the crawling.
go func() {
h := *flagBootstrap
p := *flagPort
findings <- &grawler.CrawlFinding{
Resource: &grawler.Resource{
Host: &grawler.Host{
Hostname: h,
Port: p,
},
Type: grawler.DirectoryType,
Selector: "",
},
Parent: nil}
}()
// Enter the main loop.
for {
select {
case i := <-idleCrawlers:
j := coord.QueuedJob()
go func() {
defer func() {
done <- &crawledJob{crawlerID: i, job: j}
}()
if j == nil {
return
}
log.Printf("[%d] Crawling %v", i, j)
err := grawler.ResourceCrawler(grawler.NetResourceOpener, j, findings, itemActions...)
if err != nil {
log.Printf("[%d] ERR: %v", i, err)
}
log.Printf("[%d] Done crawling %v", i, j)
}()
case f := <-findings:
blacklisted := false
for _, b := range blacklist {
if strings.Contains(f.Resource.Selector, b) {
blacklisted = true
break
}
}
if blacklisted {
log.Printf("Blacklisted: %q", f.Resource.Selector)
break
}
err := coord.QueueJob(f.Resource)
if err != nil {
log.Print(err)
}
err = grapher.GraphFinding(f)
if err != nil {
panic(err)
}
case j := <-done:
if j.job != nil {
coord.FinishJob(j.job)
}
idleCrawlers <- j.crawlerID
case <-ticks:
log.Printf("STATUS: %s", coord.String())
}
if coord.JobsExhausted() {
break
}
}
}