"""
Author: lt
Date: 2019-11-05
Desc: 终端执行入口
"""
import asyncio
import ctypes
import optparse
import os
import re
import time
import traceback
from datetime import date
from loguru import logger
from multiprocessing import Pool
from multiprocessing import Queue
from queue import Empty
import common as cm
import config as conf
from daemon import Daemon
from entrance import load_pipelines
from entrance import load_spiders
from _spider import DEFAULT_SPIDER_NAME
message_queue = Queue()
urls_queue = Queue()
# os.chdir(sys.path[0])
script_path = os.path.dirname(os.path.abspath(__file__))
DEFAULT_PROCESSES = 8
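# Illustrative invocation (paths and values below are examples only):
#   python cli.py -i ./sites.txt -o ./output -c 20 -p 4 -d 3 start
# Each worker process pulls urls from urls_queue and crawls them with asyncio under
# a per-process semaphore; counters are aggregated from message_queue at the end.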
def parse_args():
"""解析命令行输入"""
usage = 'Usage: \n\tpython get_archival_info.py \n\t[-i] <input_path> \n\t[-o] ' \
'<output dir> \n\t[-c] <concurrent limit> \n\t[-p] ' \
'<processes num> \n\t[-d] <max depth each site>' \
'\n\t[-D] <run as daemon> \n\t[-u] <choose one or more urls to crawl> ' \
'\n\t[-L] <crawl level> \n\t[-S] <whether to use splash> ' \
'\n\t[-P] <whether to use proxy pool> \n\t[-B] \n\t[start] | restart | stop'
parser = optparse.OptionParser(usage=usage)
input_help_str = 'The path of the file that contains ' \
'the list of sites to be crawled'
parser.add_option(
'-i', '--input',
type='str',
dest='input',
help=input_help_str
)
output_help_str = 'The base output dir that stores all of the crawled results'
parser.add_option(
'-o', '--output',
type='str',
dest='output',
help=output_help_str
)
concurrent_help_str = 'Limit the concurrency within a single process'
parser.add_option(
'-c', '--concurrent',
type='int',
dest='concurrent',
help=concurrent_help_str
)
process_help_str = 'Number of processes to start. ' \
'It only makes sense when you crawl multiple urls'
parser.add_option(
'-p', '--processes',
type='int',
dest='processes',
help=process_help_str
)
depth_help_str = 'Limit the max depth of pages while crawling'
parser.add_option(
'-d', '--depth',
type='int',
dest='depth',
help=depth_help_str
)
parser.add_option(
'-D', '--daemon',
action='store_true',
dest='daemon'
)
url_help_str = 'You can input one or more urls to crawl, ' \
'multiple urls need to be separated by commas, ' \
'examples: "www.a.com" or "www.a.com, www.b.com"'
parser.add_option(
'-u', '--url',
type='str',
dest='url',
help=url_help_str
)
user_agent_help_str = 'Specify the User-Agent used for requests'
parser.add_option(
'-U', '--user_agent',
type='str',
dest='user_agent',
help=user_agent_help_str
)
level_help_str = 'The option "level" sets the crawler filtering level.' \
' Choose a number: 0 (only links under the same domain),' \
' 1 (also links under the same secondary domain),' \
' 2 (all links)'
parser.add_option(
'-L', '--level',
type='int',
dest='level',
help=level_help_str
)
splash_help_str = 'You can add this option if you have a Splash service.' \
' When selected, Splash is used to render the pages.' \
' But do not forget to update the configuration'
parser.add_option(
'-S', '--splash',
action='store_true',
dest='splash',
help=splash_help_str
)
proxy_help_str = 'You can add this option if you have a proxy service.' \
' When selected, the proxy is used to access the target.' \
' But do not forget to update the configuration'
parser.add_option(
'-P', '--proxy',
action='store_true',
dest='proxy',
help=proxy_help_str
)
bs64_help_str = 'Whether to use the base64-encoded url as the file name'
parser.add_option(
'-B', '--bs64',
action='store_true',
dest='bs64',
help=bs64_help_str
)
parser.add_option(
'-T', '--timeout',
type='int',
dest='timeout',
help='Download request timeout'
)
parser.add_option(
'--time_wait',
type='int',
dest='time_wait',
help='Time to wait between page downloads. '
'Used to slow down the crawler'
)
name_help_str = 'Specifies the name of the crawler used to crawl the page'
parser.add_option(
'-N', '--name',
type='str',
dest='name',
help=name_help_str
)
options, args = parser.parse_args()
return options, args
async def crawl_one_site(semaphore, base_output_dir,
max_depth, level=0,
splash=False, proxy=False,
bs64encode_filename=False,
user_agent=None, timeout=5 * 60,
time_wait=None, spider=None):
"""递归下载一个站点"""
libc = ctypes.CDLL('libc.so.6')
libc.prctl(1, 15)
# event_loop = asyncio.new_event_loop()
# asyncio.set_event_loop(event_loop)
# semaphore = asyncio.Semaphore(value=concurrent_limit)
while True:
try:
url = urls_queue.get(block=False)
except Empty:
return
print(url)
domain = cm.get_host_from_url(url)
output_dir = os.path.join(base_output_dir, domain)
if not os.path.isdir(output_dir):
os.makedirs(output_dir, exist_ok=True)
if not spider:
spider = DEFAULT_SPIDER_NAME
spiders = load_spiders()
spider_obj = None
for SClass in spiders:
if SClass.__spider__ == spider:
spider_obj = SClass(
site=url,
output_dir=output_dir,
max_depth=max_depth,
semaphore=semaphore,
level=level,
splash=splash,
proxy=proxy,
bs64encode_filename=bs64encode_filename,
user_agent=user_agent,
timeout=timeout,
time_wait=time_wait
)
break
if not spider_obj:
logger.warning('No spider matched for url: {}, '
'spider name: {}'.format(url, spider))
return
pipelines = load_pipelines()
for PClass in pipelines:
if PClass.__spider__ == spider:
pipeline = PClass()
for method in dir(pipeline):
if method.startswith('pipe_'):
# add the pipeline if it exposes an executable pipe_* method
spider_obj.add_pipeline(pipeline)
break
try:
await spider_obj.run()
except Exception:
logger.error('Error occurred while running: '
'{}'.format(traceback.format_exc()))
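# run_concurrent schedules a fixed batch of crawl_one_site coroutines; each one
# returns as soon as urls_queue is empty, so 1024 is only an upper bound on
# per-process tasks while the shared semaphore caps the actual concurrency.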
async def run_concurrent(semaphore, base_output_dir,
max_depth, level=0,
splash=False, proxy=False,
bs64encode_filename=False,
user_agent=None, timeout=5 * 60,
time_wait=None, spider=None):
tasks = []
for idx in range(1024):
tasks.append(
crawl_one_site(
semaphore, base_output_dir, max_depth, level,
splash, proxy, bs64encode_filename,
user_agent, timeout, time_wait, spider
)
)
await asyncio.gather(*tasks)
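# entrance() is the target handed to Pool.apply_async: every worker process
# builds its own event loop and semaphore, so the concurrency limit applies
# per process (total concurrency is roughly processes * concurrent).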
def entrance(concurrent, base_output_dir,
max_depth, level=0,
splash=False, proxy=False,
bs64encode_filename=False,
user_agent=None, timeout=5 * 60,
time_wait=None, spider=None):
event_loop = asyncio.get_event_loop()
semaphore = asyncio.Semaphore(concurrent)
event_loop.run_until_complete(
run_concurrent(
semaphore, base_output_dir, max_depth, level,
splash, proxy, bs64encode_filename,
user_agent, timeout, time_wait, spider
)
)
class Worker(Daemon):
"""
Crawl worker: drives a multiprocessing pool and can run as a daemon.
"""
def __init__(self, urls, base_output_dir,
max_depth, concurrent_limit,
work_dir=script_path, daemon=False,
level=0, splash=False, proxy=False,
bs64encode_filename=False, processes=None,
user_agent=None, timeout=5 * 60,
time_wait=None, spider=None):
self.urls = urls
self.base_output_dir = base_output_dir
self.max_depth = max_depth
self.concurrent_limit = concurrent_limit
self.level = level
self.splash = splash
self.proxy = proxy
self.bs64encode_filename = bs64encode_filename
self.processes = processes
self.user_agent = user_agent
self.timeout = timeout
self.time_wait = time_wait
self.spider = spider
pid_file = os.path.join(conf.PID_DIR, 'page_collect.pid')
super(Worker, self).__init__(pid_file, work_dir, daemon=daemon)
def _run(self):
"""
Entry point of the worker.
:return:
"""
start_time = time.time()
today_str = str(date.today()).replace('-', '')
log_path = os.path.join(conf.LOG_DIR, ''.join(['page_collect_', today_str, '.log']))
logger.add(
sink=log_path,
level='INFO',
enqueue=True,
rotation='200 MB'
)
logger.info('Start crawler, the url list: %s' % self.urls)
for url in self.urls:
urls_queue.put(url)
if not self.processes:
self.processes = DEFAULT_PROCESSES
pool = Pool(self.processes)
for idx in range(self.processes):
pool.apply_async(
entrance,
args=(self.concurrent_limit,
self.base_output_dir,
self.max_depth, self.level,
self.splash, self.proxy,
self.bs64encode_filename,
self.user_agent, self.timeout,
self.time_wait, self.spider)
)
# for url in self.urls:
# pool.apply_async(
# crawl_one_site,
# args=(url, self.base_output_dir, self.max_depth,
# self.concurrent_limit, self.level,
# self.splash, self.proxy, self.bs64encode_filename,
# self.user_agent, self.timeout, self.time_wait, self.spider)
# )
pool.close()
pool.join()
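# Aggregate per-site counters. Note that nothing in this module puts to
# message_queue; it is presumably filled by the spider/pipeline code that
# imports it, so the totals below stay at zero otherwise.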
total_count = 0
success_count = 0
useless_count = 0
while True:
try:
count_tuple = message_queue.get_nowait()
total_count += count_tuple[0]
success_count += count_tuple[1]
useless_count += count_tuple[2]
except Empty:
break
end_time = time.time()
expense_time = end_time - start_time
speed = total_count / expense_time
logger.info(
'The base output dir is: {}, you can find all'
' results in it, grouped by domain'.format(
os.path.abspath(self.base_output_dir))
)
logger.info(
"Crawl finished, total expense time: {}s, "
"total download: {}, success: {}, speed: {}/s".format(
end_time - start_time, total_count,
success_count, speed
)
)
# remove the pid file once the crawl has finished
if os.path.isfile(self._pidfile):
os.remove(self._pidfile)
def validate_url(domain):
"""
Check whether the given domain is valid.
:param domain: www.sangfor.com.cn
:return:
"""
if not isinstance(domain, str):
logger.warning('Invalid domain input, it must be a string,'
' but got: %s' % type(domain))
return False
if not re.match(r'.+\..+', domain):
logger.info('Invalid domain: %s' % domain)
return False
return True
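# Examples: validate_url('www.sangfor.com.cn') -> True,
# validate_url('localhost') -> False (the pattern only requires a dot).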
def parse_line(path):
"""
Parse the file that holds the list of domains to crawl.
:param path: path of the input file
:return:
"""
ret = []
with open(path, 'r') as fr:
lines = fr.readlines()
for line in lines:
line = line.strip()
if validate_url(line):
ret.append(line)
return ret
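# The input file is expected to hold one domain per line, e.g.:
#   www.a.com
#   www.b.com
# Lines that fail validate_url() are logged and skipped.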
def main():
"""
Main entry point.
:return:
"""
options, args = parse_args()
input_path = conf.DEFAULT_INPUT_PATH
base_output_dir = conf.DEFAULT_OUTPUT_DIR
concurrent_limit = conf.CONCURRENT_LIMIT
max_depth = conf.MAX_DEPTH
level = conf.LEVEL
input_url = ''
user_agent = None
processes = conf.PROCESS_NUM
timeout = conf.CRAWL_TIMEOUT
time_wait = None
name = None
if options.concurrent is not None:
concurrent_limit = options.concurrent
if options.depth is not None:
max_depth = options.depth
if options.input is not None:
input_path = options.input
if options.output is not None:
base_output_dir = options.output
if options.processes:
processes = options.processes
if options.url is not None:
input_url = options.url
if options.level is not None:
level = options.level
if options.user_agent is not None:
user_agent = options.user_agent
if options.timeout is not None:
timeout = options.timeout
if options.time_wait is not None:
time_wait = options.time_wait
if options.name is not None:
name = options.name
daemon = options.daemon
use_splash = options.splash
use_proxy = options.proxy
bs64 = options.bs64
if input_url:
url_list = [url.replace(' ', '') for url in input_url.split(',')]
else:
url_list = parse_line(input_path)
worker = Worker(
urls=url_list,
base_output_dir=base_output_dir,
max_depth=max_depth,
concurrent_limit=concurrent_limit,
daemon=daemon,
level=level,
splash=use_splash,
proxy=use_proxy,
bs64encode_filename=bs64,
processes=processes,
user_agent=user_agent,
timeout=timeout,
time_wait=time_wait,
spider=name
)
if not args:
print('------start page_collect successfully------')
worker.start()
elif len(args) == 1:
if args[0] == 'start':
print('------start page_collect successfully------')
worker.start()
elif args[0] == 'stop':
print('------stop page_collect successfully------')
worker.stop()
elif args[0] == 'restart':
print('------restart page_collect successfully------')
worker.stop()  # minimal restart: stop the running daemon (if any), then start again
worker.start()
else:
raise ValueError('Invalid status args!')
else:
raise ValueError('Too many args input!')
if __name__ == '__main__':
main()