-
Notifications
You must be signed in to change notification settings - Fork 1
/
getdata.py
61 lines (55 loc) · 1.78 KB
/
getdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pickle
from time import sleep
from argparse import ArgumentParser
import netkeiba
def parse_args(argv=None):
    """Parse the command-line options of the crawler script.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The argparse namespace with ``max_page`` (int, default -1),
        ``interval`` (int, default 1) and ``output`` (str or None).
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--max_page', type=int, dest='max_page', default=-1)
    arg_parser.add_argument('-i', '--interval', type=int, dest='interval', default=1)
    arg_parser.add_argument('-o', '--output', type=str, dest='output', default=None)  # horsedb.pkl
    return arg_parser.parse_args(argv)


def _fetch(fetch_page):
    """Call *fetch_page* and return its result, or None if it raised."""
    try:
        return fetch_page()
    except Exception as e:
        # Deliberately best-effort: a failed request is reported and skipped.
        # Python 3 exceptions have no ``.message`` attribute, so print the
        # exception object itself.
        print(e)
        return None


def _collect_page(horse_parser, html, data):
    """Parse one list page, append its records to *data*, print progress."""
    data_in_page = horse_parser.parse_horse_list(html)
    if data_in_page:
        data.extend(data_in_page)
        for d in data_in_page:
            print(d['name'])
    print('Number of data=', len(data))


def crawl(crawler, horse_parser, max_page=-1, interval=1):
    """Walk the horse list pages and gather the parsed records.

    Args:
        crawler: Object providing ``getFirstPage()``, ``haveNextPage()`` and
            ``getNextPage()``.
        horse_parser: Object providing ``parse_horse_list(html)``.
        max_page: Stop once this many pages were requested; a negative value
            means no limit. The first page is always requested.
        interval: Seconds to sleep before each follow-up request.

    Returns:
        A list with the records of every page that could be fetched and parsed.
    """
    data = []
    page = 1
    print('Getting page', page, '...')
    html = _fetch(crawler.getFirstPage)
    if not html:
        # NOTE(review): assumes pagination should only start after the first
        # page was fetched successfully - confirm against the crawler class.
        return data
    _collect_page(horse_parser, html, data)

    if max_page >= 0 and page >= max_page:
        return data

    while crawler.haveNextPage():
        sleep(interval)
        page += 1
        print('Getting page', page, '...')
        html = _fetch(crawler.getNextPage)
        if html is not None:
            _collect_page(horse_parser, html, data)
        if max_page >= 0 and page >= max_page:
            break
    return data


def save_data(data, path):
    """Pickle *data* into the file at *path*."""
    # pickle emits bytes, so the file has to be opened in binary mode.
    with open(path, "wb") as f:
        pickle.dump(data, f)


def main(argv=None):
    """Crawl the horse list and optionally pickle the result (see --output)."""
    args = parse_args(argv)
    crawler = netkeiba.HorseListPageCrawler(listSize=100)
    # Named differently from the argument parser so the two cannot be confused.
    horse_parser = netkeiba.HorseDataParser()
    data = crawl(crawler, horse_parser,
                 max_page=args.max_page, interval=args.interval)
    if args.output:
        save_data(data, args.output)
    print('done.')


if __name__ == '__main__':
    main()