-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCrawler.py
40 lines (31 loc) · 1.07 KB
/
Crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# coding=utf-8
__author__ = 'kasper'
# Crawler
import requests as req
import time
from Log import Log
class Crawler:
    """Fetch pages by sequential numeric id and store each one as a local
    ``<id>.html`` file, logging failures to ``logfile.log``."""

    def __init__(self, url, chPart, offset, ds):
        # url:    base URL; the page id (and a trailing '/') is appended to it.
        # chPart: first page id that crawl() will fetch.
        # offset: how many consecutive ids crawl() fetches, starting at chPart.
        # ds:     path prefix (directory) where fetched pages are written.
        self.url = url
        self.chPart = chPart
        self.offset = offset
        self.docStore = ds
        self.log = Log('logfile.log')

    # Request the page for `pgid` from the server; return its raw content
    # (bytes), or None when the server reports an error or the request fails.
    def request(self, pgid):
        print(self.url + str(pgid))
        try:
            page = req.get(self.url + str(pgid) + '/')
            # Server signals a missing page by responding with the literal
            # body "Error"; page.content is bytes, so compare against bytes.
            if page.content == b'Error':
                self.log.log(str(time.time()) + ' - Request to file ' + str(pgid) + ' failed.')
                return None
            # 'wb' + with-statement: content is bytes, and the original
            # `file.close` (no parentheses) never actually closed the handle.
            with open(self.docStore + str(pgid) + '.html', 'wb') as out:
                out.write(page.content)
            return page.content
        except req.exceptions.RequestException as e:
            # Network/HTTP failures come from requests as RequestException
            # subclasses (the original caught IndexError, which requests never
            # raises, and then crashed on `page.content` with page == None).
            self.log.log(str(time.time()) + ' - Request to file ' + str(pgid) + ' failed. ' + str(e.args))
            return None

    # Fetch `offset` consecutive pages starting at id `chPart`.
    def crawl(self):
        for i in range(self.offset):
            self.request(self.chPart + i)