"""
This script receives a list of URLs and looks up the category of each one in the Fortinet (FortiGuard) database.
It uses a list of HTTP proxies to bypass FortiGuard's rate limit of 10 requests per minute, 200 per hour and 500 per day.
Tested: with a list of 400 open proxies, 3000 URLs can be categorized in 10 minutes.
URLs should be entered one per line.
# cleanup file from PA:
# $ grep ^URL domains_20141021-30.txt | cut -f2 -d\" | cut -f1 -d\/ | grep -Ev "\b([0-9]{1,3}\.){3}[0-9]{1,3}\b" | sort -u | head
"""
import os
import re
import argparse
from Queue import Queue
import urllib
import urllib2
import sys
import multiprocessing
import cookielib
import random
import pickle
import shutil
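# Note: Python 2 only - urllib2, cookielib and Queue were renamed/merged into
# urllib.request, http.cookiejar and queue in Python 3.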
parser = argparse.ArgumentParser(description="This script receives a list of Unknown URLs from the Palo Alto DB,\nsearches for the categories in the Fortinet database and generates a bulk-formatted list to be submitted to PA\n")
parser.add_argument("-f", "--url-list", dest="urlList", help="plain text file containing URLs, one URL per line", metavar="FILE", required=True)
parser.add_argument("-p", "--proxy-list", dest="proxyList", help="proxy list file. Get one at your preferred free public proxy list e.g. www.gatherproxy.com. Format: proxy:port, one per line", metavar="FILE", required=True)
parser.add_argument("-c", "--captcha", dest="captchaCode", help="The value of CaptchaCode", required=False)
parser.add_argument("-s", "--session-id", dest="sessionID", help="The value of LBD_VCID_LoginCaptcha", required=False)
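# Example invocation (the file names below are placeholders, not files shipped with the script):
#   python url_cat_scrapper.py -f unknown_urls.txt -p proxy_list.txt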
"""" func """
# pickle an object. WARNING: this will overwrite the dst file!
def pickleObject(Object, pickledFile):
    try:
        pickledData = [Object]
        with open(pickledFile, "wb") as f:
            pickle.dump(pickledData, f)
        print "[*] Pickled to " + pickledFile + "."
    except Exception:
        print "[I] Bummer - couldn't pickle to " + pickledFile + ". Resume won't be possible (this is weird..)"
# check if a file exists; if not, create it ("touch")
def checkFileExistence(filename):
    try:
        if not os.path.isfile(filename):
            print "[*] " + filename + " not found. Touching " + filename + " ..."
            f = open(filename, "a")
            f.close()
            return 1
        else:
            print "[I] " + filename + " found. Skipping touch."
            return 1
    except Exception:
        print "[I] Failed to create " + filename + ". Write access?"
        return 0
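# Worker run by the pool: pops one URL from the shared list, fetches its category from
# FortiGuard through the given proxy and records it in urlscategorized; on any failure
# the URL is pushed back onto the list so another proxy can retry it.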
def fetchCategory(proxy, urllist, urlscategorized, categorizedurlsfromdb):
    try:
        url = urllist.pop()
    except IndexError:
        #print "[I] Urllist empty - nothing to do. Returning.."
        return 0
    if url in categorizedurlsfromdb:
        print "[*] " + url + ": URL already categorized. Doing nothing..."
        return 1  # nothing to do, URL is already categorized, success
    else:
        try:
            print "[*] " + url + ": retrieving category via " + proxy + " ..."
            cj = cookielib.CookieJar()
            proxy_handler = urllib2.ProxyHandler({'http': 'http://' + proxy})
            opener = urllib2.build_opener(proxy_handler, urllib2.HTTPCookieProcessor(cj))
            user_agent = random.choice(open(USER_AGENT_FILE).readlines()).strip()
            opener.addheaders = [('User-agent', user_agent)]
            req = opener.open("http://www.fortiguard.com/ip_rep/index.php?data=" + url + "?", None, timeout=TIMEOUT)
            html_response = req.read()
            regex = '<h3 style="float: left">Category: ([a-zA-Z\- ]+)</h3>'
            reg_match = re.search(regex, html_response, re.MULTILINE)
            #print url, #debug purpose
            #print reg_match #debug purpose
            if reg_match:
                category = reg_match.group(1)
                # print "[*] Found! " + url + " = " + category #debug purpose
                urlscategorized[url] = category
                return 1
            else:  # couldn't match, assuming proxy issue
                urllist.append(url)
                return 0
            #print "proxy = " + proxy + "\t url = " + url + "\tmessage = " + category #debug purpose
        except Exception, err:
            urllist.append(url)
            print "[I] exception: proxy = " + proxy + "\t url = " + url + "\tmessage = " + str(err)
            return 0
""" main """
args = parser.parse_args()
urlList_pickledFile = args.urlList + ".pickled"
localURLCatdb_pickledFile = "localURLCatdb.pickled"
TIMEOUT = 10
USER_AGENT_FILE = 'user-agent_list.txt'
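# user-agent_list.txt is expected in the working directory, one User-Agent string per line;
# fetchCategory() picks a random entry for every request.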
manager = multiprocessing.Manager()
urlList = manager.list()
urlsCategorized = manager.dict()
proxyQueue = Queue()
restoredDict = dict()
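# urlList and urlsCategorized are manager-backed objects so the pool workers in
# fetchCategory() share them with the parent process; restoredDict collects the
# URL/category pairs recovered from a previous run.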
# load all urls from file into a list
try:
    with open(args.urlList, 'rb') as h:
        for line in h:
            li = line.strip()
            if not li.startswith("#"):
                urlList.append(li)
except Exception:
    print "[I] Error opening/reading " + str(args.urlList) + ". Exiting..."
    exit()
# if the url list is empty, quit
if not urlList:
    print "[*] " + str(args.urlList) + " is empty. Exiting.."
    exit()
# load all proxies from file into a queue
try:
    with open(args.proxyList, "rb") as g:
        for line in g:
            li = line.strip()
            if not li.startswith("#"):
                proxyQueue.put(li)
except Exception:
    print "[I] Error opening/reading " + str(args.proxyList) + ". Exiting..."
    exit()
# if the queue is empty, quit
if proxyQueue.empty():
    print "[I] " + str(args.proxyList) + " is empty. Exiting.."
    exit()
# load all known urls/categories
try:
    f = open(localURLCatdb_pickledFile, 'rb')
    while 1:
        try:
            temp_dict = pickle.load(f)[0]
            restoredDict.update(temp_dict)
        except EOFError:
            break
        except Exception, err:
            print "[I] General Except: " + str(err)
            break
    f.close()
    categorizedURLsfromDB = manager.dict(restoredDict)
except Exception:
    print "[I] Unable to load " + localURLCatdb_pickledFile + " (okay for a first run)"
    categorizedURLsfromDB = manager.dict()  # must be a manager dict so the pool workers can read it
# backing up the original file (urllist)
try:
    if not os.path.isfile(args.urlList + '.bkp'):
        print "[*] Backing up " + str(args.urlList) + " ..."
        shutil.copyfile(args.urlList, args.urlList + '.bkp')
        print "[*] Done backing " + str(args.urlList) + " up..."
    else:
        print "[*] Backup file found for " + str(args.urlList) + ". Skipping backup..."
except Exception:
    print "[I] Failed to backup " + str(args.urlList) + " (write access?)"
    exit()
# the main process
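# Each proxy pulled from the queue is given at most 10 lookups, matching the
# 10-requests-per-minute limit per source mentioned in the module docstring.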
pool = multiprocessing.Pool(multiprocessing.cpu_count() * 10)
while not proxyQueue.empty():
    if not urlList:
        break  # no URLs left to hand out; stop consuming proxies instead of spinning forever
    proxy_candidate = proxyQueue.get()
    for n in range(0, 10):
        pool.apply_async(fetchCategory, args=(proxy_candidate, urlList, urlsCategorized, categorizedURLsfromDB,))
pool.close()
pool.join()
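# Persist the results: <urlList>.pickled holds this run's lookups, while localURLCatdb.pickled
# accumulates every category seen so far and is reloaded on the next run.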
backupDict = dict(urlsCategorized)
if checkFileExistence(urlList_pickledFile):
    pickleObject(backupDict, urlList_pickledFile)
backupDict.update(dict(categorizedURLsfromDB))  # could also iterate urlsCategorized and add the keys:values
if checkFileExistence(localURLCatdb_pickledFile):
    pickleObject(backupDict, localURLCatdb_pickledFile)
# writes the remaining urls that were not processed
# this effectively overwrites the original file
try:
    with open(args.urlList, 'w') as f:
        for s in urlList:
            f.write(s + "\n")
    print "[*] Successfully updated " + str(args.urlList)
except Exception:
    print "[I] Error while updating " + str(args.urlList)
manager.shutdown()
del manager