-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscanner.py
More file actions
95 lines (70 loc) · 2.55 KB
/
scanner.py
File metadata and controls
95 lines (70 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import lxml.html
import itertools
import requests
import urllib.parse
import traceback
import argparse
from util.lists import *
from util.queue import *
from util.nhash import *
from util.urls import *
def find_links(tree, url):
    """Yield normalized absolute URLs referenced by an HTML/XML tree.

    Collects href, action and src attribute values; any reference
    without a scheme is treated as relative and resolved against
    *url* before being passed through urlnorm().
    """
    attr_paths = ('//@href', '//@action', '//@src')
    refs = itertools.chain.from_iterable(tree.xpath(p) for p in attr_paths)
    for ref in refs:
        # A scheme-less reference is relative; anchor it to the page URL.
        if not urllib.parse.urlparse(ref).scheme:
            ref = urllib.parse.urljoin(url, ref)
        yield urlnorm(ref)
class Scanner(object):
    """Breadth-first same-site web crawler with a content-hash duplicate
    filter and an optional wordlist bruteforce pass.

    Subclasses override on_request / on_response to observe the crawl.
    """

    def __init__(self, base=None):
        # Queue of URLs yet to fetch; urlnorm deduplicates equivalent URLs.
        self.urls = UniqueQueue()
        self.urls.norm = urlnorm
        # Base URL used to keep the crawl on one site; defaults to the
        # first URL passed to scan().
        self.base = base
        self.session = requests.session()
        # Maps content simhash -> set of URLs that produced that content.
        self.crawled = {}

    def on_request(self, url):
        """Hook called just before each URL is fetched; override in subclasses."""
        pass

    def on_response(self, response, chash, unique):
        """Hook called for each OK response; override in subclasses.

        *unique* is True when this content hash has not been seen before.
        """
        pass

    def add_crawled(self, chash, url):
        """Record *url* as a source of the content hash *chash*."""
        self.crawled.setdefault(chash, set()).add(url)

    def was_crawled(self, chash):
        """Return True if content with hash *chash* was already seen."""
        return chash in self.crawled

    def scan(self, url, wordlist=None):
        """Crawl starting at *url*, then bruteforce *wordlist* paths.

        Crawling is breadth-first and restricted to URLs that share
        self.base. Afterwards, each word in *wordlist* is joined onto
        every discovered path and scanned, repeating until the set of
        content hashes stops growing.
        """
        if self.base is None:
            self.base = url
        if wordlist is None:
            wordlist = []
        self.urls.put(url)
        while not self.urls.empty():
            try:
                url = self.urls.get()
                self.on_request(url)
                respn = self.session.get(url)
                chash = simhash(respn.content)
                if respn.ok:
                    self.on_response(response=respn, chash=chash,
                                     unique=(not self.was_crawled(chash)))
                    self.add_crawled(chash, respn.url)
                    # A missing Content-Type header must not abort the page:
                    # default to '' so the substring tests below are safe
                    # (headers.get() would otherwise return None and the
                    # 'in' test would raise TypeError).
                    ctype = respn.headers.get('content-type', '')
                    links = []
                    if 'html' in ctype or 'xml' in ctype:
                        xtree = lxml.html.fromstring(respn.text)
                        links = itertools.chain(links, find_links(xtree, url))
                    links = itertools.chain(links, urlprefixes(url))
                    for link in links:
                        # Stay on-site: only enqueue URLs under self.base.
                        if urlbase(self.base, link):
                            self.urls.put(link)
            except Exception:
                # Best-effort crawl: log the failure and continue with the
                # next queued URL instead of aborting the whole scan.
                traceback.print_exc()
        # Bruteforce pass: fixed-point iteration until no new content
        # hashes appear. NOTE(review): m starts as len(wordlist) purely so
        # the loop is skipped when the wordlist is empty — TODO confirm.
        m = len(wordlist)
        n = 0
        while m != n:
            m = len(self.crawled)
            for path in set(flatten(self.crawled.values())):
                for word in wordlist:
                    # Pass an empty wordlist to avoid recursive bruteforcing.
                    self.scan(urllib.parse.urljoin(path, word), [])
            n = len(self.crawled)