-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscanner.py
More file actions
95 lines (70 loc) · 2.55 KB
/
scanner.py
File metadata and controls
95 lines (70 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import lxml.html
import itertools
import requests
import urllib.parse
import traceback
import argparse
from util.lists import *
from util.queue import *
from util.nhash import *
from util.urls import *
def find_links(tree, url):
    """Yield normalized absolute URLs referenced by an HTML/XML tree.

    Collects href, action and src attribute values; any reference
    without a scheme is treated as relative and resolved against
    *url* before being passed through urlnorm().
    """
    attr_paths = ('//@href', '//@action', '//@src')
    refs = itertools.chain.from_iterable(tree.xpath(p) for p in attr_paths)
    for ref in refs:
        # A scheme-less reference is relative; anchor it to the page URL.
        if not urllib.parse.urlparse(ref).scheme:
            ref = urllib.parse.urljoin(url, ref)
        yield urlnorm(ref)
class Scanner(object):
    """Breadth-first same-site web crawler with a content-hash duplicate
    filter and an optional wordlist bruteforce pass.

    Subclasses override on_request / on_response to observe the crawl.
    """

    def __init__(self, base=None):
        # Queue of URLs yet to fetch; urlnorm deduplicates equivalent URLs.
        self.urls = UniqueQueue()
        self.urls.norm = urlnorm
        # Base URL used to keep the crawl on one site; defaults to the
        # first URL passed to scan().
        self.base = base
        self.session = requests.session()
        # Maps content simhash -> set of URLs that produced that content.
        self.crawled = {}

    def on_request(self, url):
        """Hook called just before each URL is fetched; override in subclasses."""
        pass

    def on_response(self, response, chash, unique):
        """Hook called for each OK response; override in subclasses.

        *unique* is True when this content hash has not been seen before.
        """
        pass

    def add_crawled(self, chash, url):
        """Record *url* as a source of the content hash *chash*."""
        self.crawled.setdefault(chash, set()).add(url)

    def was_crawled(self, chash):
        """Return True if content with hash *chash* was already seen."""
        return chash in self.crawled

    def scan(self, url, wordlist=None):
        """Crawl starting at *url*, then bruteforce *wordlist* paths.

        Crawling is breadth-first and restricted to URLs that share
        self.base. Afterwards, each word in *wordlist* is joined onto
        every discovered path and scanned, repeating until the set of
        content hashes stops growing.
        """
        if self.base is None:
            self.base = url
        if wordlist is None:
            wordlist = []
        self.urls.put(url)
        while not self.urls.empty():
            try:
                url = self.urls.get()
                self.on_request(url)
                respn = self.session.get(url)
                chash = simhash(respn.content)
                if respn.ok:
                    self.on_response(response=respn, chash=chash,
                                     unique=(not self.was_crawled(chash)))
                    self.add_crawled(chash, respn.url)
                    # A missing Content-Type header must not abort the page:
                    # default to '' so the substring tests below are safe
                    # (headers.get() would otherwise return None and the
                    # 'in' test would raise TypeError).
                    ctype = respn.headers.get('content-type', '')
                    links = []
                    if 'html' in ctype or 'xml' in ctype:
                        xtree = lxml.html.fromstring(respn.text)
                        links = itertools.chain(links, find_links(xtree, url))
                    links = itertools.chain(links, urlprefixes(url))
                    for link in links:
                        # Stay on-site: only enqueue URLs under self.base.
                        if urlbase(self.base, link):
                            self.urls.put(link)
            except Exception:
                # Best-effort crawl: log the failure and continue with the
                # next queued URL instead of aborting the whole scan.
                traceback.print_exc()
        # Bruteforce pass: fixed-point iteration until no new content
        # hashes appear. NOTE(review): m starts as len(wordlist) purely so
        # the loop is skipped when the wordlist is empty — TODO confirm.
        m = len(wordlist)
        n = 0
        while m != n:
            m = len(self.crawled)
            for path in set(flatten(self.crawled.values())):
                for word in wordlist:
                    # Pass an empty wordlist to avoid recursive bruteforcing.
                    self.scan(urllib.parse.urljoin(path, word), [])
            n = len(self.crawled)