-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDomainScrape.py
86 lines (78 loc) · 2.56 KB
/
DomainScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
A script that scrapes for optimal, available domains.

@author Matt Garbis

Dependencies:
    word frequency list (one 'rank\tword\tcount' entry per line)
    list of TLDs (one per line)
"""
import heapq
import time
from datetime import timedelta, datetime
from urllib2 import urlopen
import urllib2
import sys
import socket
def run(FreqFile = '10000mostfreqwords.txt', tldFile = 'tld.txt'):
    """Run the whole pipeline and print the domains that look unused.

    Args:
        FreqFile: path to the word frequency list handed to createFreqHeap.
        tldFile: path to the TLD list handed to createTLDdict.
    """
    ranked_words = createFreqHeap(FreqFile)
    known_tlds = createTLDdict(tldFile)
    candidates = getDomains(ranked_words, known_tlds)
    # Probe the candidates for at most one minute.
    available = getUnusedDomains(candidates, 1)
    print(available)
def createFreqHeap(FreqFile = '10000mostfreqwords.txt'):
    """Build a min-heap of (rank, word) tuples from a word frequency file.

    Each useful line of FreqFile is expected to hold tab-separated fields
    in the order rank, word, count; only the first two are used.

    Args:
        FreqFile: path to the tab-separated frequency list.

    Returns:
        A list arranged as a heapq min-heap of (int rank, str word) tuples,
        so heapq.heappop yields the lowest-numbered rank first.

    Raises:
        IOError / OSError: if FreqFile cannot be opened.
    """
    word_by_freq = []
    with open(FreqFile) as freq:
        for line in freq:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                # Blank or malformed line: no rank/word pair to read.
                continue
            try:
                # Ranks must compare numerically; as strings '10' would
                # sort ahead of '2' and scramble the frequency order.
                rank = int(fields[0])
            except ValueError:
                # Non-numeric rank (e.g. a header row): nothing to order by.
                continue
            heapq.heappush(word_by_freq, (rank, fields[1]))
    return word_by_freq
def createTLDdict(tldFile = 'tld.txt'):
    """Load the known TLDs from a file that lists one TLD per line.

    Entries are lower-cased and stripped of surrounding whitespace
    (including a trailing carriage return from CRLF files). Blank lines
    and lines starting with '#' are ignored.

    Args:
        tldFile: path to the TLD list.

    Returns:
        A dict mapping each TLD string to True, for constant-time lookup.

    Raises:
        IOError / OSError: if tldFile cannot be opened.
    """
    tlds = dict()
    with open(tldFile) as tld:
        for line in tld:
            entry = line.strip().lower()
            if not entry or entry.startswith('#'):
                # Skip blanks and comment/header lines; they are not TLDs.
                continue
            tlds[entry] = True
    return tlds
def getDomains(word_by_freq, tlds):
    """Turn words whose ending is a TLD into candidate domain names.

    Words are taken from the heap in rank order. For each word, the last
    two and then the last three letters are looked up in tlds; every hit
    yields '<rest of word>.<suffix>'. The two checks are independent, so
    a word may contribute zero, one or two domains.

    Args:
        word_by_freq: heapq min-heap of (rank, word) tuples. It is
            consumed: the heap is empty when this function returns.
        tlds: dict mapping known TLD strings to True.

    Returns:
        List of candidate domain strings, ordered by word rank.
    """
    domainlist = []
    while word_by_freq:
        word = heapq.heappop(word_by_freq)[1]
        for size in (2, 3):
            suffix = word[-size:]
            # Keep at least three characters to the left of the dot.
            if len(word) >= size + 3 and tlds.get(suffix):
                domainlist.append(word[:-size] + '.' + suffix)
    return domainlist
def getUnusedDomains(domainlist, minlimit = 1):
    """Probe candidate domains over HTTP and keep those refusing connections.

    Each domain is requested as 'http://<domain>' with an 8 second timeout.
    A domain is kept when the request fails with "connection refused" and
    the time budget has not run out. Any other outcome (success, timeout,
    DNS failure, HTTP error, ...) is printed and the domain is dropped.

    Args:
        domainlist: candidate domain strings, probed in list order. The
            list is not modified.
        minlimit: time budget in minutes; probing stops once it is spent.

    Returns:
        List of the domains that refused the connection, in probe order.
    """
    # Local import keeps this block self-contained; errno supplies the
    # platform's own ECONNREFUSED value (61 on macOS/BSD, 111 on Linux).
    import errno

    finaldomains = []
    minlim = datetime.now() + timedelta(minutes=minlimit)
    for domain in domainlist:
        if tooLong(minlim, minlimit):
            break
        print(domain)
        req = urllib2.Request('http://' + domain)
        try:
            # A per-request timeout avoids changing the process-wide socket
            # default; closing the response releases the connection.
            urlopen(req, timeout=8).close()
        except Exception as e:
            # URLError carries the underlying cause in .reason; any other
            # exception is its own reason.
            reason = getattr(e, 'reason', e)
            print(reason)
            refused = getattr(reason, 'errno', None) == errno.ECONNREFUSED
            if refused and not tooLong(minlim, minlimit):
                finaldomains.append(domain)
    return finaldomains
def tooLong(minlim, mins):
    """Report whether the deadline has passed and print progress.

    Writes the elapsed share of the time budget to stdout as a percentage
    followed by a carriage return, so repeated calls overwrite one line.

    Args:
        minlim: datetime at which the time budget ends.
        mins: length of the whole budget in minutes, used to scale the
            percentage. A value of zero or less is tolerated.

    Returns:
        True if the current time is later than minlim, otherwise False.
    """
    # Read the clock once so the percentage and the verdict agree.
    now = datetime.now()
    expired = now > minlim
    if mins > 0:
        remaining = (minlim - now).total_seconds()
        percent = (1 - remaining / (60.0 * mins)) * 100
        # Clamp so an overrun request never prints more than 100%.
        percent = max(0, min(100, percent))
    else:
        # No budget to scale against; avoid dividing by zero.
        percent = 100 if expired else 0
    sys.stdout.write("%3d%%\r" % percent)
    sys.stdout.flush()
    return expired
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    run()