crawler.py
#Name: Will Shellabarger
#uniqname: wnshell
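"""Breadth-first crawler for the onelinefun joke site.

Reads seed URLs from a file, follows only in-domain links, and writes each
scraped one-liner and its tags to jokes.txt as one "joke|tags" line; the
list of crawled URLs is saved to crawler.output.
"""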
from bs4 import BeautifulSoup
import sys
import time
import urllib2
import urlparse
def appropriateJoke(tags):
    """Return False if the joke's tag string contains a blacklisted word."""
    return not any(word in tags for word in ("racist", "dirty", "sex"))
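#for example, appropriateJoke("funny, dirty") returns False,
#while appropriateJoke("funny, animal") returns True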
if __name__ == "__main__":
    #timekeeping
    start_time = time.time()
    #seed-URL file and the number of pages to stop at
    seedFile = sys.argv[1]
    numToCrawl = int(sys.argv[2])
    #URL frontier to try
    urlList = []
    #URLs that were actually visited
    visited = []
    #output file for the scraped jokes and their tags
    jokes = open('jokes.txt', 'w')
    #place all seed URLs into the frontier
    with open(seedFile) as f:
        urlList = [line.strip() for line in f]
    #work through the URL frontier, stopping at the page limit
    #or once 3000 unique jokes have been collected
    i = 0
    jokeIDs = []
    while i < len(urlList) and len(visited) < numToCrawl and len(jokeIDs) < 3000:
        #progress report
        if len(visited) % 100 == 0:
            print "---- " + str(len(visited)) + " pages visited ----\n"
        #cache the current url to avoid repeated list indexing
        url = urlList[i]
        #use a timeout so slow hosts don't stall the crawler
        try:
            webPage = urllib2.urlopen(url, timeout=3.05)
        except Exception:
            #if http doesn't work, retry once over https
            try:
                urlSecure = url.replace('http', 'https', 1)
                webPage = urllib2.urlopen(urlSecure, timeout=3.05)
            except Exception:
                #skip this page if it can't be opened at all
                i += 1
                continue
        #only parse HTML pages; getheader() can return None, so default to ''
        contentType = webPage.headers.getheader('Content-Type') or ''
        if 'html' in contentType:
            content = webPage.read()
            soup = BeautifulSoup(content, 'html.parser')
            #record the page as actually visited
            visited.append(url)
            for joke in soup.find_all("div", {"class": "oneliner"}):
                jokeID = joke.find('b').get('id')
                #filter out inappropriate jokes by their tag string
                tags = joke.find('span', {'class': 'links'})
                jokeTags = tags.text.replace("Tags: ", "").encode('utf-8').strip()
                if appropriateJoke(jokeTags) and jokeID not in jokeIDs:
                    jokeIDs.append(jokeID)
                    #write one "joke|tags" pair per line
                    jokes.write(joke.find('p').text.encode('utf-8').strip() + "|")
                    jokes.write(jokeTags + '\n')
            #queue every hyperlink found on this page
            for link in soup.find_all('a', href=True):
                #resolve relative paths against the current URL
                link = urlparse.urljoin(url, link['href'])
                #remove fragments
                link = urlparse.urldefrag(link)[0]
                #normalize to http and drop any trailing slash
                link = link.replace('https', 'http', 1).rstrip('/')
                #bs4 returns unicode; store plain byte strings
                link = link.encode('utf-8')
                #stay on the onelinefun domain and skip links already queued
                if 'onelinefun' in urlparse.urlparse(link).netloc and link not in urlList:
                    #append to the end of the list for breadth-first order
                    urlList.append(link)
        i += 1
    jokes.close()
    #record every URL that was successfully crawled
    with open("crawler.output", "w") as out:
        for link in visited:
            out.write(link + '\n')
    print "---- " + str(len(visited)) + " pages visited ----"
    print "--- %s seconds ---" % (time.time() - start_time)
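#example invocation (assuming seeds.txt holds one onelinefun URL per line):
#  python crawler.py seeds.txt 500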