forked from lawongsta/scispark
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathhtmlPageChecker.py
254 lines (223 loc) · 8.17 KB
/
htmlPageChecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from pyvirtualdisplay import Display
from tika import parser
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from reppy.cache import RobotsCache
from reppy.exceptions import ServerError
import sys, os, time, getopt, re, collections, nltk
'''
Purpose:: To check the quality of website pages that have been released to the public
Requires:: selenium, PhantomJS, pyvirtualdisplay, tika, reppy, nltk
'''
class testHtmlPage(object):
    '''
    Purpose:: Run quality checks on an HTML page through a PhantomJS browser
              that is started inside a virtual display.
    Output:: Findings are printed and appended to htmlPageChecker.log in the
             current working directory.
    Note:: Call tearDown() when finished so the log file, the browser and the
           display are released.
    '''

    def __init__(self):
        '''
        Start the virtual display and the browser, then open the log file.

        If the browser or the log file cannot be opened, whatever was already
        started is shut down again before the error is re-raised.
        '''
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = None
        try:
            self.driver = webdriver.PhantomJS()
            #self.driver = webdriver.Firefox()
            self.verificationErrors = []
            self.accept_next_alert = True
            self.log = os.getcwd()+'/htmlPageChecker.log'
            self.f = open(self.log, 'ab+')
            self.NWORDS = ''
            self.alphabet = ''
        except Exception:
            # Do not leave a browser or display running behind a failed start.
            if self.driver is not None:
                self.driver.quit()
            self.display.stop()
            raise

    def is_element_present(self, what):
        '''
        Return the elements on the current page whose tag name is `what`,
        or False if the driver raises NoSuchElementException.
        '''
        try:
            return self.driver.find_elements_by_tag_name(what)
        except NoSuchElementException:
            return False

    def is_alert_present(self):
        '''Return True if the driver can switch to an alert, else False.'''
        try:
            self.driver.switch_to_alert()
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        '''
        Accept the current alert (or dismiss it when accept_next_alert is
        False) and return its text. accept_next_alert is always reset to
        True afterwards, even if handling the alert fails.
        '''
        try:
            alert = self.driver.switch_to_alert()
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            self.accept_next_alert = True

    def tearDown(self):
        '''
        Close the log file, quit the browser and stop the display.

        Each step runs even if an earlier one raises, so one failure does not
        leak the remaining resources; the error still propagates.
        '''
        try:
            self.f.close()
        finally:
            try:
                self.driver.quit()
            finally:
                self.display.stop()

    def check_img_alt_tags(self, page):
        '''
        Load `page` and check that every img tag has non-blank alt text.

        Returns True when all images pass or the page has no img tags, and
        False otherwise. If an image's attributes cannot be read the check
        stops at that image and returns False.
        '''
        driver = self.driver
        driver.implicitly_wait(3)
        driver.get(page)
        allImgTags = self.is_element_present('img')
        self.f.write('--- checking for alt text in %s\n' % page)
        if not allImgTags:
            print('This page has no img tags')
            self.f.write('This page has no img tags\n')
            return True

        altTagsPassed = True
        for img in allImgTags:
            try:
                currAltText = img.get_attribute('alt').encode('utf-8')
                if not (currAltText and currAltText.strip()):
                    src = img.get_attribute('src')
                    print('%s : img tag alt attribute missing text' % (src,))
                    self.f.write('%s :img tag alt attribute missing text\n' % (src,))
                    altTagsPassed = False
            except Exception:
                # e.g. get_attribute returned None, which has no encode().
                print('Some img tags are missing alt TEXT attribute. Please check!')
                self.f.write('Some img tags are missing alt TEXT attribute. Please check!\n')
                return False
        if altTagsPassed:
            print('img alt TEXT test cleared')
            self.f.write('img alt TEXT test cleared\n')
        return altTagsPassed

    def check_pdf_text(self, page):
        '''
        Load `page` and, if it links to a pdf, check that it also links to
        http://www.adobe.com and to https://get.adobe.com/reader.

        The outcome is printed and logged. Returns True, except that False is
        returned as soon as a link's href cannot be read.
        '''
        driver = self.driver
        driver.implicitly_wait(2)
        driver.get(page)
        allaTags = self.is_element_present('a')
        self.f.write('--- checking for pdf links in %s\n' % page)
        if not allaTags:
            print('This page has no a tags')
            self.f.write('This page has no a tags\n')
            return True

        pdf = False
        adobe = False
        adobeDload = False
        for a in allaTags:
            try:
                href = a.get_attribute('href')
                if href:
                    currhref = href.encode('utf-8').lower()
                    if '.pdf' in currhref:
                        pdf = True
                    elif 'http://www.adobe.com' in currhref:
                        adobe = True
                    elif 'https://get.adobe.com/reader' in currhref:
                        adobeDload = True
            except Exception:
                return False

        if pdf and not adobe and not adobeDload:
            message = 'pdf found but no mention of Adobe or download link for Adobe'
        elif pdf and adobe and not adobeDload:
            message = 'pdf found. Adobe is mentioned, but download link for Adobe missing'
        elif pdf and not adobe and adobeDload:
            message = 'pdf found. Link to download Adobe found, but not mention of Adobe needed'
        else:
            message = 'pdf check cleared'
        print(message)
        self.f.write(message + '\n')
        return True

    def check_for_robot_access(self, page):
        '''
        Ask reppy whether agent 'my-agent' may fetch `page` + 'robots.txt'.

        Returns True when access is allowed, and False when it is not allowed
        or the lookup raises ServerError.
        '''
        self.f.write('--- checking for robots %s\n' % page)
        robots = RobotsCache()
        try:
            allowed = robots.allowed(page+'robots.txt', 'my-agent')
        except ServerError as r:
            print('error  %s' % (r,))
            return False
        if allowed:
            print('robots allowed')
            self.f.write('robots allowed. \n')
            return True
        # Always answer with a boolean instead of falling off the end (None).
        return False

    def spell_checker(self, page, stopwordsList):
        '''
        Report every lower-case word of `page` that has no WordNet synset.

        The page text is extracted with tika. Words found in `stopwordsList`
        or in the NLTK English stopword list are skipped. `stopwordsList` is
        only read, never modified.
        '''
        driver = self.driver
        driver.implicitly_wait(2)
        driver.get(page)
        self.f.write('--- checking for spelling %s\n' % page)
        # tika may report no content at all; treat that as an empty page.
        content = parser.from_file(page)['content'] or ''
        allTextOnPage = re.findall('[a-z]+', content.encode('utf-8').lower())
        # Work on a private set so the caller's list does not grow on every
        # call and each membership test is a constant-time lookup.
        ignoredWords = set(stopwordsList)
        ignoredWords.update(stopwords.words('english'))
        for word in allTextOnPage:
            if word in ignoredWords:
                continue
            if not wordnet.synsets(word):
                print('Is this correct?  %s' % (word,))
                self.f.write('Is this word correct? %s\n' % (word,))

    def check_images_exist(self, page):
        '''
        Load `page` and check that every img tag's src names an existing file.

        The path that is tested is the text between the first and second ':'
        of the normalised src value. Returns True when every image passes or
        the page has no img tags, and False otherwise.
        '''
        driver = self.driver
        driver.implicitly_wait(2)
        driver.get(page)
        allImgTags = self.is_element_present('img')
        self.f.write('--- checking images references in %s\n' % page)
        if not allImgTags:
            print('This page has no img tags')
            self.f.write('%s has no img tags\n' % page)
            return True

        allSources = True
        for img in allImgTags:
            try:
                location = img.get_attribute('src').encode('utf-8')
                if not (location and location.strip()):
                    print('%s : image source text is missing' % (img.get_attribute('src'),))
                    self.f.write('image source text is missing. \n')
                    allSources = False
                elif not os.path.isfile((os.path.normpath(location)).split(':')[1]):
                    print('%s : image source is missing' % (img.get_attribute('src'),))
                    self.f.write('%s :image source missing text\n' % (img.get_attribute('src'),))
                    allSources = False
            except Exception:
                # Reached when src is absent (None) or contains no ':'.
                print('%s  img tag is missing src attribute. Please check!' % (img,))
                self.f.write('%s img tags is missing src attribute. Please check!\n' % (img,))
                allSources = False
        if allSources:
            print('image sources exists test cleared')
            self.f.write('image sources exists test cleared\n')
        return allSources
def main(argv):
    '''
    Run the page checks on every page named on the command line.

    argv:: command-line arguments without the program name.
           -h prints the usage line and exits.
           -p takes either a comma-separated list of pages, or a
           '<directory>*' pattern that selects every entry of <directory>
           whose name contains 'html'.

    Words listed in specialDict.txt (one per line, read from the current
    working directory) are passed to the spell checker as words to skip.
    '''
    usage = 'python htmlPageChecker.py -p <[htmlPage]>'
    # Python 2 only: make utf8 the interpreter-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')
    pages = []
    baseUrl = 'https://scispark.jpl.nasa.gov/'
    with open('specialDict.txt') as dictFile:
        dictionary = [line.rstrip() for line in dictFile]

    try:
        opts, args = getopt.getopt(argv, "hp:")
    except getopt.GetoptError:
        print(usage)
        opts = []

    if len(opts) == 1:
        for opt, arg in opts:
            if opt == '-h':
                # SystemExit is deliberately not caught here, so -h really exits.
                print(usage)
                sys.exit()
            elif opt == '-p':
                if '*' in arg:
                    folder = arg.split('*')[0]
                    try:
                        pages = [folder+'/'+i for i in os.listdir(folder) if 'html' in i]
                    except OSError:
                        # The directory could not be listed; nothing to check.
                        print(usage)
                else:
                    pages = arg.split(',')

    # #--- for the robot.txt check ---
    # t = testHtmlPage()
    # testHtmlPage.check_for_robot_access(t, baseUrl)
    # testHtmlPage.tearDown(t)
    # #---
    for eachPage in pages:
        print('working on  %s' % (eachPage,))
        t = testHtmlPage()
        try:
            t.check_images_exist(eachPage)
            t.check_img_alt_tags(eachPage)
            t.check_pdf_text(eachPage)
            t.spell_checker(eachPage, dictionary)
        finally:
            # Release the log, browser and display even if a check fails.
            t.tearDown()
if __name__ == "__main__":
    # Hand main() the arguments without the program name.
    main(sys.argv[1:])