Skip to content

Commit

Permalink
Fixed crash when dumping an online test as a teacher
Browse files Browse the repository at this point in the history
  • Loading branch information
bartvbl committed Jun 26, 2017
1 parent e9b7e01 commit 9eb9ee3
Showing 1 changed file with 137 additions and 131 deletions.
268 changes: 137 additions & 131 deletions scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,170 +1021,176 @@ def processOnlineTest(institution, pathThusFar, nttUrl, nttID, session):
dumpDirectory = sanitisePath(dumpDirectory)
dumpDirectory = makeDirectories(dumpDirectory)

results_root_element = online_test_document.get_element_by_id('ctl39_ResultsTable_table')
test_info_elements = online_test_document.find_class('itsl-detailed-info')
has_submitted_answer = True
try:
results_root_element = online_test_document.get_element_by_id('ctl39_ResultsTable_table')
except Exception:
has_submitted_answer = False


# Extracting test information
info_file_contents = ''
if has_submitted_answer:
test_info_elements = online_test_document.find_class('itsl-detailed-info')

for info_element in test_info_elements:
for info_list_element in info_element:
info_file_contents += info_list_element[0].text_content() + ' ' + info_list_element[1].text_content() + '\n'
# Extracting test information
info_file_contents = ''

bytesToTextFile(info_file_contents.encode('utf-8'), dumpDirectory + '/Test Information' + output_text_extension)
for info_element in test_info_elements:
for info_list_element in info_element:
info_file_contents += info_list_element[0].text_content() + ' ' + info_list_element[1].text_content() + '\n'

# Download test answers
bytesToTextFile(info_file_contents.encode('utf-8'), dumpDirectory + '/Test Information' + output_text_extension)

table_headers = []
for index, table_row in enumerate(results_root_element):
# results_root_element[0] is a <caption> element
if index == 0:
continue
# results_root_element[1] contains table headers
elif index == 1:
for table_header in results_root_element[1]:
table_headers.append(table_header.text_content())
continue
# The remainder of the table rows are attempts we want to save.
attempt_file_contents = ''

attempt_index = index - 1
details_URL = None

for cell_index, table_cell in enumerate(table_row):
table_cell_name = table_headers[cell_index]
table_cell_content = table_cell.text_content()
if table_cell_name == 'Details':
details_URL = table_cell[0].get('href')
else:
attempt_file_contents += table_cell_name + ': ' + table_cell_content + '\n'

# Only dumping the details afterwards so that we get a nice header in the output file containing the attempt details.
if details_URL is not None:
details_page_response = session.get(itslearning_root_url[institution] + details_URL, allow_redirects=True)
details_page_document = fromstring(details_page_response.text)
# Download test answers

table_headers = []
for index, table_row in enumerate(results_root_element):
# results_root_element[0] is a <caption> element
if index == 0:
continue
# results_root_element[1] contains table headers
elif index == 1:
for table_header in results_root_element[1]:
table_headers.append(table_header.text_content())
continue
# The remainder of the table rows are attempts we want to save.
attempt_file_contents = ''

attempt_file_contents += '\n'
attempt_index = index - 1
details_URL = None

for cell_index, table_cell in enumerate(table_row):
table_cell_name = table_headers[cell_index]
table_cell_content = table_cell.text_content()
if table_cell_name == 'Details':
details_URL = table_cell[0].get('href')
else:
attempt_file_contents += table_cell_name + ': ' + table_cell_content + '\n'

# Only dumping the details afterwards so that we get a nice header in the output file containing the attempt details.
if details_URL is not None:
details_page_response = session.get(itslearning_root_url[institution] + details_URL, allow_redirects=True)
details_page_document = fromstring(details_page_response.text)

attempt_file_contents += '\n'

# Dumping result data shown on page
for result_details_element in details_page_document.find_class('ntt-test-result-status-label'):
result_details_label = result_details_element.text_content().strip()
result_details_value = result_details_element.getnext().text_content().strip().replace(' ', ' ').replace('\n', '')
attempt_file_contents += result_details_label + ' ' + result_details_value + '\n'
# Dumping result data shown on page
for result_details_element in details_page_document.find_class('ntt-test-result-status-label'):
result_details_label = result_details_element.text_content().strip()
result_details_value = result_details_element.getnext().text_content().strip().replace(' ', ' ').replace('\n', '')
attempt_file_contents += result_details_label + ' ' + result_details_value + '\n'

total_assessment_element = details_page_document.find_class('ccl-assess-badge')
if len(total_assessment_element) > 0:
attempt_file_contents += 'Assessment: ' + total_assessment_element[0].text_content().strip() + '\n'
total_assessment_element = details_page_document.find_class('ccl-assess-badge')
if len(total_assessment_element) > 0:
attempt_file_contents += 'Assessment: ' + total_assessment_element[0].text_content().strip() + '\n'

last_page_dumped = False
last_page_dumped = False

attempt_file_contents += '\n-------------- Answers --------------\n\n'
attempt_file_contents += '\n-------------- Answers --------------\n\n'

question_index = 1
page_index = 1
question_table_body = details_page_document.get_element_by_id('ctl00_ContentPlaceHolder_ResultsGrid_TB')
question_index = 1
page_index = 1
question_table_body = details_page_document.get_element_by_id('ctl00_ContentPlaceHolder_ResultsGrid_TB')

attemptDirectory = dumpDirectory + '/Attempt ' + str(attempt_index)
attemptDirectory = makeDirectories(attemptDirectory)
attemptDirectory = dumpDirectory + '/Attempt ' + str(attempt_index)
attemptDirectory = makeDirectories(attemptDirectory)

while len(question_table_body) > 0:
for question_element in question_table_body:
print('\tSaving question', question_index)
question_link = itslearning_root_url[institution] + question_element[0][1].get('href')

question_response = session.get(question_link, allow_redirects=True)
question_document = fromstring(question_response.text)
while len(question_table_body) > 0:
for question_element in question_table_body:
print('\tSaving question', question_index)
question_link = itslearning_root_url[institution] + question_element[0][1].get('href')
question_response = session.get(question_link, allow_redirects=True)
question_document = fromstring(question_response.text)

question_title = question_element[1].text_content()
question_result = question_document.find_class('question-result')[0].text_content()
question_title = question_element[1].text_content()
question_result = question_document.find_class('question-result')[0].text_content()

attempt_file_contents += 'Question ' + str(question_index) + ': ' + question_title + '\n\n'
attempt_file_contents += question_result + '\n'
attempt_file_contents += 'Question ' + str(question_index) + ': ' + question_title + '\n\n'
attempt_file_contents += question_result + '\n'

try:
question_options_table = question_document.get_element_by_id('qti-choiceinteraction-container')
try:
question_options_table = question_document.get_element_by_id('qti-choiceinteraction-container')

for option_index, question_options_row in enumerate(question_options_table):
if option_index == 0:
attempt_file_contents += question_options_row.text_content() + '\n'
continue
if question_options_row.get('class') is not None and 'checkedrow' in question_options_row.get('class'):
attempt_file_contents += 'Option (selected): ' + question_options_row.text_content() + '\n'
else:
attempt_file_contents += 'Option: ' + question_options_row.text_content() + '\n'
except Exception:
attempt_file_contents += 'Non Multiple Choice question. You can find the entire page in the folder titled "Attempt {}".\n'.format(question_index)

# Need to download images from hotspot questions
content_divs = question_document.find_class('content')
for content_div in content_divs:
for image_tag in content_div.iterfind(".//img"):

image_URL = image_tag.get('src')

# For some reason a page can contain <img> tags that have no src attribute. No idea why.
if image_URL is None:
continue
for option_index, question_options_row in enumerate(question_options_table):
if option_index == 0:
attempt_file_contents += question_options_row.text_content() + '\n'
continue
if question_options_row.get('class') is not None and 'checkedrow' in question_options_row.get('class'):
attempt_file_contents += 'Option (selected): ' + question_options_row.text_content() + '\n'
else:
attempt_file_contents += 'Option: ' + question_options_row.text_content() + '\n'
except Exception:
attempt_file_contents += 'Non Multiple Choice question. You can find the entire page in the folder titled "Attempt {}".\n'.format(question_index)

# Special case for relative URLs: prepend the It's Learning root URL to them
if not image_URL.startswith('http'):
image_URL = itslearning_root_url[institution] + image_URL
# Need to download images from hotspot questions
content_divs = question_document.find_class('content')
for content_div in content_divs:
for image_tag in content_div.iterfind(".//img"):

filename = download_file(institution, image_URL, attemptDirectory, session)
attempt_file_contents += 'Attachment image: ' + filename + '\n'
image_URL = image_tag.get('src')

delay()

bytesToTextFile(question_response.text.encode('utf-8'), attemptDirectory + '/Question ' + str(question_index) + '.html')
# For some reason a page can contain <img> tags that have no src attribute. No idea why.
if image_URL is None:
continue

attempt_file_contents += '\n'
# Special case for relative URLs: prepend the It's Learning root URL to them
if not image_URL.startswith('http'):
image_URL = itslearning_root_url[institution] + image_URL

filename = download_file(institution, image_URL, attemptDirectory, session)
attempt_file_contents += 'Attachment image: ' + filename + '\n'

delay()
question_index += 1
delay()

bytesToTextFile(question_response.text.encode('utf-8'), attemptDirectory + '/Question ' + str(question_index) + '.html')

print('\tPage finished, loading next one.')
postback_form = None
for form in details_page_document.forms:
if '__EVENTTARGET' in form.fields:
postback_form = form
break
attempt_file_contents += '\n'

if postback_form is None:
raise Error('No postback form found on page!\nURL: ' + details_URL)

# The form's action attribute is a relative URL (it starts with ./), so rather than parsing it
# the postback is sent to the details page's own URL: root URL + details_URL
form_action_url = itslearning_root_url[institution] + details_URL
delay()
question_index += 1

page_index += 1
print('\tPage finished, loading next one.')
postback_form = None
for form in details_page_document.forms:
if '__EVENTTARGET' in form.fields:
postback_form = form
break

# This specific page requires these additional elements to do the postback
postback_form = details_page_document.forms[0]
postback_form.fields['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder$ResultsGrid'
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HPN'] = str(page_index)
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HSE'] = ''
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HGC'] = ''
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HFI'] = ''
if postback_form is None:
raise Error('No postback form found on page!\nURL: ' + details_URL)

post_data = convert_lxml_form_to_requests(postback_form)
# The form's action attribute is a relative URL (it starts with ./), so rather than parsing it
# the postback is sent to the details page's own URL: root URL + details_URL
form_action_url = itslearning_root_url[institution] + details_URL

# Submitting the form to obtain the next page
headers = {}
headers['Referer'] = form_action_url
page_index += 1

details_page_response = session.post(form_action_url, headers=headers, data=post_data, allow_redirects=True)
details_page_document = fromstring(details_page_response.text)
question_table_body = details_page_document.get_element_by_id('ctl00_ContentPlaceHolder_ResultsGrid_TB')
delay()
print('\tAll pages have been loaded.')
else:
print('ERROR: COULD NOT FIND DETAILS PAGE OF ONLINE TEST.')
print('NO DETAILS WILL BE SAVED OF THIS TEST.')
# This specific page requires these additional elements to do the postback
postback_form = details_page_document.forms[0]
postback_form.fields['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder$ResultsGrid'
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HPN'] = str(page_index)
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HSE'] = ''
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HGC'] = ''
postback_form.fields['ctl00$ContentPlaceHolder$ResultsGrid$HFI'] = ''

post_data = convert_lxml_form_to_requests(postback_form)

# Submitting the form to obtain the next page
headers = {}
headers['Referer'] = form_action_url

details_page_response = session.post(form_action_url, headers=headers, data=post_data, allow_redirects=True)
details_page_document = fromstring(details_page_response.text)
question_table_body = details_page_document.get_element_by_id('ctl00_ContentPlaceHolder_ResultsGrid_TB')
delay()
print('\tAll pages have been loaded.')
else:
print('ERROR: COULD NOT FIND DETAILS PAGE OF ONLINE TEST.')
print('NO DETAILS WILL BE SAVED OF THIS TEST.')


bytesToTextFile(attempt_file_contents.encode('utf-8'), dumpDirectory + '/Attempt ' + str(attempt_index) + output_text_extension)
bytesToTextFile(attempt_file_contents.encode('utf-8'), dumpDirectory + '/Attempt ' + str(attempt_index) + output_text_extension)


def processFile(institution, pathThusFar, fileURL, session):
Expand Down

0 comments on commit 9eb9ee3

Please sign in to comment.