Skip to content

Commit

Permalink
Fixed a crash on some strangely formatted images
Browse files (browse the repository at this point in the history)
  • Loading branch information
bartvbl committed Jul 11, 2017
1 parent 1ef60b0 commit f4367a8
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,12 +378,15 @@ def download_file(institution, url, destination_directory, session, index=None,
except Exception:
# Can occur in the case of a Base64-encoded image embedded in the URL. If so, decode it and dump it.
if base64_png_image_url[institution] in url or base64_jpeg_image_url[institution] in url:
extension = url.split(':')[2].split(';')[0].split('/')[1]
print('\tDownloaded Base64 encoded {} image'.format(extension).encode('ascii', 'ignore'))
start_index = url.index(',') + 1
base64_encoded_file_contents = url[start_index:]
decoded_bytes = base64.b64decode(base64_encoded_file_contents)
bytesToTextFile(decoded_bytes, destination_directory + '/' + base64_encoded_file_contents[0:10] + '.' + extension)
try:
extension = url.split(':')[2].split(';')[0].split('/')[1]
print('\tDownloaded Base64 encoded {} image'.format(extension).encode('ascii', 'ignore'))
start_index = url.index(',') + 1
base64_encoded_file_contents = url[start_index:]
decoded_bytes = base64.b64decode(base64_encoded_file_contents)
bytesToTextFile(decoded_bytes, destination_directory + '/' + base64_encoded_file_contents[0:10] + '.' + extension)
except Exception:
print('Base64 Image Download Failed: unknown umage formatting. Skipping.')
return
elif url.startswith('/'):
try:
Expand Down

0 comments on commit f4367a8

Please sign in to comment.