scrape.py

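"""Scrape the <img> tags of a web page and save the images, either as
individual files or base64-encoded inside a single JSON file."""
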
import argparse
import base64
import json
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests


def scrape(url, format_, type_):
    """Download the page, collect its images, filter by type, and save."""
    try:
        page = requests.get(url)
    except requests.RequestException as rex:
        print(str(rex))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = _fetch_images(soup, url)
        images = _filter_images(images, type_)
        _save(images, format_)


def _fetch_images(soup, base_url):
    images = []
    for img in soup.find_all('img'):
        src = img.get('src')
        # Skip tags without a src and inline data URIs; neither points to
        # a downloadable file.
        if not src or src.startswith('data:'):
            continue
        # urljoin resolves relative paths and absolute URLs alike.
        img_url = urljoin(base_url, src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images


def _filter_images(images, type_):
    if type_ == 'all':
        return images
    ext_map = {'png': ['.png'], 'jpg': ['.jpg', '.jpeg']}
    return [img for img in images
            if _matches_extension(img['name'], ext_map[type_])]


def _matches_extension(filename, extension_list):
    _, extension = os.path.splitext(filename.lower())
    return extension in extension_list


def _save(images, format_):
    if images:
        if format_ == 'img':
            _save_images(images)
        else:
            _save_json(images)
        print('Done')
    else:
        print('No images to save.')


def _save_images(images):
    for img in images:
        img_data = requests.get(img['url']).content
        with open(img['name'], 'wb') as f:
            f.write(img_data)


def _save_json(images):
    data = {}
    for img in images:
        img_data = requests.get(img['url']).content
        b64_img_data = base64.b64encode(img_data)
        str_img_data = b64_img_data.decode('utf-8')
        data[img['name']] = str_img_data
    with open('images.json', 'w') as ijson:
        json.dump(data, ijson)
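

# The resulting images.json maps each file name to its base64-encoded
# bytes; an illustrative (truncated) entry for a PNG might look like:
#
#   {"logo.png": "iVBORw0KGgoAAAANSUhEUg..."}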


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape a webpage.")
    parser.add_argument('-t', '--type', choices=['all', 'png', 'jpg'],
                        default='all',
                        help='The image type we want to scrape.')
    parser.add_argument('-f', '--format', choices=['img', 'json'],
                        default='img',
                        help='The format images are saved to.')
    parser.add_argument('url', help='The URL we want to scrape for images.')
    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
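
# Example invocations (http://example.com is a placeholder URL):
#
#   python scrape.py http://example.com          # save every image as a file
#   python scrape.py -t png http://example.com   # keep only .png images
#   python scrape.py -f json http://example.com  # bundle images into images.json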