-
Notifications
You must be signed in to change notification settings - Fork 0
/
ecommerce.py
97 lines (71 loc) · 2.43 KB
/
ecommerce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import json
import requests
from bs4 import BeautifulSoup
# Root of the site being scraped; site-relative links are appended to it.
BASE_URL = 'https://www.webscraper.io/'

# Page the crawl starts from: the laptop listing of the static e-commerce
# test site.
url = ''.join([BASE_URL, 'test-sites/e-commerce/static/computers/laptops'])
def get_dom(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            waits indefinitely when no timeout is passed, so without it a
            stalled connection would hang the whole crawl.

    Returns:
        The response body parsed with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')
def get_product(product, category):
    """Extract one product's fields from its listing card.

    Args:
        product: Parsed element of a single product card (anything that
            exposes ``select_one``).
        category: Name of the category the card was found under.

    Returns:
        A dict with the keys ``category``, ``title``, ``image``, ``price``,
        ``rating`` and ``description``, or an empty dict when the card
        cannot be used (empty category, no title, or no parsable price).
        Callers treat the empty dict as "skip this card".
    """
    image = product.select_one('.img-responsive')
    title_tag = product.select_one('.title')
    price_tag = product.select_one('.pull-right.price')
    rating_tag = product.select_one('[data-rating]')
    description_tag = product.select_one('.description')

    # select_one returns None when nothing matches, so every lookup below is
    # guarded instead of assuming the card is complete.
    title = title_tag.get('title') if title_tag is not None else None

    price = None
    if price_tag is not None:
        try:
            # Drop the currency sign and any thousands separator.
            price = float(price_tag.text.replace('$', '').replace(',', ''))
        except ValueError:
            price = None

    if not category or not title or price is None:
        # Report the card that is being skipped.
        print(category, title)
        return {}

    return {
        'category': category,
        'title': title,
        'image': BASE_URL + (image.get('src', '/#') if image is not None else '/#'),
        'price': price,
        'rating': int(rating_tag.get('data-rating', 0)) if rating_tag is not None else 0,
        'description': description_tag.text if description_tag is not None else '',
    }
def get_category(dom):
    """Return the text of the page's ``.page-header`` element.

    Args:
        dom: Parsed page to look in.

    Returns:
        The header's text, or an empty string when the page has no
        ``.page-header`` element.
    """
    header = dom.select_one('.page-header')
    if header is None:
        return ''
    return header.text
def get_products(dom, result):
    """Collect the products of a listing, following its "next" links.

    Args:
        dom: Parsed first page of the listing.
        result: List that receives one dict per product; it is extended
            in place.

    Returns:
        The same ``result`` list, once a page without a usable
        ``[rel=next]`` link has been processed.
    """
    from urllib.parse import urljoin

    while True:
        category = get_category(dom)
        for card in dom.select('.thumbnail'):
            item = get_product(card, category)
            if item:  # get_product returns {} for cards that must be skipped
                result.append(item)

        # Look the link up once and reuse it, rather than testing for it and
        # then selecting it again.
        next_link = dom.select_one('[rel=next]')
        href = next_link.get('href', '') if next_link is not None else ''
        if not href:
            return result

        # Resolve against BASE_URL so a site-relative href can be fetched;
        # urljoin leaves an href that is already absolute unchanged.
        dom = get_dom(urljoin(BASE_URL, href))
def can_paginate(dom):
    """Tell whether the page contains a ``[rel=next]`` link.

    Args:
        dom: Parsed page to inspect.

    Returns:
        True when an element matching ``[rel=next]`` exists, else False.
    """
    return dom.select_one('[rel=next]') is not None
def run(dom):
    """Scrape every category, and each category's subcategories.

    Args:
        dom: Parsed page whose ``#side-menu`` links list the categories.

    Returns:
        A list with one dict per scraped product, in crawl order.
    """
    from urllib.parse import urljoin

    result = []
    for category in dom.select('#side-menu a'):
        # urljoin avoids the doubled slash that plain concatenation produces
        # when BASE_URL ends in '/' and the href starts with '/'.
        category_dom = get_dom(urljoin(BASE_URL, category.get('href', '')))
        get_products(category_dom, result)

        # Subcategory links are read from the category's first page.
        for subcategory in category_dom.select('.subcategory-link'):
            subcategory_dom = get_dom(
                urljoin(BASE_URL, subcategory.get('href', ''))
            )
            get_products(subcategory_dom, result)
    return result
def can_paginate_eigon(dom):
    """Look up the page's ``[rel=next]`` link.

    Args:
        dom: Parsed page to inspect.

    Returns:
        A ``(has_next, href)`` tuple. ``has_next`` is True when the link
        exists; ``href`` is its ``href`` attribute, or an empty string when
        the link is missing or has no ``href``.
    """
    next_link = dom.select_one('[rel=next]')
    if next_link is None:
        # Nothing to read the href from; report "no next page" instead of
        # failing on the attribute lookup.
        return (False, '')
    return (True, next_link.get('href', ''))
# Crawl the site starting from the entry page and save every product found.
dom = get_dom(url)
result = run(dom)

# The context manager closes the file even if serialising or writing fails.
with open('ecommerce.json', 'w') as arquivo:
    arquivo.write(json.dumps(result, indent=2))