# client.py
import logging
import re
import ssl
from urllib.parse import urljoin, urlparse
from typing import Optional, Callable, Tuple
from dataclasses import dataclass

import requests
from requests import adapters
from urllib3 import poolmanager
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)
class TLSAdapter(adapters.HTTPAdapter):
    """
    Suma uses an outdated set of ciphers to negotiate a connection. Downgrade
    the security level to permit use of these older, less secure ciphers when
    this adapter is mounted.
    """

    def init_poolmanager(self, connections, maxsize, block=False):
        """Create and initialize the urllib3 PoolManager."""
        ctx = ssl.create_default_context()
        ctx.set_ciphers('DEFAULT@SECLEVEL=1')
        self.poolmanager = poolmanager.PoolManager(
            num_pools=connections,
            maxsize=maxsize,
            block=block,
            ssl_version=ssl.PROTOCOL_TLS,
            ssl_context=ctx)

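# A minimal sketch of how the adapter is used (mirrored by Suma.__init__
# below): any request whose URL starts with the mounted prefix goes through
# the downgraded TLS context. Without it, the handshake can fail with
# requests.exceptions.SSLError on systems whose default OpenSSL security
# level rejects the server's ciphers.
#
#   session = requests.Session()
#   session.mount("https://www.sumawholesale.com/", TLSAdapter())
#   session.get("https://www.sumawholesale.com/")
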
@dataclass
class ProductAttribute:
    """Container for product attribute information."""
    name: str
    pattern: str
    converter: Callable

class Suma:
    """
    Scraper for SumaWholesale.com
    """

    def __init__(self):
        # use HTTPS, otherwise requests are redirected to the home page
        self.base_url = "https://www.sumawholesale.com/"
        self.session = requests.Session()
        self.session.mount(self.base_url, TLSAdapter())
    def get_product(self, code: str) -> dict:
        """Externally facing method for getting product data."""
        path, name = self._get_product_path_and_name(code)
        data = self._get_product_pricing(path)
        data['name'] = name
        return data
    def _request(self, method: str, path: str, data: Optional[dict] = None):
        """
        Main contact point for requests. Constructs the URL, makes the
        request and checks the response status.

        Args:
            method (str): HTTP method name
            path (str): Path relative to host to request
            data (dict): Data for PUT, PATCH & POST
        """
        # complete URL
        url = urljoin(self.base_url, path)
        # was running into problems with a lack of root certificates
        # N.B. certificates are a chain of trust; at the base are parent certs
        # these typically come from a trusted authority like DigiCert
        # OS may be missing these certs; can install some with certifi
        # and then point requests/urllib to them if need be
        # import certifi
        # session.request(..., verify=certifi.where())
        response = self.session.request(method, url, data=data)
        # raise exception if request unsuccessful
        response.raise_for_status()
        return response
    def _get_product_pricing(self, path: str) -> dict:
        """
        Given a product page path, return relevant product info.

        Args:
            path (str): Relative path of product page

        Returns:
            dict: attribute name and value of data
        """
        # get text to search for data
        string = self._get_text(path)
        data = {}
        # attr is an instance of the ProductAttribute dataclass
        for attr in PRODUCT_ATTRS:
            try:
                # look for attribute pattern in string
                match = re.search(attr.pattern, string)
                value: str = match.group(1)
                # convert to desired type and store
                data[attr.name] = attr.converter(value)
            except AttributeError:
                # re.search returned None, so .group raised
                LOGGER.warning(
                    "failed to find attr %s w/ %s on page %s",
                    attr.name, attr.pattern, path
                )
                raise  # die
        return data
    def _get_text(self, path: str) -> str:
        """Get text to extract product information from.

        Args:
            path (str): Product page relative path.

        Returns:
            str: Text containing product data
        """
        product_page_html = self._request('GET', path).content
        # parse html for product page data
        soup = BeautifulSoup(product_page_html, features="html.parser")
        # product info visible when logged in is stored in javascript
        main_div = soup.body.find(
            'div', attrs={'class': 'col-main', 'id': 'main'}
        )
        script = main_div.find('script', attrs={'type': 'text/javascript'})
        return script.string
    def _get_product_path_and_name(self, code: str) -> Tuple[str, str]:
        """
        Given a product code, return the product page path and name.

        Args:
            code (str): 5 character alphanumeric product code

        Returns:
            str, str: Relative path of product page and product name

        Raises:
            ValueError: If no product listing can be found for the code.
        """
        # path relative to root
        path = f"catalogsearch/result/?q={code}"
        # _request will construct full URL
        search_pg_html = self._request('GET', path).content
        # parse html for link to product page
        soup = BeautifulSoup(search_pg_html, features="html.parser")
        try:
            # listings should only contain one product, having searched by code
            listings = soup.body.find(
                'div', attrs={'class': 'listing-type-grid catalog-listing'}
            )
            # get the product from the listings list (li)
            product = listings.find('li', attrs={'class': 'item'})
            # extract the link element from the product item
            link = product.find('a')
        except AttributeError:
            # tried to run find on the None returned by a previous .find
            raise ValueError(f"failed to get data for {code} -- is it valid?")
        url = link.get('href')
        # get name of product too, cutting leading and trailing whitespace
        name = link.get('title').strip()
        # return relative path and name
        return urlparse(url).path, name

# to extract data from the selected text (currently JS), register a regex here
PRODUCT_ATTRS = [
    ProductAttribute(*data) for data in (
        # ex-VAT price
        # decimal part optional in regex; some prices are ints
        ('price', r'\"productPrice\":(\d+\.?\d{0,2})', float),
        # tax rate as a percentage, take max 2 d.p.
        ('currentTax', r'\"currentTax\":(\d+\.?\d{0,2})', float),
        # is tax payable? redundant courtesy of currentTax
        # ('includeTax', r'\"includeTax\":\"(\w+)\",', bool),
    )
]
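

# A minimal usage sketch, assuming network access to sumawholesale.com; the
# product code below is a placeholder, not a known Suma code.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    suma = Suma()
    # look up a product by its 5 character code (placeholder value)
    product = suma.get_product("12345")
    print(product)  # e.g. {'price': 9.99, 'currentTax': 20.0, 'name': '...'}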