-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
92 lines (67 loc) · 2.63 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
This script takes a CSV of Suma 5-character codes and scrapes sumawholesale.com
for their pricing information, writing the results out as CSV.
The header of the input CSV should begin with the following columns. Any
subsequent columns will be ignored.
Code,Item,Item Price (ex VAT),Quant.,VAT rate
Products are identified solely by their code. The scraper will stop when it
encounters the first blank entry in the Code column.
The output file will be a copy of the input, with all columns except Code
overwritten with data retrieved from sumawholesale.com.
"""
import argparse
import logging
import sys
import re
import pandas as pd
from client import Suma
LOGGER = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
REQUIRED_HEADERS = [
'Code', 'Item', 'Item Price (ex VAT)', 'Quant.', 'VAT rate'
]
def main(input_file_path, output_file_path):
# load input file
LOGGER.info(f"read input from {input_file_path}")
df = pd.read_csv(input_file_path)
# check start of input headers match expected values
if REQUIRED_HEADERS != list(df.columns.values)[:len(REQUIRED_HEADERS)]:
raise ValueError(
f"{input_file_path} headers not as expected"
)
# get suma scraping client
S = Suma()
LOGGER.info("get product data")
for index, row in df.iterrows():
# e.g. AB123
code_pattern = r"[a-zA-Z]{2}\d{3}"
try:
if not re.search(code_pattern, row.Code):
# failed match returns None, which is falsy
# can't find a correctly formatted code
# assume we're at the end of the data and exit loop
continue
except TypeError:
# can't perform regex on cell contents
# probably nan value, so again, exit loop
continue
# fetch the data
data = S.get_product(row.Code)
# modify dataframe in-place
df.loc[index, 'Item'] = data['name']
df.loc[index, 'Item Price (ex VAT)'] = data['price']
# N.B. scraper returns percentage, by convention CSV uses decimal
df.loc[index, 'VAT rate'] = data['currentTax'] / 100
LOGGER.info((data['name']))
# write out to file
LOGGER.info(f"write output to {output_file_path}")
# do not include row index
df.to_csv(output_file_path, index=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Scrape Suma product information.'
)
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
main(args.input_file, args.output_file)