-
Notifications
You must be signed in to change notification settings - Fork 0
/
cigar_list_csv.py
executable file
·128 lines (112 loc) · 4.11 KB
/
cigar_list_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# cigar_list.py
# This is a simple scraper, no multi threading or async operations
# This builds a CSV file of all cigars scrapped from specific list of web pages (per the pages[] list in the code) by: brand, name, & size
# Running the script multiple times will build on the CSV list and not create duplicates.
# I run this daily to compile a list of all cigars that ever show up on the website.
import requests
import bs4
from datetime import datetime
import csv
import itertools
import sys
import os
import time
start_time = time.time()
timestamp = datetime.now().strftime("%m/%d/%Y %H:%M")
def cleanup(x):
"""
Returns a string with space on the left and right removed.
"""
try:
x = x.replace("Length (in inches):", "L:")
x = x.replace("LENGTH:", "L:")
x = x.replace("ring gauge:", "RG:")
x = x.replace("Ring Gauge:", "RG:")
x = x.replace("RING GAUGE:", "RG:")
x = x.replace("No Discounts Apply", "")
x = x.replace("no discounts apply", "")
x = ' '.join([line.strip() for line in x.strip().splitlines()])
except:
x = 'cleanup function error!'
return x
def number_of_lines(fname):
def _make_gen(reader):
b = reader(2 ** 16)
while b:
yield b
b = reader(2 ** 16)
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
# START
file_name = os.path.join(sys.path[0], 'cigar_list.csv')
rows = []
rows.append(['Brand', 'Name', 'Size'])
# CoH
base_url = "https://www.cigarsofhabanos.com/"
pages = ['cigars-bolivar',
'cigars-cohiba',
'cigars-cuaba',
'cigars-diplomaticos',
'cigars-el-rey-del-mundo',
'cigars-fonseca',
'cigars-guantanamera',
'cigars-h.upmann',
'cigars-hoyo-de-monterrey',
'cigars-jose-l.-piedra',
'cigars-juan-lopez',
'cigars-la-flor-de-cano',
'cigars-la-gloria-cubana',
'cigars-montecristo',
'cigars-partagas',
'cigars-por-larranaga',
'cigars-punch',
'cigars-quai-dorsay',
'cigars-quintero-y-hermano',
'cigars-rafael-gonzalez',
'cigars-ramon-allones',
'cigars-romeo-y-julieta',
'cigars-saint-luis-rey',
'cigars-san-cristobal-de-la-habana',
'cigars-sancho-panza',
'cigars-trinidad',
'cigars-vegas-robaina',
'cigars-vegueros']
for page in pages:
url = f'{base_url}{page}'
brand = page.split('-', 1)[1].replace('-', ' ')
try:
# get the page
response = requests.get(url)
# Save the response to soup
soup = bs4.BeautifulSoup(response.text, "html.parser")
# Get number of items
total_product_tags = len(soup.find_all('span', {'class': ['product_header', 'product_header_W']}))
# Iterate through all products
for i in range (0, total_product_tags):
name = soup.find_all('span', {'class': ['product_header', 'product_header_W']})[i].get_text().strip()
size = cleanup(soup.find_all('td', class_='nortxt')[i].find('div', class_='fsize11').get_text())
rows.append([brand, name, size])
except Exception as e:
#this website didn't respond or the page had errors
print(e)
# Open existing file for comparison
try:
with open(file_name, 'r', newline='', encoding='ISO-8859-1') as existing_file:
reader = csv.reader(existing_file)
existing_rows = list(reader)
except:
#file does not exist
existing_rows = []
# Combine the existing rows with what was just scrapped, sort, then remove dups
combined_rows = existing_rows + rows
combined_rows.sort()
unique_rows = list(combined_rows for combined_rows,_ in itertools.groupby(combined_rows))
# Create new file with combined results
with open(file_name, 'w', newline='', encoding='ISO-8859-1') as csvfile:
writer = csv.writer(csvfile)
for row in unique_rows:
writer.writerow(row)
total_lines = number_of_lines(file_name)
runtime = round(time.time() - start_time, 2)
print(f'File created: {timestamp} with {total_lines} lines - Runtime: {runtime}')