-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapeTocsv.py
146 lines (127 loc) · 4.89 KB
/
scrapeTocsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from colorama import Fore, Style
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
import pprint
# Shared pretty-printer (4-space indent); used at the end of the script to dump
# the scraped table to stdout.
pp = pprint.PrettyPrinter(indent=4)
# [DEBUG SELECTOR]
# Set WRITE to False to turn debug() into a no-op.
WRITE = True


def debug(argument: str, clearOnNew: bool = True):
    """Write ``str(argument)`` to ``debug.txt`` in the current working directory.

    Does nothing when the module-level ``WRITE`` flag is false.

    Args:
        argument: Value to record; it is converted with ``str()`` before writing.
        clearOnNew: When true (the default) the file is overwritten; when
            false the text is appended to whatever the file already holds.
    """
    if not WRITE:
        return
    # Mode "w" already truncates and the context manager closes the file, so
    # no explicit truncate()/close() is needed. Both modes create the file
    # when it is missing, so output is not dropped on a fresh run.
    mode = 'w' if clearOnNew else 'a'
    with open('debug.txt', mode, encoding="utf-8") as handle:
        handle.write(str(argument))
def convertToCsv(data, filename='history_of_ferrari.csv'):
    """Write ``data`` to a comma-separated CSV file and print a confirmation.

    Args:
        data: Anything ``pandas.DataFrame(data=...)`` accepts; this script
            passes a dict mapping column name -> list of values.
        filename: Destination path. Defaults to the file name this script has
            always written, so existing callers are unaffected.
    """
    df = pd.DataFrame(data=data)
    # header=True keeps the column names; index=False omits pandas' row index.
    df.to_csv(filename, sep=',', header=True, index=False)
    print(f"{Fore.GREEN}Successfully converted to csv.{Style.RESET_ALL}")
data = dict()


def addToData(row, data=data):
    """Append one record to the column-oriented ``data`` table, in place.

    ``data`` maps each column name to a list with one entry per row. Columns
    missing from ``row`` receive ``None``; columns first seen in ``row`` are
    back-filled with ``None`` for every earlier row, so all columns stay the
    same length.

    Args:
        row: Mapping of column name -> value for a single record.
        data: Table to extend. Defaults to the module-level ``data`` dict; the
            default is bound once at definition time, so every call that
            omits it accumulates into that same table.
    """
    # Take the row count from the columns themselves, and before anything is
    # mutated, so the result does not depend on a "model" column being present
    # in either the table or the incoming row.
    row_count = max((len(column) for column in data.values()), default=0)

    # Pad every known column that this row does not provide.
    for spec, column in data.items():
        if spec not in row:
            column.append(None)

    # Store the row's values, creating back-filled columns for new names.
    for spec, value in row.items():
        if spec not in data:
            data[spec] = [None] * row_count
        data[spec].append(value)
skiplist = ["166 MM"]  # cdn unable to deliver jpgs
# NOTE(review): the skip list is only applied while this flag is True, which
# reads as the opposite of its name -- confirm the intended polarity.
skipSkipList = False
history_home_url = 'https://www.ferrari.com/en-EN/auto/'

# [Print to text controls]
# When allowAll is False the run aborts once `stop` (models processed so far)
# reaches `condition`.
allowAll = True
stop = 0
condition = 5


def _clean_spec_name(raw_name):
    """Normalise a spec label before it is lower-cased into a column key.

    Unit suffixes are removed, footnote asterisks become spaces, and every
    run of whitespace collapses to one space with both ends trimmed.
    """
    # NOTE(review): " m" is removed wherever it occurs, including at the start
    # of a word -- confirm that only a trailing metre unit was intended.
    name = raw_name.replace(" km/h", "").replace(" m", "")
    # split()/join collapses whitespace in a single pass and always terminates.
    return " ".join(name.replace("*", " ").split())


def _extract_trivia(model_page):
    """Return the model's descriptive text, or "" when neither block exists."""
    # Try the intro block first, then fall back to the editorial description.
    for css_class in ('Intro__text__2JBv1kY9', 'Editorial__desc__20EN5mi7'):
        block = model_page.find('div', class_=css_class)
        if block is not None:
            return block.get_text()
    return ""


def _collect_specs(spec_list, row):
    """Copy every spec name/value pair found under ``spec_list`` into ``row``."""
    if spec_list is None:
        # Page without a tech-spec list: nothing to add.
        return
    for section in spec_list:
        if "note" in section:
            continue
        # Children of a tag can be bare strings as well as tags; re-parsing
        # each one gives a uniform object to search.
        souped_spec = BeautifulSoup(str(section), 'html.parser')
        names = [tag.get_text() for tag in souped_spec.find_all('strong')]
        values = [tag.get_text(" ") for tag in souped_spec.find_all(
            'span', class_="TechSpecs__value__1wW_OIzf")]
        for name, value in zip(names, values):
            key = _clean_spec_name(name).lower()
            if key.startswith("note"):
                continue
            row[key] = value.strip()


options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# `options=` is the supported keyword; `chrome_options=` was deprecated and has
# been removed from current Selenium releases.
driver = webdriver.Chrome(options=options)
try:
    driver.get(history_home_url + 'past-model')
    # Fixed pause before reading the page source -- presumably to let the
    # client-side rendering finish.
    time.sleep(3)
    souped_History_home = BeautifulSoup(driver.page_source, 'html.parser')
    historySections = souped_History_home.find_all(
        'div', class_='PastModels__section__2TwZvTPv')

    for year in historySections:
        yearOf = year.find(
            'h2', class_='PastModels__sectionYear__33cNPh9I').get_text()
        for model in year.find_all('a'):
            label = model.find('span', class_="PastModels__text__2qL1mq9T")
            # The first span nested inside the label holds the model name.
            name_spans = label.find_all('span') if label is not None else []
            if not name_spans:
                print(f"{Fore.RED}Model name not found{Style.RESET_ALL}")
                continue
            model_name = name_spans[0].text
            if model_name in skiplist and skipSkipList:
                continue

            row = {"model": model_name, "year": yearOf}
            print(f"{Fore.GREEN}{yearOf} - {model_name}{Style.RESET_ALL}")

            driver.get(history_home_url + model.get('href'))
            souped_model = BeautifulSoup(driver.page_source, 'html.parser')
            # NOTE(review): a <div class="main"> is treated as the marker of an
            # unavailable page -- confirm against the site's markup.
            if souped_model.find('div', class_='main'):
                print(f"{Fore.RED}Model page not available{Style.RESET_ALL}")
                continue

            if stop == condition and not allowAll:
                # Debug cap reached: abort without writing the CSV. The
                # finally clause below still shuts the browser down.
                raise SystemExit

            row["trivia"] = _extract_trivia(souped_model)
            _collect_specs(
                souped_model.find('div', class_='TechSpecs__list__1_NWtTPS'),
                row)
            addToData(row)
            stop += 1

    convertToCsv(data)
    pp.pprint(data)
    debug(data, False)
    debug(data.keys(), False)
finally:
    # Always release the headless browser, including on errors and on the
    # debug abort above.
    driver.quit()