-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrp_yfinance.py
338 lines (245 loc) · 11.2 KB
/
scrp_yfinance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# '''
# In this notebook we will be looking at data from the stock market,
# particularly some technology stocks. We will learn how to use pandas
# to get stock information, visualize different aspects of it, and
# finally we will look at a few ways of analyzing the risk of a stock,
# based on its previous performance history. We will also be predicting
# future stock prices through a Long Short Term Memory (LSTM) method!
# We'll be answering the following questions along the way:
# 1.) What was the change in price of the stock over time?
# 2.) What was the daily return of the stock on average?
# 3.) What was the moving average of the various stocks?
# 4.) What was the correlation between different stocks?
# 5.) How much value do we put at risk by investing in a particular stock?
# 6.) How can we attempt to predict future stock behavior? (Predicting the closing stock price of Apple Inc. using LSTM)
# '''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: apply seaborn's 'whitegrid' style first, then
# matplotlib's "fivethirtyeight" style sheet. The order is kept as-is because
# the second call may override settings made by the first.
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
#%matplotlib inline
# For reading stock data from yahoo
from pandas_datareader.data import DataReader
# For time stamps
from datetime import datetime
# Ticker symbols to download; one price-history DataFrame is fetched per symbol.
tech_list = [
    'AAPL', 'GOOG', 'MSFT', 'AMZN', "FB", "F",
    "TSLA", "BBC", "TSM", "INTC", "AMD", "AZN",
]

# Number of years of history to request, counted back from the moment the
# script runs. Change this value to widen or narrow the download window.
HISTORY_YEARS = 15


def years_before(moment, years):
    """Return midnight of the calendar date `years` years before `moment`.

    The time-of-day of `moment` is dropped, so the result is always at
    00:00:00 on the target date.

    Args:
        moment: The reference ``datetime``.
        years: How many whole years to step back.

    Returns:
        A naive ``datetime`` on the same month/day in the earlier year. When
        `moment` falls on Feb 29 and the earlier year is not a leap year,
        Feb 28 of that year is returned instead.

    Raises:
        ValueError: If the target year is outside the range ``datetime``
            supports.
    """
    try:
        return datetime(moment.year - years, moment.month, moment.day)
    except ValueError:
        # Feb 29 does not exist in non-leap years; any other failure (e.g. a
        # target year below datetime.MINYEAR) is a real error and propagates.
        if (moment.month, moment.day) != (2, 29):
            raise
        return datetime(moment.year - years, 2, 28)


# Download window: from HISTORY_YEARS years ago (at midnight) up to now.
end = datetime.now()
start = years_before(end, HISTORY_YEARS)
# Download the price history of every ticker in tech_list for [start, end].
# NOTE(review): this relies on pandas_datareader's 'yahoo' source, which has
# reportedly been unreliable since Yahoo changed its endpoints - confirm it
# still returns data before depending on this script.
frames_by_ticker = {}
for stock in tech_list:
    frames_by_ticker[stock] = DataReader(stock, 'yahoo', start, end)

# Also publish each DataFrame as a module-level variable named after its
# ticker (AAPL, GOOG, ...). Kept so existing code that refers to those names
# keeps working; new code should prefer frames_by_ticker.
globals().update(frames_by_ticker)

# DataFrames in the same order as tech_list. Derived from the download result
# so it cannot drift out of sync with the ticker list.
company_list = [frames_by_ticker[stock] for stock in tech_list]

# Human-readable label for each ticker, position-matched with tech_list.
# These strings are written to stock.csv, so they are reproduced verbatim.
# NOTE(review): "BBC" is labelled "Biontech" here; BioNTech presumably trades
# under a different symbol - confirm the intended ticker. "Advace micro
# device" also looks like a typo for "Advanced Micro Devices".
company_name = [
    "APPLE", "GOOGLE", "MICROSOFT", "AMAZON", "FaceBook", "Ford motors",
    "Tesla", "Biontech", "TSMC semiconductor", "Intel",
    "Advace micro device", "Astrazeneca PLC",
]

# zip() silently stops at the shorter input, which would leave some frames
# without a label; fail loudly instead if the two lists ever diverge.
if len(company_name) != len(company_list):
    raise ValueError(
        f"company_name has {len(company_name)} entries but "
        f"tech_list has {len(company_list)} tickers"
    )

# Tag every row with its company label (adds a column to each frame in place).
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

# Stack all per-company frames vertically and export the combined table.
df = pd.concat(company_list, axis=0)
df.to_csv('stock.csv')
#df.to_excel('stock.xlsx')
#print(df)
# AAPL.describe()
# AAPL.info()
# # Let's see a historical view of the closing price
# plt.figure(figsize=(15, 6))
# plt.subplots_adjust(top=1.25, bottom=1.2)
# for i, company in enumerate(company_list, 1):
# plt.subplot(2, 2, i)
# company['Adj Close'].plot()
# plt.ylabel('Adj Close')
# plt.xlabel(None)
# plt.title(f"Closing Price of {tech_list[i - 1]}")
# plt.tight_layout()
# # Now let's plot the total volume of stock being traded each day
# plt.figure(figsize=(15, 7))
# plt.subplots_adjust(top=1.25, bottom=1.2)
# for i, company in enumerate(company_list, 1):
# plt.subplot(2, 2, i)
# company['Volume'].plot()
# plt.ylabel('Volume')
# plt.xlabel(None)
# plt.title(f"Sales Volume for {tech_list[i - 1]}")
# plt.tight_layout()
# ma_day = [10, 20, 50]
# for ma in ma_day:
# for company in company_list:
# column_name = f"MA for {ma} days"
# company[column_name] = company['Adj Close'].rolling(ma).mean()
# fig, axes = plt.subplots(nrows=2, ncols=2)
# fig.set_figheight(8)
# fig.set_figwidth(15)
# AAPL[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[0,0])
# axes[0,0].set_title('APPLE')
# GOOG[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[0,1])
# axes[0,1].set_title('GOOGLE')
# MSFT[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[1,0])
# axes[1,0].set_title('MICROSOFT')
# AMZN[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[1,1])
# axes[1,1].set_title('AMAZON')
# fig.tight_layout()
# # We'll use pct_change to find the percent change for each day
# for company in company_list:
# company['Daily Return'] = company['Adj Close'].pct_change()
# # Then we'll plot the daily return percentage
# fig, axes = plt.subplots(nrows=2, ncols=2)
# fig.set_figheight(8)
# fig.set_figwidth(15)
# AAPL['Daily Return'].plot(ax=axes[0,0], legend=True, linestyle='--', marker='o')
# axes[0,0].set_title('APPLE')
# GOOG['Daily Return'].plot(ax=axes[0,1], legend=True, linestyle='--', marker='o')
# axes[0,1].set_title('GOOGLE')
# MSFT['Daily Return'].plot(ax=axes[1,0], legend=True, linestyle='--', marker='o')
# axes[1,0].set_title('MICROSOFT')
# AMZN['Daily Return'].plot(ax=axes[1,1], legend=True, linestyle='--', marker='o')
# axes[1,1].set_title('AMAZON')
# fig.tight_layout()
# # Note the use of dropna() here, otherwise the NaN values can't be read by seaborn
# plt.figure(figsize=(12, 7))
# for i, company in enumerate(company_list, 1):
# plt.subplot(2, 2, i)
# sns.distplot(company['Daily Return'].dropna(), bins=100, color='purple')
# plt.ylabel('Daily Return')
# plt.title(f'{company_name[i - 1]}')
# # Could have also done:
# #AAPL['Daily Return'].hist()
# plt.tight_layout()
# # Grab all the closing prices for the tech stock list into one DataFrame
# closing_df = DataReader(tech_list, 'yahoo', start, end)['Adj Close']
# # Let's take a quick look
# closing_df.head()
# # Make a new tech returns DataFrame
# tech_rets = closing_df.pct_change()
# tech_rets.head()
# # Comparing Google to itself should show a perfectly linear relationship
# sns.jointplot('GOOG', 'GOOG', tech_rets, kind='scatter', color='seagreen')
# # We'll use jointplot to compare the daily returns of Google and Microsoft
# sns.jointplot('GOOG', 'MSFT', tech_rets, kind='scatter')
# # We can simply call pairplot on our DataFrame for an automatic visual analysis
# # of all the comparisons
# sns.pairplot(tech_rets, kind='reg')
# # Set up our figure by naming it return_fig, call PairGrid on the DataFrame
# return_fig = sns.PairGrid(tech_rets.dropna())
# # Using map_upper we can specify what the upper triangle will look like.
# return_fig.map_upper(plt.scatter, color='purple')
# # We can also define the lower triangle in the figure, including the plot type (kde)
# # or the color map (BluePurple)
# return_fig.map_lower(sns.kdeplot, cmap='cool_d')
# # Finally we'll define the diagonal as a series of histogram plots of the daily return
# return_fig.map_diag(plt.hist, bins=30)
# # Set up our figure by naming it returns_fig, call PairGrid on the DataFrame
# returns_fig = sns.PairGrid(closing_df)
# # Using map_upper we can specify what the upper triangle will look like.
# returns_fig.map_upper(plt.scatter,color='purple')
# # We can also define the lower triangle in the figure, including the plot type (kde) or the color map (BluePurple)
# returns_fig.map_lower(sns.kdeplot,cmap='cool_d')
# # Finally we'll define the diagonal as a series of histogram plots of the daily return
# returns_fig.map_diag(plt.hist,bins=30)
# # Let's go ahead and use seaborn for a quick correlation plot for the daily returns
# sns.heatmap(tech_rets.corr(), annot=True, cmap='summer')
# sns.heatmap(closing_df.corr(), annot=True, cmap='summer')
# # Let's start by defining a new DataFrame as a cleaned version of the original tech_rets DataFrame
# rets = tech_rets.dropna()
# area = np.pi * 20
# plt.figure(figsize=(10, 7))
# plt.scatter(rets.mean(), rets.std(), s=area)
# plt.xlabel('Expected return')
# plt.ylabel('Risk')
# for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
# plt.annotate(label, xy=(x, y), xytext=(50, 50), textcoords='offset points', ha='right', va='bottom',
# arrowprops=dict(arrowstyle='-', color='blue', connectionstyle='arc3,rad=-0.3'))
# # Get the stock quote
# df = DataReader('AAPL', data_source='yahoo', start='2012-01-01', end=datetime.now())
# # Show the data
# print(df)
# plt.figure(figsize=(16,6))
# plt.title('Close Price History')
# plt.plot(df['Close'])
# plt.xlabel('Date', fontsize=18)
# plt.ylabel('Close Price USD ($)', fontsize=18)
# plt.show()
# # Create a new dataframe with only the 'Close' column
# data = df.filter(['Close'])
# # Convert the dataframe to a numpy array
# dataset = data.values
# # Get the number of rows to train the model on
# training_data_len = int(np.ceil( len(dataset) * .95 ))
# training_data_len
# # Scale the data
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler(feature_range=(0,1))
# scaled_data = scaler.fit_transform(dataset)
# scaled_data
# ####################################################
# # Create the training data set
# # Create the scaled training data set
# train_data = scaled_data[0:int(training_data_len), :]
# # Split the data into x_train and y_train data sets
# x_train = []
# y_train = []
# for i in range(60, len(train_data)):
# x_train.append(train_data[i-60:i, 0])
# y_train.append(train_data[i, 0])
# if i<= 61:
# print(x_train)
# print(y_train)
# print()
# # Convert the x_train and y_train to numpy arrays
# x_train, y_train = np.array(x_train), np.array(y_train)
# # Reshape the data
# x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
# # x_train.shape
# ######################################################################
# from keras.models import Sequential
# from keras.layers import Dense, LSTM
# # Build the LSTM model
# model = Sequential()
# model.add(LSTM(128, return_sequences=True, input_shape= (x_train.shape[1], 1)))
# model.add(LSTM(64, return_sequences=False))
# model.add(Dense(25))
# model.add(Dense(1))
# # Compile the model
# model.compile(optimizer='adam', loss='mean_squared_error')
# # Train the model
# model.fit(x_train, y_train, batch_size=1, epochs=1)
# ##########################################################################
# # Create the testing data set
# # Create a new array containing the scaled values from index (training_data_len - 60) to the end
# test_data = scaled_data[training_data_len - 60: , :]
# # Create the data sets x_test and y_test
# x_test = []
# y_test = dataset[training_data_len:, :]
# for i in range(60, len(test_data)):
# x_test.append(test_data[i-60:i, 0])
# # Convert the data to a numpy array
# x_test = np.array(x_test)
# # Reshape the data
# x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1 ))
# # Get the models predicted price values
# predictions = model.predict(x_test)
# predictions = scaler.inverse_transform(predictions)
# # Get the root mean squared error (RMSE)
# rmse = np.sqrt(np.mean(((predictions - y_test) ** 2)))
# rmse
# ###########################################################################
# # Plot the data
# train = data[:training_data_len]
# valid = data[training_data_len:]
# valid['Predictions'] = predictions
# # Visualize the data
# plt.figure(figsize=(16,6))
# plt.title('Model')
# plt.xlabel('Date', fontsize=18)
# plt.ylabel('Close Price USD ($)', fontsize=18)
# plt.plot(train['Close'])
# plt.plot(valid[['Close', 'Predictions']])
# plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
# plt.show()
# #########################################################################
# # Show the valid and predicted prices
# valid
# print(valid)