# scrp_yfinance.py

# '''


# In this notebook we will be looking at data from the stock market, 
# particularly some technology stocks. We will learn how to use pandas
#  to get stock information, visualize different aspects of it, and 
# finally we will look at a few ways of analyzing the risk of a stock, 
# based on its previous performance history. We will also be predicting 
# future stock prices through a Long Short Term Memory (LSTM) method!

# We'll be answering the following questions along the way:

# 1.) What was the change in price of the stock over time?
# 2.) What was the daily return of the stock on average?
# 3.) What was the moving average of the various stocks?
# 4.) What was the correlation between different stocks?
# 5.) How much value do we put at risk by investing in a particular stock?
# 6.) How can we attempt to predict future stock behavior? (Predicting the closing stock price of APPLE inc using LSTM)

# '''

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling, applied once at import time: seaborn's 'whitegrid' style is
# set first, then matplotlib's "fivethirtyeight" style sheet is applied.
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")

#%matplotlib inline

# For reading stock data from yahoo
from pandas_datareader.data import DataReader

# For time stamps
from datetime import datetime

# Ticker symbols to download, listed in the order they are processed.
tech_list = [
    "AAPL",
    "GOOG",
    "MSFT",
    "AMZN",
    "FB",
    "F",
    "TSLA",
    "BBC",
    "TSM",
    "INTC",
    "AMD",
    "AZN",
]

# Number of calendar years of history to request; change this to widen or
# narrow the download window.
LOOKBACK_YEARS = 15


def years_before(moment, years):
    """Return midnight on the calendar date ``years`` years before ``moment``.

    The time-of-day of ``moment`` is dropped. When ``moment`` falls on
    29 February and the target year is not a leap year, 28 February of the
    target year is returned instead of raising.

    Raises:
        ValueError: if the target year is outside the range ``datetime``
            supports.
    """
    target_year = moment.year - years
    try:
        return datetime(target_year, moment.month, moment.day)
    except ValueError:
        # Only a 29 February start can be rescued by moving to the 28th;
        # any other failure (e.g. year out of range) is a genuine error.
        if (moment.month, moment.day) != (2, 29):
            raise
        return datetime(target_year, 2, 28)


# Set up End and Start times for data grab
end = datetime.now()
start = years_before(end, LOOKBACK_YEARS)


# Download one price-history DataFrame per ticker, keyed by ticker symbol.
# NOTE(review): confirm the 'yahoo' data source still works with the installed
# pandas_datareader version.
stock_frames = {
    ticker: DataReader(ticker, 'yahoo', start, end) for ticker in tech_list
}

# Backward compatibility: each frame stays reachable as a module-level
# variable named after its ticker (AAPL, GOOG, ...), as before.
globals().update(stock_frames)

# Frames in tech_list order, so this list can never drift from the tickers
# that were actually downloaded.
company_list = [stock_frames[ticker] for ticker in tech_list]

# Display names, positionally matched to tech_list.
# NOTE(review): 'BBC' is labelled "Biontech" and 'FB' "FaceBook" here -- confirm
# these ticker symbols are the intended ones.
company_name = ["APPLE", "GOOGLE", "MICROSOFT", "AMAZON","FaceBook","Ford motors","Tesla","Biontech","TSMC semiconductor","Intel","Advace micro device","Astrazeneca PLC"]

# zip() would silently drop the unmatched tail, leaving some frames unlabelled,
# so fail loudly if the two lists ever get out of step.
if len(company_name) != len(company_list):
    raise ValueError(
        f"company_name has {len(company_name)} entries "
        f"but tech_list has {len(company_list)}"
    )

# Tag every row of each frame with its company's display name (in place).
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

# Stack all companies into one long frame and write it to disk.
df = pd.concat(company_list, axis=0)
#d=df.tail(10)
df.to_csv('stock.csv')
#df.to_excel('stock.xlsx')
#print(df)
# AAPL.describe()

# AAPL.info()

# # Let's see a historical view of the closing price


# plt.figure(figsize=(15, 6))
# plt.subplots_adjust(top=1.25, bottom=1.2)

# for i, company in enumerate(company_list, 1):
#     plt.subplot(2, 2, i)
#     company['Adj Close'].plot()
#     plt.ylabel('Adj Close')
#     plt.xlabel(None)
#     plt.title(f"Closing Price of {tech_list[i - 1]}")
    
# plt.tight_layout()

# # Now let's plot the total volume of stock being traded each day
# plt.figure(figsize=(15, 7))
# plt.subplots_adjust(top=1.25, bottom=1.2)

# for i, company in enumerate(company_list, 1):
#     plt.subplot(2, 2, i)
#     company['Volume'].plot()
#     plt.ylabel('Volume')
#     plt.xlabel(None)
#     plt.title(f"Sales Volume for {tech_list[i - 1]}")
    
# plt.tight_layout()

# ma_day = [10, 20, 50]

# for ma in ma_day:
#     for company in company_list:
#         column_name = f"MA for {ma} days"
#         company[column_name] = company['Adj Close'].rolling(ma).mean()


# fig, axes = plt.subplots(nrows=2, ncols=2)
# fig.set_figheight(8)
# fig.set_figwidth(15)

# AAPL[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[0,0])
# axes[0,0].set_title('APPLE')

# GOOG[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[0,1])
# axes[0,1].set_title('GOOGLE')

# MSFT[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[1,0])
# axes[1,0].set_title('MICROSOFT')

# AMZN[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(ax=axes[1,1])
# axes[1,1].set_title('AMAZON')

# fig.tight_layout()

# # We'll use pct_change to find the percent change for each day
# for company in company_list:
#     company['Daily Return'] = company['Adj Close'].pct_change()

# # Then we'll plot the daily return percentage
# fig, axes = plt.subplots(nrows=2, ncols=2)
# fig.set_figheight(8)
# fig.set_figwidth(15)

# AAPL['Daily Return'].plot(ax=axes[0,0], legend=True, linestyle='--', marker='o')
# axes[0,0].set_title('APPLE')

# GOOG['Daily Return'].plot(ax=axes[0,1], legend=True, linestyle='--', marker='o')
# axes[0,1].set_title('GOOGLE')

# MSFT['Daily Return'].plot(ax=axes[1,0], legend=True, linestyle='--', marker='o')
# axes[1,0].set_title('MICROSOFT')

# AMZN['Daily Return'].plot(ax=axes[1,1], legend=True, linestyle='--', marker='o')
# axes[1,1].set_title('AMAZON')

# fig.tight_layout()

# # Note the use of dropna() here, otherwise the NaN values can't be read by seaborn
# plt.figure(figsize=(12, 7))

# for i, company in enumerate(company_list, 1):
#     plt.subplot(2, 2, i)
#     sns.distplot(company['Daily Return'].dropna(), bins=100, color='purple')
#     plt.ylabel('Daily Return')
#     plt.title(f'{company_name[i - 1]}')
# # Could have also done:
# #AAPL['Daily Return'].hist()
# plt.tight_layout()

# # Grab all the closing prices for the tech stock list into one DataFrame
# closing_df = DataReader(tech_list, 'yahoo', start, end)['Adj Close']

# # Let's take a quick look
# closing_df.head() 

# # Make a new tech returns DataFrame
# tech_rets = closing_df.pct_change()
# tech_rets.head()

# # Comparing Google to itself should show a perfectly linear relationship
# sns.jointplot('GOOG', 'GOOG', tech_rets, kind='scatter', color='seagreen')

# # We'll use jointplot to compare the daily returns of Google and Microsoft
# sns.jointplot('GOOG', 'MSFT', tech_rets, kind='scatter')

# # We can simply call pairplot on our DataFrame for an automatic visual analysis 
# # of all the comparisons

# sns.pairplot(tech_rets, kind='reg')


# # Set up our figure by naming it returns_fig, call PairPLot on the DataFrame
# return_fig = sns.PairGrid(tech_rets.dropna())

# # Using map_upper we can specify what the upper triangle will look like.
# return_fig.map_upper(plt.scatter, color='purple')

# # We can also define the lower triangle in the figure, including the plot type (kde) 
# # or the color map (BluePurple)
# return_fig.map_lower(sns.kdeplot, cmap='cool_d')

# # Finally we'll define the diagonal as a series of histogram plots of the daily return
# return_fig.map_diag(plt.hist, bins=30)

# # Set up our figure by naming it returns_fig, call PairPLot on the DataFrame
# returns_fig = sns.PairGrid(closing_df)

# # Using map_upper we can specify what the upper triangle will look like.
# returns_fig.map_upper(plt.scatter,color='purple')

# # We can also define the lower triangle in the figure, including the plot type (kde) or the color map (BluePurple)
# returns_fig.map_lower(sns.kdeplot,cmap='cool_d')

# # Finally we'll define the diagonal as a series of histogram plots of the daily return
# returns_fig.map_diag(plt.hist,bins=30)

# # Let's go ahead and use seaborn for a quick correlation plot for the daily returns
# sns.heatmap(tech_rets.corr(), annot=True, cmap='summer')

# sns.heatmap(closing_df.corr(), annot=True, cmap='summer')


# # Let's start by defining a new DataFrame as a cleaned version of the original tech_rets DataFrame
# rets = tech_rets.dropna()

# area = np.pi * 20

# plt.figure(figsize=(10, 7))
# plt.scatter(rets.mean(), rets.std(), s=area)
# plt.xlabel('Expected return')
# plt.ylabel('Risk')

# for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
#     plt.annotate(label, xy=(x, y), xytext=(50, 50), textcoords='offset points', ha='right', va='bottom', 
#                  arrowprops=dict(arrowstyle='-', color='blue', connectionstyle='arc3,rad=-0.3'))


# # Get the stock quote
# df = DataReader('AAPL', data_source='yahoo', start='2012-01-01', end=datetime.now())
# # Show the data
# print(df)

# plt.figure(figsize=(16,6))
# plt.title('Close Price History')
# plt.plot(df['Close'])
# plt.xlabel('Date', fontsize=18)
# plt.ylabel('Close Price USD ($)', fontsize=18)
# plt.show()


# # Create a new dataframe with only the 'Close' column
# data = df.filter(['Close'])
# # Convert the dataframe to a numpy array
# dataset = data.values
# # Get the number of rows to train the model on
# training_data_len = int(np.ceil( len(dataset) * .95 ))

# training_data_len

# # Scale the data
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler(feature_range=(0,1))
# scaled_data = scaler.fit_transform(dataset)

# scaled_data

# ####################################################
# # Create the training data set 
# # Create the scaled training data set
# train_data = scaled_data[0:int(training_data_len), :]
# # Split the data into x_train and y_train data sets
# x_train = []
# y_train = []

# for i in range(60, len(train_data)):
#     x_train.append(train_data[i-60:i, 0])
#     y_train.append(train_data[i, 0])
#     if i<= 61:
#         print(x_train)
#         print(y_train)
#         print()
        
# # Convert the x_train and y_train to numpy arrays 
# x_train, y_train = np.array(x_train), np.array(y_train)

# # Reshape the data
# x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
# # x_train.shape


# ######################################################################

# from keras.models import Sequential
# from keras.layers import Dense, LSTM

# # Build the LSTM model
# model = Sequential()
# model.add(LSTM(128, return_sequences=True, input_shape= (x_train.shape[1], 1)))
# model.add(LSTM(64, return_sequences=False))
# model.add(Dense(25))
# model.add(Dense(1))

# # Compile the model
# model.compile(optimizer='adam', loss='mean_squared_error')

# # Train the model
# model.fit(x_train, y_train, batch_size=1, epochs=1)
# ##########################################################################
# # Create the testing data set
# # Create a new array containing scaled values from index 1543 to 2002 
# test_data = scaled_data[training_data_len - 60: , :]
# # Create the data sets x_test and y_test
# x_test = []
# y_test = dataset[training_data_len:, :]
# for i in range(60, len(test_data)):
#     x_test.append(test_data[i-60:i, 0])
    
# # Convert the data to a numpy array
# x_test = np.array(x_test)

# # Reshape the data
# x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1 ))

# # Get the models predicted price values 
# predictions = model.predict(x_test)
# predictions = scaler.inverse_transform(predictions)

# # Get the root mean squared error (RMSE)
# rmse = np.sqrt(np.mean(((predictions - y_test) ** 2)))
# rmse
# ###########################################################################

# # Plot the data
# train = data[:training_data_len]
# valid = data[training_data_len:]
# valid['Predictions'] = predictions
# # Visualize the data
# plt.figure(figsize=(16,6))
# plt.title('Model')
# plt.xlabel('Date', fontsize=18)
# plt.ylabel('Close Price USD ($)', fontsize=18)
# plt.plot(train['Close'])
# plt.plot(valid[['Close', 'Predictions']])
# plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
# plt.show()
# #########################################################################

# # Show the valid and predicted prices
# valid

# print(valid)