# gbm_model.R
library(quantmod)
library("lubridate")
library("e1071")
# get daily market data for a small hand-picked set of symbols
# (Mattel, Sony, Apple, Amgen, Marriott) via quantmod
# note: Sony's US ticker has since changed from SNE to SONY, so that symbol may need updating
Stocks <- c("MAT","SNE","AAPL","AMGN","MAR")
getSymbols(Stocks, from = '2012-01-01', to = '2017-04-17')
# merge them all together
stocks <- data.frame(as.xts(merge(AMGN,SNE,AAPL,MAT,MAR)))
head(stocks[,1:12],2)
# set outcome variable
outcomeSymbol <- 'MAR.Close'
# We are trying to determine whether the outcome symbol's close price (here Marriott, MAR)
# on the next trading day is higher or lower than on the current trading day
# shift outcome value to be on same line as predictors
library(xts)
stocks <- xts(stocks,order.by=as.Date(rownames(stocks)))
stocks <- as.data.frame(merge(stocks, lm1=lag(stocks[,outcomeSymbol],-1)))
stocks$outcome <- ifelse(stocks[,paste0(outcomeSymbol,'.1')] > stocks[,outcomeSymbol], 1, 0)
# The lag(..., -1) call shifts the outcome symbol's close back one trading day, so that
# tomorrow's close sits on the same row as today's predictors; after as.data.frame() the
# shifted column carries a '.1' suffix (e.g. 'MAR.Close.1'). We rely on this value for
# training and testing: the outcome is 1 when the next day's close is higher than today's
# close and 0 when it is lower.
# remove the shifted close column; only the binary outcome derived from it is needed
stocks <- stocks[,!names(stocks) %in% c(paste0(outcomeSymbol,'.1'))]
head(stocks)
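# --- Illustrative sketch (not part of the pipeline): what the lag/outcome step does ---
# A minimal example with a made-up 5-day series ('demo_close' is purely hypothetical):
# lag(x, -1) pulls the NEXT day's value onto the current row, so comparing it with the
# current value yields the 1/0 up/down outcome; the newest row gets NA because its
# "next day" is not in the data.
demo_close <- xts(c(10, 11, 9, 12, 12.5), order.by = as.Date('2017-01-02') + 0:4)
colnames(demo_close) <- 'demo.Close'
demo_df <- as.data.frame(merge(demo_close, lag(demo_close, -1)))
names(demo_df) <- c('demo.Close', 'demo.Close.next')
demo_df$outcome <- ifelse(demo_df$demo.Close.next > demo_df$demo.Close, 1, 0)
demo_df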
# cast the date field (currently stored as character row names) to a true Date
# and sort the rows in decreasing date order
stocks$date <- as.Date(row.names(stocks))
stocks <- stocks[order(stocks$date, decreasing = TRUE),]
# Pattern-maker function: scales the raw market data so any symbol can be compared with
# any other, then uses diff() with the lag values requested by the 'days' parameter to
# compute day differences and place them on the same row as the outcome.
# The roundByScaler parameter rounds the results to keep the columns tidy.
Difference_in_Days <- function(objDF, days=c(10), offLimitsSymbols=c('outcome'), roundByScaler=3) {
  # expects objDF to be sorted by date in decreasing order
  ind <- sapply(objDF, is.numeric)
  for (sym in names(objDF)[ind]) {
    if (!sym %in% offLimitsSymbols) {
      print(paste('*********', sym))
      # scale the column so symbols trading at different price levels are comparable
      objDF[,sym] <- round(scale(objDF[,sym]), roundByScaler)
      for (day in days) {
        # difference over 'day' rows (newest-first), padded with zeros for the oldest rows
        objDF[paste0(sym,'_',day)] <- c(diff(objDF[,sym], lag = day), rep(x=0, day)) * -1
      }
    }
  }
  return(objDF)
}
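# --- Illustrative sketch (not part of the pipeline): the diff()/padding trick above ---
# With rows sorted newest-first, c(diff(x, lag = k), rep(0, k)) * -1 gives
# "this row's value minus the value k rows (trading days) older"; the zeros pad the
# oldest rows, which have nothing older to difference against. 'demo_vals' is made up.
demo_vals <- c(5, 4, 6, 3)                    # newest first
c(diff(demo_vals, lag = 1), rep(0, 1)) * -1   # 5-4, 4-6, 6-3, then padding -> 1 -2 3 0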
# call the pattern-maker with a set of day differences, rounding to 2 decimal places
# (this takes a little while to run)
stocks <- Difference_in_Days(stocks, days=c(1,2,3,4,5,10,20), offLimitsSymbols=c('outcome'), roundByScaler=2)
# drop the most recent row (the first row, since the data is sorted by decreasing date);
# its outcome lies in the future, so it has no label
stocks <- stocks[2:nrow(stocks),]
# use POSIXlt to add calendar predictors: day of the week, day of the month, and month
stocks$wday <- as.POSIXlt(stocks$date)$wday
stocks$mday <- as.POSIXlt(stocks$date)$mday
stocks$mon  <- as.POSIXlt(stocks$date)$mon
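# --- Illustrative sketch (not part of the pipeline): what those POSIXlt fields return ---
# For a single made-up date; note wday is 0-based from Sunday and mon is 0-based from January.
demo_date <- as.POSIXlt(as.Date('2017-04-13'))
demo_date$wday   # 4  -> Thursday (0 = Sunday)
demo_date$mday   # 13 -> day of the month
demo_date$mon    # 3  -> April (0 = January)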
# the date field won't help as a predictor (every value is unique), so remove it
# and shuffle the rows with sample()
stocks <- subset(stocks, select=-c(date))
stocks <- stocks[sample(nrow(stocks)),]
# let's model
#We'll fit a simple gbm model and score it with AUC.
#See the gbm package documentation and caret's model details for the available tuning parameters:
library(caret)
#{caret} - modeling wrapper, functions, commands
predictorNames <- names(stocks)[names(stocks) != 'outcome']
head(predictorNames)
set.seed(1234)
split <- sample(nrow(stocks), floor(0.7*nrow(stocks)))
train <- stocks[split,]
test  <- stocks[-split,]
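# --- Optional sketch: a stratified split via caret's createDataPartition ---
# Unlike the plain sample() split above, createDataPartition keeps the up/down class
# balance roughly equal in both sets. Stored under separate (hypothetical) names so it
# does not replace the split actually used below.
stratIndex  <- createDataPartition(as.factor(stocks$outcome), p = 0.7, list = FALSE)
train_strat <- stocks[stratIndex,]
test_strat  <- stocks[-stratIndex,]
prop.table(table(train_strat$outcome))  # class mix should stay close to the full data's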
# gbm supports both regression and classification. As this is a binary classification
# problem, we steer it into classification mode by giving the outcome readable class
# labels ('yes'/'nope'); the train() call below converts them to a factor:
train$outcome <- ifelse(train$outcome==1,'yes','nope')
head(train$outcome)
#Caret offers many tuning functions to help you get as much as possible out of your models;
#the trainControl function controls how your data is resampled.
#It splits the training set internally and runs its own train/test passes
#to find the best settings for your model.
#In this case, we cross-validate twice, training on two different portions of the data
#before settling on the best tuning parameters (for gbm: number of trees, shrinkage,
#and interaction depth).
#You can also set these values yourself if you don't trust the function.
# create caret trainControl object to control the number of cross-validations performed
objControl <- trainControl(method='cv', number=2, returnResamp='none', summaryFunction = twoClassSummary, classProbs = TRUE)
#This is the heart of our modeling adventure: time to teach the model to recognize
#whether the outcome symbol's close will finish higher or lower on the next trading day.
#Because this is a classification model, we request ROC as the metric instead of the default RMSE:
# run model
bst <- train(train[,predictorNames], as.factor(train$outcome),
             method = 'gbm',
             trControl = objControl,
             metric = "ROC",
             tuneGrid = expand.grid(n.trees = 5, interaction.depth = 3,
                                    shrinkage = 0.1, n.minobsinnode = 1))
print(bst)
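# --- Optional sketch: which predictors the boosted trees leaned on most ---
# caret's varImp() wraps gbm's relative-influence measure; the plot gives a quick
# visual ranking of the strongest predictors.
gbmImp <- varImp(bst, scale = TRUE)
print(gbmImp)
plot(gbmImp, top = 20)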
#There are two types of prediction we can request here: raw or prob.
#Raw gives a class prediction (in our case yes or nope), while prob gives the probability
#of each class, i.e. how sure the model is about its choice.
#I always use prob, as I like to control the threshold myself and to use the AUC score,
#which requires probabilities rather than classes.
#There are situations where class values come in handy, such as multinomial models
#where you're predicting more than two values.
library(pROC)
#{pROC} - Area Under the Curve (AUC) functions
# score the model on the held-out test set (the 30% of rows not used for training)
predictions <- predict(object=bst, test[,predictorNames], type='prob')
head(predictions)
auc <- auc(test$outcome, predictions[[2]])
print(paste('AUC score:', auc))
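# --- Optional sketch: the 'raw' prediction path mentioned above ---
# type='raw' returns hard 'yes'/'nope' labels at caret's default 0.5 threshold;
# comparing them with the held-out outcomes gives a quick accuracy figure alongside AUC.
raw_predictions <- predict(object=bst, test[,predictorNames], type='raw')
head(raw_predictions)
accuracy <- mean(ifelse(raw_predictions == 'yes', 1, 0) == test$outcome)
print(paste('Accuracy at the 0.5 threshold:', round(accuracy, 3)))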
# AUC of the ROC curve is a better yardstick than raw accuracy when comparing classifiers.
# A run of this script reported an AUC of about 0.65 for very little work (an AUC of 0.5 is
# no better than random guessing and 1.0 is perfect). Hopefully this will pique your
# imagination with the many possibilities of quantmod and R.
# In the literature, AUC is often used in place of accuracy as the criterion for comparing
# learning algorithms; experiments using AUC have, for example, revealed a significant
# difference in performance between decision trees and Naive Bayes.
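# --- Optional sketch: visualize the ROC curve behind the AUC score above ---
# pROC's roc() builds the curve object; plot() draws it with the AUC in the title.
rocCurve <- roc(test$outcome, predictions[[2]])
plot(rocCurve, main = paste('ROC curve, AUC =', round(auc(rocCurve), 3)))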