-
-
Notifications
You must be signed in to change notification settings - Fork 66
/
Copy pathExtremeGradientBoosting
64 lines (54 loc) · 2.05 KB
/
ExtremeGradientBoosting
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Packages
library(xgboost)
library(magrittr)
library(dplyr)
library(Matrix)
# Data ----
# Read the admissions data set straight from the course repository.
data <- read.csv(
  file = "https://raw.githubusercontent.com/bkrai/Statistical-Modeling-and-Graphs-with-R/main/binary.csv"
)
# `rank` is converted to a factor so that the model-matrix step further down
# expands it into indicator columns instead of using it as a plain number.
data[["rank"]] <- as.factor(data[["rank"]])
# Partition data ----
# Each row is independently assigned to group 1 (train, probability 0.8) or
# group 2 (test, probability 0.2), so the split sizes are only approximately
# 80/20. The fixed seed makes the assignment reproducible.
set.seed(1234)
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[ind == 1, ]
test <- data[ind == 2, ]
# Create matrix - One-Hot Encoding for Factor variables ----
# The formula `admit ~ . - 1` uses every column except `admit` as a predictor
# and drops the intercept; `admit` itself is kept separately as the label.

# Training set
train_label <- train[["admit"]]
trainm <- sparse.model.matrix(admit ~ . - 1, data = train)
train_matrix <- xgb.DMatrix(
  data = as.matrix(trainm),
  label = train_label
)

# Test set (same encoding as the training set)
test_label <- test[["admit"]]
testm <- sparse.model.matrix(admit ~ . - 1, data = test)
test_matrix <- xgb.DMatrix(
  data = as.matrix(testm),
  label = test_label
)
# Parameters ----
# Number of distinct classes observed in the training labels.
nc <- length(unique(train_label))

# Multi-class soft-probability objective, evaluated with multi-class log loss.
xgb_params <- list(
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  num_class = nc
)

# Data sets whose metric is recorded after every boosting round.
watchlist <- list(
  train = train_matrix,
  test = test_matrix
)
# eXtreme Gradient Boosting Model ----
# All booster settings are supplied through `params` instead of being passed
# as extra arguments to xgb.train(): the objective/metric settings from
# `xgb_params` are combined with the tuning values below, and the tree depth
# uses its canonical name `max_depth`.
# `missing = NA` is no longer passed here: it is an argument of xgb.DMatrix()
# (where NA is already the default), not a training parameter.
# NOTE(review): depending on the xgboost version, the R package may take its
# randomness from set.seed() rather than from `seed` - confirm if sampling
# (subsample / colsample_bytree < 1) is ever enabled.
bst_model <- xgb.train(
  params = c(
    xgb_params,
    list(
      eta = 0.001,
      max_depth = 3,
      gamma = 0,
      subsample = 1,
      colsample_bytree = 1,
      seed = 333
    )
  ),
  data = train_matrix,
  nrounds = 1000,
  watchlist = watchlist
)
# Training & test error plot ----
# One row per boosting round, with the log loss on each watched data set.
e <- data.frame(bst_model$evaluation_log)
# The y-axis spans both curves so the test line cannot fall outside the plot.
plot(
  e$iter, e$train_mlogloss,
  col = 'blue',
  ylim = range(e$train_mlogloss, e$test_mlogloss)
)
lines(e$iter, e$test_mlogloss, col = 'red')
# Lowest test log loss reached during training.
min(e$test_mlogloss)
# Row of the round that achieved it. The row is located by position instead of
# comparing against a hard-coded, rounded value, which would not match the
# stored double exactly and would break whenever the data or settings change.
e[which.min(e$test_mlogloss), ]
# Feature importance ----
# Importance table for the fitted booster, labelled with the column names of
# the training matrix, followed by its bar plot.
imp <- xgb.importance(
  feature_names = colnames(train_matrix),
  model = bst_model
)
xgb.plot.importance(imp)
# Prediction & confusion matrix - test data ----
p <- predict(bst_model, newdata = test_matrix)
# Arrange the class probabilities as one row per test observation and one
# column per class. A flat vector is taken to hold the `nc` probabilities of
# each observation consecutively (the layout the original reshape relied on);
# a result that is already a matrix is used as-is instead of being re-filled,
# which would scramble the probabilities.
# NOTE(review): which of the two shapes predict() returns depends on the
# installed xgboost version - confirm against its documentation.
prob <- if (is.matrix(p)) unname(p) else matrix(p, ncol = nc, byrow = TRUE)
stopifnot(
  nrow(prob) == length(test_label),
  ncol(prob) == nc
)
# `max_prob` holds the 0-based index of the most probable class (ties go to
# the last class), so it is directly comparable with the 0/1 labels.
pred <- data.frame(prob) %>%
  mutate(label = test_label, max_prob = max.col(prob, "last") - 1)
table(Prediction = pred$max_prob, Actual = pred$label)