# Trains an XGBoost classification model for ecological integrity estimation
library('tidyverse')
library('xgboost')
library('fastDummies')
library('caret')
library('hardhat')
# Fix the RNG seed so the train/test partition is reproducible
set.seed(1)
# ======================Input============================
input_folder <- 'data/model_input/slic/2017'
output_folder <- 'output/models/xgb slic v11'
categorical_variables <- c('holdridge',
'land_cover')
# remove_variable <- c('edge_distance')
remove_variable <- c('')  # empty string: no variables removed
# coordinate_variables <- c('x','y')
coordinate_variables <- c('ID')  # segment ID when SLIC superpixels are used
# ==================Processing data======================
df <- list.files(input_folder, "csv$", full.names = TRUE) %>%
map_dfr(read_csv)
df <- df %>%
  select(-any_of(remove_variable)) %>%
  drop_na() %>%
  mutate(across(all_of(c('hemerobia',
                         categorical_variables)),
                as.factor))
# Fix the full set of land cover levels (1-17), including classes absent
# from the sample, so the dummy columns are always the same
df$land_cover <- factor(df$land_cover, levels = 1:17)
# Create dummies for categorical
df <- dummy_cols(df, select_columns = categorical_variables)
# Create partition stratified by holdridge
# 70% for training and 30% for testing
train_index <- createDataPartition(df$holdridge, p = .7, list = FALSE)
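# Optional sanity check (a minimal added sketch): the stratified split
# should keep holdridge class proportions similar in both splits
print(rbind(train = prop.table(table(df$holdridge[train_index[, 1]])),
            test  = prop.table(table(df$holdridge[-train_index[, 1]]))))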
# Save csv with coordinates and indicator
# (1=training data point, 0=testing data point)
df_is_train <- df %>%
select(all_of(coordinate_variables))
df_is_train$is_train <- 0
df_is_train[train_index[,1],'is_train'] <- 1
write.csv(df_is_train,
paste0(output_folder,'/is_train.csv'),
row.names = FALSE)
# Split in training and testing
df_train <- df[train_index,]
df_test <- df[-train_index,]
rm(df)
rm(train_index)
# Transform the two data sets into xgb.DMatrix, excluding the label, the
# coordinates and the raw categorical columns from the predictors
# (dtrain/dtest avoid shadowing the xgb.train() function)
dtrain <- xgb.DMatrix(data = as.matrix(df_train %>%
                                         select(-all_of(c(coordinate_variables,
                                                          'hemerobia',
                                                          categorical_variables)))),
                      label = as.integer(df_train$hemerobia) - 1)
dtest <- xgb.DMatrix(data = as.matrix(df_test %>%
                                        select(-all_of(c(coordinate_variables,
                                                         'hemerobia',
                                                         categorical_variables)))),
                     label = as.integer(df_test$hemerobia) - 1)
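# Optional guard (a small added sketch): both matrices must expose the
# same number of predictor columns
stopifnot(ncol(dtrain) == ncol(dtest))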
# ======================Training model============================
# Define the parameters
params <- list(
booster="gbtree",
objective="multi:softprob",
eta=0.3,
gamma=0,
max_depth=10,
min_child_weight=1,
subsample=1,
colsample_bytree=0.7,
eval_metric="merror",
num_class=length(levels(df_train$hemerobia))
)
# Train the XGBoost classifier, stopping early once the test error
# has not improved for 10 rounds
xgb.fit <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 2000,
  early_stopping_rounds = 10,
  watchlist = list(train = dtrain, test = dtest),
  verbose = 2
)
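# Optional sanity check (a minimal sketch; `pred_prob` and `pred_class`
# are illustrative names): accuracy and confusion matrix on the test split
pred_prob <- matrix(predict(xgb.fit, dtest),
                    ncol = length(levels(df_test$hemerobia)),
                    byrow = TRUE)
pred_class <- factor(levels(df_test$hemerobia)[max.col(pred_prob)],
                     levels = levels(df_test$hemerobia))
print(confusionMatrix(pred_class, df_test$hemerobia))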
# =====================Saving output===========================
# Save model
xgb.save(xgb.fit, paste0(output_folder,'/xgb.fit'))
# Save list of input variables
write.csv(colnames(dtrain),
          paste0(output_folder,'/variables_list.csv'),
          row.names = FALSE)
# Save train and test error
write.csv(as.data.frame(xgb.fit$evaluation_log),
paste0(output_folder,'/error.csv'),
row.names = FALSE)
ggplot(xgb.fit$evaluation_log) +
geom_line(aes(iter, train_merror), col='blue') +
geom_line(aes(iter, test_merror), col='orange')
ggsave(paste0(output_folder,"/error.png"))
# Save variable importance
importance_matrix <- xgb.importance(colnames(dtrain),
                                    model = xgb.fit)
jpeg(file=paste0(output_folder,"/var_importance.jpeg"),
width = 200, height = 150, units='mm', res = 300)
xgb.plot.importance(importance_matrix = head(importance_matrix, 20))
dev.off()
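# Reload sketch (commented out; `new_df` is a hypothetical data frame with
# the same dummy-encoded predictors). A later prediction script could
# restore the model and the saved column order roughly like this:
# model <- xgb.load(paste0(output_folder, '/xgb.fit'))
# vars  <- read.csv(paste0(output_folder, '/variables_list.csv'))$x
# probs <- matrix(predict(model, xgb.DMatrix(as.matrix(new_df[, vars]))),
#                 ncol = params$num_class, byrow = TRUE)
# new_df$hemerobia_pred <- max.col(probs)  # 1-based class index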