-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA_Porto_Seguro_gh.R
432 lines (296 loc) · 19.9 KB
/
EDA_Porto_Seguro_gh.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
### EXPLORATORY ANALYSIS
#COMPETITION: Porto Seguro's Safe Driver Prediction
#URL OF THE DATASET: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
### loading libraries
library('ggplot2')
library('dplyr')
library('readr')
library('gridExtra')
### loading files (assumes that files are in current working directory)
train <- read_csv('./train.csv')
test <- read_csv('./test.csv')
str(train)
###################################################################################################
### TRAIN DATASET ###
###################################################################################################
## missing data ##
sum(is.na(train)) # no NA's in the train dataset
# according with data description, missing values/NA's are with -1 value
# check how many rows with -1 values per column
na_vector <- apply(train, 2, function(col) sum(col %in% -1)) #some columns have NA values
na_vector
# check how many columns are columns with NA's
na_features <- na_vector[na_vector!=0] #
length(na_features) # 13 columns with na's
# evaluate percentage of -1 values per feature
options(scipen=999) #to remove scientific notation
na_percentage <- round(apply(train, 2, function(col) sum(col %in% -1)/length(col)*100),3)
na_percentage
#some columns have more than half of its values with missing data
# focusing only on features with missing data
features_names <- colnames(train) #get features names to plot later
na_percentage_df <- data.frame(features_names,na_percentage) # create dataframe with features names and % of missing data
# plotting just features with na's to check percentage
features_with_na <- filter(na_percentage_df,na_percentage!=0)
features_with_na_plot <- ggplot(data=features_with_na, aes(x=features_names,y=na_percentage))
features_with_na_plot <- features_with_na_plot + geom_bar(stat="identity", width = 0.5) + coord_flip()
features_with_na_plot
# plot suggest that at least one feature might be removed because of missing data (more than 60% missing)
# funtion na_if from dplyr used to replace -1 values with proper NA's on train dataset
train1 <- na_if(train,-1)
# explore columns to check what feature columns need to be converted to factor
summary(train1)
#get columns names and indexes for my reference
col_indexes = data.frame(colnames(train1))
# convert features to factors
categorical_indexes <- c(2:20,24:35,43:59)
for(i in categorical_indexes)
{
train1[[i]] <- as.factor(train1[[i]])
}
summary(train1)
# mode function to categorical features exploration
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# subset of categorical features
categorical_train1 <- select(train1, categorical_indexes)
#48 features are categorical, ordinal or binay in my dataset
# functions to get statistics for categorical features in train dataset
cardinality <- apply(categorical_train1, 2, function(col) length(unique(col)))
mode_value <- apply(categorical_train1, 2, function(col) Mode(col))
mode_freq <- apply(categorical_train1, 2, function(col) sum(col==Mode(col),na.rm=T))
mode_perc <- apply(categorical_train1, 2, function(col) round(sum(col==Mode(col),na.rm=T)/length(col)*100,2))
# generate a dataframe with statistics over categorical features
categorical_stats <- data.frame(cardinality,mode_value,mode_freq,mode_perc)
#from the observation of the dataframe generated by command above we conclude that:
# - most of the low cardinality features have an imbalanced distribution towards mode
# - features ps_car_05_cat and ps_car_03_cat have more NA values than non-NA values (as I have shown on NA's plot)
# - there are categorical features with high cardinality
# subset of continuous features
continuous_train1 <- select(train1, -categorical_indexes)
continuous_train1 <- continuous_train1[,-1] # id feature is not relevant
summary(continuous_train1)
# creating plot objects for continuous features to visually check distribution
# ps_reg_01
ps_reg_01_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_reg_01))
ps_reg_01_his <- ps_reg_01_his + labs(x="ps_reg_01_his",y="Occurences")
ps_reg_01_his <- ps_reg_01_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_reg_01_his <- ps_reg_01_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_reg_02
ps_reg_02_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_reg_02))
ps_reg_02_his <- ps_reg_02_his + labs(x="ps_reg_02_his",y="Occurences")
ps_reg_02_his <- ps_reg_02_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_reg_02_his <- ps_reg_02_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_reg_03
ps_reg_03_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_reg_03))
ps_reg_03_his <- ps_reg_03_his + labs(x="ps_reg_03_his",y="Occurences")
ps_reg_03_his <- ps_reg_03_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_reg_03_his <- ps_reg_03_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_car_12
ps_car_12_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_12))
ps_car_12_his <- ps_car_12_his + labs(x="ps_car_12_his",y="Occurences")
ps_car_12_his <- ps_car_12_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_12_his <- ps_car_12_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_car_13
ps_car_13_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_13))
ps_car_13_his <- ps_car_13_his + labs(x="ps_car_13_his",y="Occurences")
ps_car_13_his <- ps_car_13_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_13_his <- ps_car_13_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_car_14
ps_car_14_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_14))
ps_car_14_his <- ps_car_14_his + labs(x="ps_car_14_his",y="Occurences")
ps_car_14_his <- ps_car_14_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_14_his <- ps_car_14_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_car_15
ps_car_15_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_15))
ps_car_15_his <- ps_car_15_his + labs(x="ps_car_15_his",y="Occurences")
ps_car_15_his <- ps_car_15_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_15_his <- ps_car_15_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_calc_01
ps_calc_01_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_calc_01))
ps_calc_01_his <- ps_calc_01_his + labs(x="ps_calc_01_his",y="Occurences")
ps_calc_01_his <- ps_calc_01_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_calc_01_his <- ps_calc_01_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_calc_02
ps_calc_02_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_calc_02))
ps_calc_02_his <- ps_calc_02_his + labs(x="ps_calc_02_his",y="Occurences")
ps_calc_02_his <- ps_calc_02_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_calc_02_his <- ps_calc_02_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# ps_calc_03
ps_calc_03_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_calc_03))
ps_calc_03_his <- ps_calc_03_his + labs(x="ps_calc_03_his",y="Occurences")
ps_calc_03_his <- ps_calc_03_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_calc_03_his <- ps_calc_03_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
# plot all the above in subplots
gridplot <- grid.arrange(ps_reg_01_his, ps_reg_02_his,ps_reg_03_his,ps_car_12_his,
ps_car_13_his, ps_car_14_his, ps_car_15_his, ps_calc_01_his,
ps_calc_02_his, ps_calc_03_his, nrow = 5)
#from plots above I conclude that:
# - most of the continuous features have a skewed distribution
# - this skewed distribution might suggest the presence of outliers
# - there are at least 4 features with low cardinality for a continuous feature which might suggest
#a conversion to factor:
# - ps_reg_01 # feature index 21
# - ps_calc_01 # feature index 40
# - ps_calc_02 # feature index 41
# - ps_calc_03 # feature index 42
# convert features to factors (including additional indexes)
categorical_indexes <- c(2:21,24:35,40:59)
for(i in categorical_indexes)
{
train1[[i]] <- as.factor(train1[[i]])
}
# update subset of categorical features
categorical_train1 <- select(train1, categorical_indexes)
#52 features are categorical, ordinal or binay in my dataset
# functions to get statistics for categorical features in train1 dataset
cardinality <- apply(categorical_train1, 2, function(col) length(unique(col)))
mode_value <- apply(categorical_train1, 2, function(col) Mode(col))
mode_freq <- apply(categorical_train1, 2, function(col) sum(col==Mode(col),na.rm=T))
mode_perc <- apply(categorical_train1, 2, function(col) round(sum(col==Mode(col),na.rm=T)/length(col)*100,2))
# generate a dataframe with statistics over categorical features
categorical_stats <- data.frame(cardinality,mode_value,mode_freq,mode_perc)
## handling with missing values
features_with_na_plot
# feature ps_car_03_cat has more than 60% of missing values
# feature ps_car_05_cat has more than 40% of missing values
# feature ps_car_03_cat
# first, lets check the proportion of claimers by ps_car_03_cat categories
ps_car_03_cat_0_claim_perc <- nrow(filter(train1,ps_car_03_cat == 0 & target==1))/nrow(filter(train1,ps_car_03_cat == 0))*100
ps_car_03_cat_1_claim_perc <- nrow(filter(train1,ps_car_03_cat == 1 & target==1))/nrow(filter(train1,ps_car_03_cat == 1))*100
ps_car_03_cat_NA_claim_perc <- nrow(filter(train1,is.na(train1$ps_car_03_cat) & target==1))/nrow(filter(train1,is.na(train1$ps_car_03_cat)))*100
ps_car_03_cat_perc <- c(ps_car_03_cat_0_claim_perc,ps_car_03_cat_1_claim_perc,ps_car_03_cat_NA_claim_perc)
ps_car_03_cat_perc_df <- data.frame(c("0","1","NA"),ps_car_03_cat_perc)
names(ps_car_03_cat_perc_df) <- c('levels','claimers_perc')
ps_car_03_cat <- ggplot(data=ps_car_03_cat_perc_df, aes(x=levels,y=claimers_perc)) + geom_bar(stat="identity",aes(fill=claimers_perc), width = 0.5)
ps_car_03_cat
#it seems that there is a difference in proportion of claimers across different categories of ps_car_03_cat_perc
#ideally I would do a statistical test to prove it...
# However, I will remove ps_car_03_cat (module book suggest to remove features with more than 60% of missing data)
train2 <- train1[ ,-which(names(train1) %in% 'ps_car_03_cat')]
# feature ps_car_05_cat
# first, lets check the proportion of claimers by ps_car_05_cat categories
ps_car_05_cat_0_claim_perc <- nrow(filter(train1,ps_car_05_cat == 0 & target==1))/nrow(filter(train1,ps_car_05_cat == 0))*100
ps_car_05_cat_1_claim_perc <- nrow(filter(train1,ps_car_05_cat == 1 & target==1))/nrow(filter(train1,ps_car_05_cat == 1))*100
ps_car_05_cat_NA_claim_perc <- nrow(filter(train1,is.na(train1$ps_car_05_cat) & target==1))/nrow(filter(train1,is.na(train1$ps_car_05_cat)))*100
ps_car_05_cat_perc <- c(ps_car_05_cat_0_claim_perc,ps_car_05_cat_1_claim_perc,ps_car_05_cat_NA_claim_perc)
ps_car_05_cat_perc_df <- data.frame(c("0","1","NA"),ps_car_05_cat_perc)
names(ps_car_05_cat_perc_df) <- c('levels','claimers_perc')
ps_car_05_cat <- ggplot(data=ps_car_05_cat_perc_df, aes(x=levels,y=claimers_perc)) + geom_bar(stat="identity",aes(fill=claimers_perc), width = 0.5)
ps_car_05_cat
#It seems that levels 0 and 1 have no difference in percentage of claimers but NA level is below these two features
#ideally, I would do a statistical test to prove this difference...
# I will update ps_car_05_cat_perc to encode missing/non-missing for this category as recommended by module book
train2 <- within(train2, ps_car_05_cat[ps_car_05_cat == 0 | ps_car_05_cat == 1] <- 1)
train2 <- within(train2, ps_car_05_cat[is.na(ps_car_05_cat)] <- 0)
summary(train2$ps_car_05_cat) # ok
# on remaining features with missing values, I will impute values, depending on what type of feature it is
#ps_reg_03 is continuous and skewed -> median imputation
train2 <- within(train2, ps_reg_03[is.na(ps_reg_03)] <- median(train2$ps_reg_03,na.rm=T))
#ps_ind_05_cat is categorical with one predominant category -> imputation of predominant category
train2 <- within(train2, ps_ind_05_cat[is.na(ps_ind_05_cat)] <- Mode(train2$ps_ind_05_cat))
#ps_ind_04_cat is categorical with very few NA's -> imputation of predominant category
train3 <- within(train2, ps_ind_04_cat[is.na(ps_ind_04_cat)] <- Mode(train2$ps_ind_04_cat))
#ps_ind_02_cat is categorical with one predominant category -> imputation of predominant category
train3 <- within(train3, ps_ind_02_cat[is.na(ps_ind_02_cat)] <- Mode(train3$ps_ind_02_cat))
#ps_car_14 is continuous and normal distributed -> mean imputation
train3 <- within(train3, ps_car_14[is.na(ps_car_14)] <- mean(train3$ps_car_14,na.rm=T))
#ps_car_11 is categorical with one predominant category -> imputation of predominant category
train4 <- within(train3, ps_car_11[is.na(ps_car_11)] <- Mode(train3$ps_car_11))
#ps_car_09_cat is categorical with one predominant category -> imputation of predominant category
train4 <- within(train4, ps_car_09_cat[is.na(ps_car_09_cat)] <- Mode(train4$ps_car_09_cat))
#ps_car_07_cat is categorical with one predominant category -> imputation of predominant category
train4 <- within(train4, ps_car_07_cat[is.na(ps_car_07_cat)] <- Mode(train4$ps_car_07_cat))
#ps_car_02_cat is categorical with one predominant category -> imputation of predominant category
train5 <- within(train4, ps_car_02_cat[is.na(ps_car_02_cat)] <- Mode(train4$ps_car_02_cat))
#ps_car_01_cat is categorical with one predominant category -> imputation of predominant category
train6 <- within(train5, ps_car_01_cat[is.na(ps_car_01_cat)] <- Mode(train5$ps_car_01_cat))
sum(is.na(train6)) #still one occurence with na that escaped from filters...
# remaining occurence of NA is on ps_car_12 feature -> median imputation
train7 <- within(train6, ps_car_12[is.na(ps_car_12)] <- median(train6$ps_car_12,na.rm=T))
sum(is.na(train7)) # train dataset with no missing data!
#write clean train dataframe to current directory
write_csv(train7, path = "./traindf1.csv", na = "NA")
###################################################################################################
### TEST DATASET ###
###################################################################################################
# same cleaning operations to be done in test dataset
## check NA's in test dataset
sum(is.na(test)) # no NA's in the test dataset
# according with data description, missing values/NA's are with -1 value
# check how many rows with -1 values per column
na_vector_test <- apply(test, 2, function(col) sum(col %in% -1)) #some columns have NA values
na_vector_test
# check what are columns with NA's
na_vector_test[na_vector_test!=0] #
length(na_vector_test[na_vector_test!=0]) # 12 columns with na's
# evaluate % of -1 values per feature
options(scipen=999) #to remove scientific notation
na_percentage_test <- round(apply(test, 2, function(col) sum(col %in% -1)/length(col)*100),3)
#some columns have more than half of its values with missing data
# focusing only on features with missing data
features_names_test <- colnames(test) #get features names to plot later
na_percentage_test_df <- data.frame(features_names_test,na_percentage_test) # create dataframe with features names and % of missing data
# plotting just features with na's to check percentage
features_with_na_test <- filter(na_percentage_test_df,na_percentage_test!=0)
features_with_na_plot_test <- ggplot(data=features_with_na_test, aes(x=features_names_test,y=na_percentage_test))
features_with_na_plot_test <- features_with_na_plot_test + geom_bar(stat="identity", width = 0.5) + coord_flip()
features_with_na_plot_test
# na percentage plot similar to train dataset
# funtion na_if from dplyr used to replace -1 values with proper NA's on test dataset
test1 <- na_if(test,-1)
# convert features to factors (including additional indexes)
categorical_indexes_test <- c(2:20,23:34,39:58)
for(i in categorical_indexes_test)
{
test1[[i]] <- as.factor(test1[[i]])
}
# update subset of categorical features
categorical_test1 <- select(test1, categorical_indexes_test)
#51 features are categorical, ordinal or binay in my test dataset
# functions to get statistics for categorical features in test1 dataset
cardinality_test <- apply(categorical_test1, 2, function(col) length(unique(col)))
mode_value_test <- apply(categorical_test1, 2, function(col) Mode(col))
mode_freq_test <- apply(categorical_test1, 2, function(col) sum(col==Mode(col),na.rm=T))
mode_perc_test <- apply(categorical_test1, 2, function(col) round(sum(col==Mode(col),na.rm=T)/length(col)*100,2))
# generate a dataframe with statistics over categorical features
categorical_stats_test <- data.frame(cardinality_test,mode_value_test,mode_freq_test,mode_perc_test)
# categorical stats on test set very similar to train set
## handling with missing values
features_with_na_plot_test
# feature ps_car_03_cat has more than 60% of missing values
# feature ps_car_05_cat has more than 40% of missing values
# Remove ps_car_03_cat_perc (similarly to train set)
test2 <- test1[ ,-which(names(test1) %in% 'ps_car_03_cat')]
# I will update ps_car_05_cat_perc to encode missing/non-missing (similarly to train set)
test2 <- within(test2, ps_car_05_cat[ps_car_05_cat == 0 | ps_car_05_cat == 1] <- 1)
test2 <- within(test2, ps_car_05_cat[is.na(ps_car_05_cat)] <- 0)
summary(test2$ps_car_05_cat) # ok
#on remaining features with missing values, I will impute values, with the same rationale as
#I did on the train set
#ps_reg_03 is continuous and skewed -> median imputation
test2 <- within(test2, ps_reg_03[is.na(ps_reg_03)] <- median(test2$ps_reg_03,na.rm=T))
#ps_ind_05_cat is categorical with one predominant category -> imputation of predominant category
test2 <- within(test2, ps_ind_05_cat[is.na(ps_ind_05_cat)] <- Mode(test2$ps_ind_05_cat))
#ps_ind_04_cat is categorical with one predominant category -> imputation of predominant category
test3 <- within(test2, ps_ind_04_cat[is.na(ps_ind_04_cat)] <- Mode(test2$ps_ind_04_cat))
#ps_ind_02_cat is categorical with one predominant category -> imputation of predominant category
test3 <- within(test3, ps_ind_02_cat[is.na(ps_ind_02_cat)] <- Mode(test3$ps_ind_02_cat))
#ps_car_14 is continuous and normal distributed -> mean imputation
test3 <- within(test3, ps_car_14[is.na(ps_car_14)] <- mean(test3$ps_car_14,na.rm=T))
#ps_car_09_cat is categorical with one predominant category -> imputation of predominant category
test4 <- within(test3, ps_car_09_cat[is.na(ps_car_09_cat)] <- Mode(test3$ps_car_09_cat))
#ps_car_07_cat is categorical with one predominant category -> imputation of predominant category
test4 <- within(test4, ps_car_07_cat[is.na(ps_car_07_cat)] <- Mode(test4$ps_car_07_cat))
#ps_car_02_cat is categorical with one predominant category -> imputation of predominant category
test5 <- within(test4, ps_car_02_cat[is.na(ps_car_02_cat)] <- Mode(test4$ps_car_02_cat))
#ps_car_01_cat is categorical with one predominant category -> imputation of predominant category
test6 <- within(test5, ps_car_01_cat[is.na(ps_car_01_cat)] <- Mode(test5$ps_car_01_cat))
sum(is.na(test6)) #still one occurence with na that escaped from filters...
# remaining occurence of NA is on ps_car_11 feature (categorical) -> imputation of predominant category
test7 <- within(test6, ps_car_11[is.na(ps_car_11)] <- Mode(test6$ps_car_11))
sum(is.na(train7)) # train dataset with no missing data!
### write clean test dataframe to current directory
write_csv(test7, path = "./testdf1.csv", na = "NA")