EDA_Porto_Seguro_gh.R



### EXPLORATORY ANALYSIS

#COMPETITION: Porto Seguro's Safe Driver Prediction

#URL OF THE DATASET: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data

### loading libraries
library('ggplot2')
library('dplyr')
library('readr')
library('gridExtra')


### loading files (assumes that files are in current working directory)
train <- read_csv('./train.csv')
test <- read_csv('./test.csv')


str(train)


###################################################################################################
                                        ### TRAIN DATASET ###
###################################################################################################

## missing data ##

sum(is.na(train)) # no NA's in the train dataset

# according with data description, missing values/NA's are with -1 value

# check how many rows with -1 values per column
na_vector <- apply(train, 2, function(col) sum(col %in% -1)) #some columns have NA values
na_vector

# check how many columns are columns with NA's
na_features <- na_vector[na_vector!=0] #
length(na_features) # 13 columns with na's

# evaluate percentage of -1 values per feature
options(scipen=999) #to remove scientific notation
na_percentage <- round(apply(train, 2, function(col) sum(col %in% -1)/length(col)*100),3)
na_percentage
#some columns have more than half of its values with missing data

# focusing only on features with missing data
features_names <- colnames(train) #get features names to plot later
na_percentage_df <- data.frame(features_names,na_percentage) # create dataframe with features names and % of missing data

# plotting just features with na's to check percentage
features_with_na <- filter(na_percentage_df,na_percentage!=0)
features_with_na_plot <- ggplot(data=features_with_na, aes(x=features_names,y=na_percentage)) 
features_with_na_plot <- features_with_na_plot + geom_bar(stat="identity", width = 0.5) + coord_flip()
features_with_na_plot
# plot suggest that at least one feature might be removed because of missing data (more than 60% missing)


# funtion na_if from dplyr used to replace -1 values with proper NA's on train dataset
train1 <- na_if(train,-1)


# explore columns to check what feature columns need to be converted to factor
summary(train1)

#get columns names and indexes for my reference
col_indexes = data.frame(colnames(train1))

# convert features to factors
categorical_indexes <- c(2:20,24:35,43:59)
for(i in categorical_indexes)
{
  train1[[i]] <- as.factor(train1[[i]])
}


summary(train1)

# mode function to categorical features exploration
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}


# subset of categorical features
categorical_train1 <- select(train1, categorical_indexes) 
#48 features are categorical, ordinal or binay in my dataset

# functions to get statistics for categorical features in train dataset
cardinality <- apply(categorical_train1, 2, function(col) length(unique(col)))
mode_value <- apply(categorical_train1, 2, function(col) Mode(col))
mode_freq <- apply(categorical_train1, 2, function(col) sum(col==Mode(col),na.rm=T))
mode_perc <- apply(categorical_train1, 2, function(col) round(sum(col==Mode(col),na.rm=T)/length(col)*100,2))

# generate a dataframe with statistics over categorical features
categorical_stats <- data.frame(cardinality,mode_value,mode_freq,mode_perc)
#from the observation of the dataframe generated by command above we conclude that: 
# - most of the low cardinality features have an imbalanced distribution towards mode
# - features ps_car_05_cat and ps_car_03_cat have more NA values than non-NA values (as I have shown on NA's plot)
# - there are categorical features with high cardinality

# subset of continuous features
continuous_train1 <- select(train1, -categorical_indexes)
continuous_train1 <- continuous_train1[,-1] # id feature is not relevant

summary(continuous_train1)

# creating plot objects for continuous features to visually check distribution

# ps_reg_01
ps_reg_01_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_reg_01))
ps_reg_01_his <- ps_reg_01_his + labs(x="ps_reg_01_his",y="Occurences")
ps_reg_01_his <- ps_reg_01_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_reg_01_his <- ps_reg_01_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_reg_02
ps_reg_02_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_reg_02))
ps_reg_02_his <- ps_reg_02_his + labs(x="ps_reg_02_his",y="Occurences")
ps_reg_02_his <- ps_reg_02_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_reg_02_his <- ps_reg_02_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_reg_03
ps_reg_03_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_reg_03))
ps_reg_03_his <- ps_reg_03_his + labs(x="ps_reg_03_his",y="Occurences")
ps_reg_03_his <- ps_reg_03_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_reg_03_his <- ps_reg_03_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_car_12
ps_car_12_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_12))
ps_car_12_his <- ps_car_12_his + labs(x="ps_car_12_his",y="Occurences")
ps_car_12_his <- ps_car_12_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_12_his <- ps_car_12_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_car_13
ps_car_13_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_13))
ps_car_13_his <- ps_car_13_his + labs(x="ps_car_13_his",y="Occurences")
ps_car_13_his <- ps_car_13_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_13_his <- ps_car_13_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_car_14
ps_car_14_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_14))
ps_car_14_his <- ps_car_14_his + labs(x="ps_car_14_his",y="Occurences")
ps_car_14_his <- ps_car_14_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_14_his <- ps_car_14_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_car_15
ps_car_15_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_car_15))
ps_car_15_his <- ps_car_15_his + labs(x="ps_car_15_his",y="Occurences")
ps_car_15_his <- ps_car_15_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_car_15_his <- ps_car_15_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_calc_01
ps_calc_01_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_calc_01))
ps_calc_01_his <- ps_calc_01_his + labs(x="ps_calc_01_his",y="Occurences")
ps_calc_01_his <- ps_calc_01_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_calc_01_his <- ps_calc_01_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_calc_02
ps_calc_02_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_calc_02))
ps_calc_02_his <- ps_calc_02_his + labs(x="ps_calc_02_his",y="Occurences")
ps_calc_02_his <- ps_calc_02_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_calc_02_his <- ps_calc_02_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")


# ps_calc_03
ps_calc_03_his <- ggplot(continuous_train1, aes(x=continuous_train1$ps_calc_03))
ps_calc_03_his <- ps_calc_03_his + labs(x="ps_calc_03_his",y="Occurences")
ps_calc_03_his <- ps_calc_03_his + geom_histogram( colour="black", aes(y=..count..,fill=..count..))
ps_calc_03_his <- ps_calc_03_his + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")



# plot all the above in subplots
gridplot <- grid.arrange(ps_reg_01_his, ps_reg_02_his,ps_reg_03_his,ps_car_12_his,
                         ps_car_13_his, ps_car_14_his, ps_car_15_his, ps_calc_01_his,
                         ps_calc_02_his, ps_calc_03_his,  nrow = 5)

#from plots above I conclude that:
# - most of the continuous features have a skewed distribution 
# - this skewed distribution might suggest the presence of outliers
# - there are at least 4 features with low cardinality for a continuous feature which might suggest
#a conversion to factor:
#     - ps_reg_01    # feature index 21
#     - ps_calc_01   # feature index 40
#     - ps_calc_02   # feature index 41
#     - ps_calc_03   # feature index 42

# convert features to factors (including additional indexes)
categorical_indexes <- c(2:21,24:35,40:59)
for(i in categorical_indexes)
{
  train1[[i]] <- as.factor(train1[[i]])
}


# update subset of categorical features
categorical_train1 <- select(train1, categorical_indexes) 
#52 features are categorical, ordinal or binay in my dataset

# functions to get statistics for categorical features in train1 dataset
cardinality <- apply(categorical_train1, 2, function(col) length(unique(col)))
mode_value <- apply(categorical_train1, 2, function(col) Mode(col))
mode_freq <- apply(categorical_train1, 2, function(col) sum(col==Mode(col),na.rm=T))
mode_perc <- apply(categorical_train1, 2, function(col) round(sum(col==Mode(col),na.rm=T)/length(col)*100,2))

# generate a dataframe with statistics over categorical features
categorical_stats <- data.frame(cardinality,mode_value,mode_freq,mode_perc)


## handling with missing values
features_with_na_plot
# feature ps_car_03_cat has more than 60% of missing values
# feature ps_car_05_cat has more than 40% of missing values


# feature ps_car_03_cat
# first, lets check the proportion of claimers by ps_car_03_cat categories
ps_car_03_cat_0_claim_perc <- nrow(filter(train1,ps_car_03_cat == 0 & target==1))/nrow(filter(train1,ps_car_03_cat == 0))*100
ps_car_03_cat_1_claim_perc <- nrow(filter(train1,ps_car_03_cat == 1 & target==1))/nrow(filter(train1,ps_car_03_cat == 1))*100
ps_car_03_cat_NA_claim_perc <- nrow(filter(train1,is.na(train1$ps_car_03_cat) & target==1))/nrow(filter(train1,is.na(train1$ps_car_03_cat)))*100
ps_car_03_cat_perc <- c(ps_car_03_cat_0_claim_perc,ps_car_03_cat_1_claim_perc,ps_car_03_cat_NA_claim_perc)

ps_car_03_cat_perc_df <- data.frame(c("0","1","NA"),ps_car_03_cat_perc)
names(ps_car_03_cat_perc_df) <- c('levels','claimers_perc') 
ps_car_03_cat <- ggplot(data=ps_car_03_cat_perc_df, aes(x=levels,y=claimers_perc)) + geom_bar(stat="identity",aes(fill=claimers_perc), width = 0.5) 
ps_car_03_cat
#it seems that there is a difference in proportion of claimers across different categories of ps_car_03_cat_perc
#ideally I would do a statistical test to prove it...

# However, I will remove ps_car_03_cat (module book suggest to remove features with more than 60% of missing data)
train2 <- train1[ ,-which(names(train1) %in% 'ps_car_03_cat')]


# feature ps_car_05_cat
# first, lets check the proportion of claimers by ps_car_05_cat categories
ps_car_05_cat_0_claim_perc <- nrow(filter(train1,ps_car_05_cat == 0 & target==1))/nrow(filter(train1,ps_car_05_cat == 0))*100
ps_car_05_cat_1_claim_perc <- nrow(filter(train1,ps_car_05_cat == 1 & target==1))/nrow(filter(train1,ps_car_05_cat == 1))*100
ps_car_05_cat_NA_claim_perc <- nrow(filter(train1,is.na(train1$ps_car_05_cat) & target==1))/nrow(filter(train1,is.na(train1$ps_car_05_cat)))*100
ps_car_05_cat_perc <- c(ps_car_05_cat_0_claim_perc,ps_car_05_cat_1_claim_perc,ps_car_05_cat_NA_claim_perc)

ps_car_05_cat_perc_df <- data.frame(c("0","1","NA"),ps_car_05_cat_perc)
names(ps_car_05_cat_perc_df) <- c('levels','claimers_perc') 
ps_car_05_cat <- ggplot(data=ps_car_05_cat_perc_df, aes(x=levels,y=claimers_perc)) + geom_bar(stat="identity",aes(fill=claimers_perc), width = 0.5) 
ps_car_05_cat
#It seems that levels 0 and 1 have no difference in percentage of claimers but NA level is below these two features
#ideally, I would do a statistical test to prove this difference...

# I will update ps_car_05_cat_perc to encode missing/non-missing for this category as recommended by module book
train2 <- within(train2, ps_car_05_cat[ps_car_05_cat == 0 | ps_car_05_cat == 1] <- 1)
train2 <- within(train2, ps_car_05_cat[is.na(ps_car_05_cat)] <- 0)
summary(train2$ps_car_05_cat) # ok


# on remaining features with missing values, I will impute values, depending on what type of feature it is

#ps_reg_03 is continuous and skewed -> median imputation
train2 <- within(train2, ps_reg_03[is.na(ps_reg_03)] <- median(train2$ps_reg_03,na.rm=T))

#ps_ind_05_cat is categorical with one predominant category ->  imputation of predominant category
train2 <- within(train2, ps_ind_05_cat[is.na(ps_ind_05_cat)] <- Mode(train2$ps_ind_05_cat))

#ps_ind_04_cat is categorical with very few NA's ->  imputation of predominant category
train3 <- within(train2, ps_ind_04_cat[is.na(ps_ind_04_cat)] <- Mode(train2$ps_ind_04_cat))

#ps_ind_02_cat is categorical with one predominant category ->  imputation of predominant category
train3 <- within(train3, ps_ind_02_cat[is.na(ps_ind_02_cat)] <- Mode(train3$ps_ind_02_cat))

#ps_car_14 is continuous and normal distributed -> mean imputation
train3 <- within(train3, ps_car_14[is.na(ps_car_14)] <- mean(train3$ps_car_14,na.rm=T))

#ps_car_11 is categorical with one predominant category ->  imputation of predominant category
train4 <- within(train3, ps_car_11[is.na(ps_car_11)] <- Mode(train3$ps_car_11))

#ps_car_09_cat is categorical with one predominant category ->  imputation of predominant category
train4 <- within(train4, ps_car_09_cat[is.na(ps_car_09_cat)] <- Mode(train4$ps_car_09_cat))

#ps_car_07_cat is categorical with one predominant category ->  imputation of predominant category
train4 <- within(train4, ps_car_07_cat[is.na(ps_car_07_cat)] <- Mode(train4$ps_car_07_cat))

#ps_car_02_cat is categorical with one predominant category ->  imputation of predominant category
train5 <- within(train4, ps_car_02_cat[is.na(ps_car_02_cat)] <- Mode(train4$ps_car_02_cat))

#ps_car_01_cat is categorical with one predominant category ->  imputation of predominant category
train6 <- within(train5, ps_car_01_cat[is.na(ps_car_01_cat)] <- Mode(train5$ps_car_01_cat))

sum(is.na(train6)) #still one occurence with na that escaped from filters...

# remaining occurence of NA is on ps_car_12 feature -> median imputation
train7 <- within(train6, ps_car_12[is.na(ps_car_12)] <- median(train6$ps_car_12,na.rm=T))

sum(is.na(train7)) # train dataset with no missing data!

#write clean train dataframe to current directory
write_csv(train7, path = "./traindf1.csv", na = "NA")


###################################################################################################
                                     ### TEST DATASET ###
###################################################################################################
# same cleaning operations to be done in test dataset
## check NA's in test dataset
sum(is.na(test)) # no NA's in the test dataset

# according with data description, missing values/NA's are with -1 value

# check how many rows with -1 values per column
na_vector_test <- apply(test, 2, function(col) sum(col %in% -1)) #some columns have NA values
na_vector_test

# check what are columns with NA's
na_vector_test[na_vector_test!=0] #
length(na_vector_test[na_vector_test!=0]) # 12 columns with na's


# evaluate % of -1 values per feature
options(scipen=999) #to remove scientific notation
na_percentage_test <- round(apply(test, 2, function(col) sum(col %in% -1)/length(col)*100),3)
#some columns have more than half of its values with missing data

# focusing only on features with missing data
features_names_test <- colnames(test) #get features names to plot later
na_percentage_test_df <- data.frame(features_names_test,na_percentage_test) # create dataframe with features names and % of missing data

# plotting just features with na's to check percentage
features_with_na_test <- filter(na_percentage_test_df,na_percentage_test!=0)
features_with_na_plot_test <- ggplot(data=features_with_na_test, aes(x=features_names_test,y=na_percentage_test)) 
features_with_na_plot_test <- features_with_na_plot_test + geom_bar(stat="identity", width = 0.5) + coord_flip()
features_with_na_plot_test
# na percentage plot similar to train dataset


# funtion na_if from dplyr used to replace -1 values with proper NA's on test dataset
test1 <- na_if(test,-1)


# convert features to factors (including additional indexes)
categorical_indexes_test <- c(2:20,23:34,39:58)
for(i in categorical_indexes_test)
{
  test1[[i]] <- as.factor(test1[[i]])
}

# update subset of categorical features
categorical_test1 <- select(test1, categorical_indexes_test) 
#51 features are categorical, ordinal or binay in my test dataset

# functions to get statistics for categorical features in test1 dataset
cardinality_test <- apply(categorical_test1, 2, function(col) length(unique(col)))
mode_value_test <- apply(categorical_test1, 2, function(col) Mode(col))
mode_freq_test <- apply(categorical_test1, 2, function(col) sum(col==Mode(col),na.rm=T))
mode_perc_test <- apply(categorical_test1, 2, function(col) round(sum(col==Mode(col),na.rm=T)/length(col)*100,2))

# generate a dataframe with statistics over categorical features
categorical_stats_test <- data.frame(cardinality_test,mode_value_test,mode_freq_test,mode_perc_test)
# categorical stats on test set very similar to train set


## handling with missing values
features_with_na_plot_test
# feature ps_car_03_cat has more than 60% of missing values
# feature ps_car_05_cat has more than 40% of missing values


# Remove ps_car_03_cat_perc (similarly to train set)
test2 <- test1[ ,-which(names(test1) %in% 'ps_car_03_cat')]


# I will update ps_car_05_cat_perc to encode missing/non-missing (similarly to train set)
test2 <- within(test2, ps_car_05_cat[ps_car_05_cat == 0 | ps_car_05_cat == 1] <- 1)
test2 <- within(test2, ps_car_05_cat[is.na(ps_car_05_cat)] <- 0)
summary(test2$ps_car_05_cat) # ok


#on remaining features with missing values, I will impute values, with the same rationale as 
#I did on the train set

#ps_reg_03 is continuous and skewed -> median imputation
test2 <- within(test2, ps_reg_03[is.na(ps_reg_03)] <- median(test2$ps_reg_03,na.rm=T))

#ps_ind_05_cat is categorical with one predominant category ->  imputation of predominant category
test2 <- within(test2, ps_ind_05_cat[is.na(ps_ind_05_cat)] <- Mode(test2$ps_ind_05_cat))

#ps_ind_04_cat is categorical with one predominant category ->  imputation of predominant category
test3 <- within(test2, ps_ind_04_cat[is.na(ps_ind_04_cat)] <- Mode(test2$ps_ind_04_cat))

#ps_ind_02_cat is categorical with one predominant category ->  imputation of predominant category
test3 <- within(test3, ps_ind_02_cat[is.na(ps_ind_02_cat)] <- Mode(test3$ps_ind_02_cat))

#ps_car_14 is continuous and normal distributed -> mean imputation
test3 <- within(test3, ps_car_14[is.na(ps_car_14)] <- mean(test3$ps_car_14,na.rm=T))

#ps_car_09_cat is categorical with one predominant category ->  imputation of predominant category
test4 <- within(test3, ps_car_09_cat[is.na(ps_car_09_cat)] <- Mode(test3$ps_car_09_cat))

#ps_car_07_cat is categorical with one predominant category ->  imputation of predominant category
test4 <- within(test4, ps_car_07_cat[is.na(ps_car_07_cat)] <- Mode(test4$ps_car_07_cat))

#ps_car_02_cat is categorical with one predominant category ->  imputation of predominant category
test5 <- within(test4, ps_car_02_cat[is.na(ps_car_02_cat)] <- Mode(test4$ps_car_02_cat))

#ps_car_01_cat is categorical with one predominant category ->  imputation of predominant category
test6 <- within(test5, ps_car_01_cat[is.na(ps_car_01_cat)] <- Mode(test5$ps_car_01_cat))


sum(is.na(test6)) #still one occurence with na that escaped from filters...

# remaining occurence of NA is on ps_car_11 feature (categorical) -> imputation of predominant category
test7 <- within(test6, ps_car_11[is.na(ps_car_11)] <- Mode(test6$ps_car_11))

sum(is.na(train7)) # train dataset with no missing data!


### write clean test dataframe to current directory
write_csv(test7, path = "./testdf1.csv", na = "NA")