-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProject_4.Rmd
263 lines (243 loc) · 12.1 KB
/
Project_4.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
---
title: "Telecom customer churn"
output: html_notebook
---
Loading the Excel dataset into R and storing it in an object
```{r}
# Read the Excel workbook from the working directory into `cell_data`.
library(readxl)
cell_data <- read_excel(path = "Cellphone.xlsx")

# Print a compact description of every column.
str(cell_data)

# Summarise every column apart from the 1st, 3rd and 4th.
summary(cell_data[, -c(1, 3, 4)])
```
Exploratory data analysis - EDA
```{r}
# Missing-data checks and a first look at the dependent variable
library(DataExplorer)

# Basic description of the dataset, used here to spot missing values.
introduce(cell_data)

# Chart of the missing values per variable, if there are any.
plot_missing(cell_data)

# Frequency of each Churn value (the dependent variable).
table(cell_data[["Churn"]])

# Automated EDA report over all columns except the 1st, 3rd and 4th.
create_report(cell_data[, -c(1, 3, 4)])
```
Univariate visualisation plots with ggplot2 - Histograms
```{r}
library(ggplot2)
# Histograms for four of the independent variables in the dataset.
# Only `binwidth` is passed: the previous calls also supplied `bins = 50`,
# which geom_histogram() disregards whenever `binwidth` is given, so the
# plots are unchanged and the conflicting argument is gone.
ggplot(cell_data, aes(x = AccountWeeks)) +
  geom_histogram(binwidth = 5, color = "Darkorchid", fill = "Aquamarine") # Histogram for AccountWeeks
ggplot(cell_data, aes(x = DayMins)) +
  geom_histogram(binwidth = 7, color = "Darkorchid", fill = "Aquamarine") # Histogram for DayMins
ggplot(cell_data, aes(x = DayCalls)) +
  geom_histogram(binwidth = 3, color = "Darkorchid", fill = "Aquamarine") # Histogram for DayCalls
ggplot(cell_data, aes(x = MonthlyCharge)) +
  geom_histogram(binwidth = 2, color = "Darkorchid", fill = "Aquamarine") # Histogram for MonthlyCharge
```
Univariate visualisation plots with ggplot2 - Density plots
```{r}
# Density curve per variable, with a dashed vertical line at the variable's mean.
ggplot(cell_data, aes(x = AccountWeeks)) +
  geom_density(color = "Orange") +
  geom_vline(
    aes(xintercept = mean(AccountWeeks)),
    color = "Purple", linetype = "dashed", size = 1
  )

ggplot(cell_data, aes(x = DayMins)) +
  geom_density(color = "Orange") +
  geom_vline(
    aes(xintercept = mean(DayMins)),
    color = "Purple", linetype = "dashed", size = 1
  )

ggplot(cell_data, aes(x = DayCalls)) +
  geom_density(color = "Orange") +
  geom_vline(
    aes(xintercept = mean(DayCalls)),
    color = "Purple", linetype = "dashed", size = 1
  )

ggplot(cell_data, aes(x = MonthlyCharge)) +
  geom_density(color = "Orange") +
  geom_vline(
    aes(xintercept = mean(MonthlyCharge)),
    color = "Purple", linetype = "dashed", size = 1
  )
```
Histogram-Density plots
```{r}
# Density-scaled histogram with the density curve overlaid and a dashed line
# at the mean, one plot per variable.
# Only `binwidth` is passed to geom_histogram(): the `bins = 50` that used to
# accompany it is overridden by `binwidth` and therefore had no effect.
# NOTE(review): `..density..` is kept as written; recent ggplot2 releases
# prefer `after_stat(density)` - confirm the installed version before switching.
ggplot(cell_data, aes(x = AccountWeeks)) +
  geom_histogram(aes(y = ..density..), binwidth = 5,
                 color = "Darkorchid", fill = "Aquamarine", alpha = 0.3) +
  geom_density(color = "Darkmagenta", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(AccountWeeks)),
             color = "DarkOrange", linetype = "dashed", size = 1)
ggplot(cell_data, aes(x = DayMins)) +
  geom_histogram(aes(y = ..density..), binwidth = 7,
                 color = "Darkorchid", fill = "Aquamarine", alpha = 0.3) +
  geom_density(color = "Darkmagenta", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(DayMins)),
             color = "DarkOrange", linetype = "dashed", size = 1)
ggplot(cell_data, aes(x = DayCalls)) +
  geom_histogram(aes(y = ..density..), binwidth = 3,
                 color = "Darkorchid", fill = "Aquamarine", alpha = 0.3) +
  geom_density(color = "Darkmagenta", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(DayCalls)),
             color = "DarkOrange", linetype = "dashed", size = 1)
ggplot(cell_data, aes(x = MonthlyCharge)) +
  geom_histogram(aes(y = ..density..), binwidth = 2,
                 color = "Darkorchid", fill = "Aquamarine", alpha = 0.3) +
  geom_density(color = "Darkmagenta", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(MonthlyCharge)),
             color = "DarkOrange", linetype = "dashed", size = 1)
```
Univariate visualisation plots with ggplot2 - Box plots
```{r}
library(dplyr)
# One horizontal box plot per attribute, with the raw observations jittered
# on top of it.
# The three layers are the same for every attribute, so they are defined once
# and added to each plot; `varwidth = TRUE` replaces the shorthand `T`, which
# is an ordinary variable that can be reassigned.
box_layers <- list(
  geom_boxplot(varwidth = TRUE, outlier.colour = "Deepskyblue2", fill = "Darkorchid4"),
  coord_flip(),
  geom_jitter(colour = "Darkorange", width = 0.2, alpha = 0.2)
)
cell_data %>%
  ggplot(aes(x = "", y = AccountWeeks)) +
  box_layers
cell_data %>%
  ggplot(aes(x = "", y = DataUsage)) +
  box_layers
cell_data %>%
  ggplot(aes(x = "", y = DayMins)) +
  box_layers
cell_data %>%
  ggplot(aes(x = "", y = DayCalls)) +
  box_layers
cell_data %>%
  ggplot(aes(x = "", y = MonthlyCharge)) +
  box_layers
cell_data %>%
  ggplot(aes(x = "", y = OverageFee)) +
  box_layers
cell_data %>%
  ggplot(aes(x = "", y = RoamMins)) +
  box_layers
```
Bi-variate analysis and plots - Correlation plot
```{r}
library(corrplot)
# Pairwise correlations between all variables of the dataset.
corr_mat <- cor(x = cell_data)
# Show the matrix rounded to two decimals.
round(corr_mat, digits = 2)
# Draw the matrix with each coefficient printed as a number.
corrplot(corr = corr_mat, method = "number")
```
Bi-variate analysis and plots - Scatter plots
```{r}
library(ggplot2)
# Black-and-white theme with the legend on top for every plot drawn from here on.
theme_set(theme_bw() + theme(legend.position = "top"))

# Layers common to all scatter plots: the points, then a smoother drawn with
# its confidence band at the 0.95 level.
scatter_layers <- list(
  geom_point(color = "#00AFBB", size = 2, shape = 22),
  geom_smooth(method = "auto", se = TRUE, fullrange = FALSE, level = 0.95, col = "DarkOrchid")
)

# Each pair of variables gets its own base plot `b`, completed with the shared layers.
b <- ggplot(cell_data, aes(x = DayMins, y = DayCalls))
b + scatter_layers

b <- ggplot(cell_data, aes(x = DayMins, y = MonthlyCharge))
b + scatter_layers

b <- ggplot(cell_data, aes(x = MonthlyCharge, y = RoamMins))
b + scatter_layers

b <- ggplot(cell_data, aes(x = MonthlyCharge, y = OverageFee))
b + scatter_layers

b <- ggplot(cell_data, aes(x = DayMins, y = RoamMins))
b + scatter_layers

b <- ggplot(cell_data, aes(x = AccountWeeks, y = OverageFee))
b + scatter_layers
```
Outlier detection and identification with grDevices
```{r}
# Outlier detection for continuous variables in our dataset.
# For each variable, keep the `out` component returned by boxplot.stats().
ov1 <- boxplot.stats(cell_data$DataUsage)$out
ov2 <- boxplot.stats(cell_data$DayMins)$out
ov3 <- boxplot.stats(cell_data$DayCalls)$out
ov4 <- boxplot.stats(cell_data$MonthlyCharge)$out
ov5 <- boxplot.stats(cell_data$OverageFee)$out
ov6 <- boxplot.stats(cell_data$RoamMins)$out
# Pool the flagged values of all six variables into one vector.
ov <- c(ov1, ov2, ov3, ov4, ov5, ov6)
library(EnvStats)
# Rosner's test on the pooled values with k = 161.
# `warn = FALSE` replaces the shorthand `F`, which is an ordinary variable
# that can be reassigned.
# NOTE(review): k is hard-coded; confirm it still suits length(ov) if the data change.
rosnerTest(ov, k = 161, warn = FALSE)
```
2. Logistic Regression model
```{r}
# Fix the seed so the same partition is drawn on every run.
set.seed(1000)
library(caTools) # provides sample.split()

# Logical marker per row, splitting the dataset 70-30 on the Churn column.
cell_split <- sample.split(Y = cell_data$Churn, SplitRatio = 0.7)

# Rows flagged TRUE form the training set, the rest the test set.
cell_train <- subset(cell_data, subset = cell_split)
cell_test <- subset(cell_data, subset = !cell_split)

# Distribution of the dependent variable in each partition.
prop.table(table(cell_train$Churn))
prop.table(table(cell_test$Churn))
```
Build the logistic regression model
```{r}
# Fit a binomial GLM of Churn on all other columns of the training data.
model <- glm(formula = Churn ~ ., family = binomial, data = cell_train)
# Print the summary of the fitted model.
summary(model)
```
Checking the predictions based on our model
```{r}
# Predictions for the unseen test data, on the response scale.
pred_test <- predict(model, newdata = cell_test, type = "response")
# View() opens a data viewer and is meant for interactive use only, so it is
# skipped when the document is rendered in a non-interactive session.
if (interactive()) {
  View(pred_test) # View the probability predictions from the model of the test data.
}
```
Checking the confusion matrix for the model
```{r}
# Confusion matrix for the logistic regression model:
# actual Churn in rows, "predicted value above 0.5" in columns.
table(cell_test[["Churn"]], pred_test > 0.5)
```
Model accuracy parameters - Confusion matrix
```{r}
# Evaluation metrics for the logistic regression model at the 0.5 cut-off.
# The counts are taken from the confusion matrix itself rather than typed in by
# hand, so the figures stay correct whenever the seed, split or model changes.
# Assumes Churn is coded 0/1 with 1 as the positive class - confirm against the data.
# Fixing the factor levels keeps the table 2 x 2 even if one class is never predicted.
lr_cm <- table(
  factor(cell_test$Churn, levels = c(0, 1)),
  factor(pred_test > 0.5, levels = c(FALSE, TRUE))
)
lr_tn <- lr_cm[1, 1] # actual 0, predicted 0
lr_fp <- lr_cm[1, 2] # actual 0, predicted 1
lr_fn <- lr_cm[2, 1] # actual 1, predicted 0
lr_tp <- lr_cm[2, 2] # actual 1, predicted 1
(lr_tn + lr_tp) / nrow(cell_test) # Accuracy of the model
(lr_fn + lr_fp) / nrow(cell_test) # Classification error of the model
lr_tp / (lr_tp + lr_fn) # Sensitivity of the model OR True positive rate
lr_tn / (lr_tn + lr_fp) # Specificity of the model OR True negative rate
lr_fp / (lr_fp + lr_tn) # False positive rate
lr_fn / (lr_fn + lr_tp) # False negative rate
lr_tp / (lr_tp + lr_fp) # Precision of the model OR Positive predicted value
```
AUC - Area under the curve
```{r}
library(ROCR)
# Pair the predicted values with the observed Churn labels.
ROCPred <- prediction(predictions = pred_test, labels = cell_test$Churn)
# Extract the AUC measure from the performance object as a plain number.
as.numeric(slot(performance(ROCPred, measure = "auc"), "y.values"))
```
Performance plot - ROC curve
```{r}
# True positive rate plotted against false positive rate.
cell_perf <- performance(ROCPred, measure = "tpr", x.measure = "fpr")
plot(cell_perf)

library(InformationValue)
# Optimal probability cut-off for the test-set predictions.
optimalCutoff(
  actuals = cell_test$Churn,
  predictedScores = pred_test
)
# ROC curve as drawn by InformationValue.
plotROC(
  actuals = cell_test$Churn,
  predictedScores = pred_test
)
```
Model evaluation metrics
```{r}
# Gini coefficient
# Computed as (AUC - 0.5) / 0.5 from the AUC of the current predictions rather
# than from a hand-copied AUC value, so it follows the fitted model; the result
# is also printed, which the previous assignment alone did not do.
auc_value <- as.numeric(
  ROCR::performance(ROCR::prediction(pred_test, cell_test$Churn), "auc")@y.values
)
gini_index <- (auc_value - 0.5) / 0.5
gini_index
# Concordance and discordance of the model
Concordance(actuals = cell_test$Churn, predictedScores = pred_test)
# KS statistic and KS plot
ks_stat <- InformationValue::ks_stat(actuals = cell_test$Churn, predictedScores = pred_test)
ks_stat
ks_plot(actuals = cell_test$Churn, predictedScores = pred_test)
```
3. KNN - K nearest neighbours
```{r}
# We will first normalize the data with the help of the following function.
# Min-max rescaling: maps the smallest value of `x` to 0 and the largest to 1.
# A vector whose values are all equal has zero range; it is returned as all
# zeros instead of the NaN values that 0 / 0 would produce.
norm <- function(x) {
  lowest <- min(x)
  span <- max(x) - lowest
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - lowest) / span
}
# Apply the normalization to every column except the first (the dependent variable).
cell_norm <- as.data.frame(lapply(cell_data[, -1], norm))
# Merge the normalized data with the untouched dependent variable.
cell_norm_data <- cbind(cell_data[, 1], cell_norm)
# View() opens a data viewer and is meant for interactive use only, so both
# calls are skipped when the document is rendered in a non-interactive session.
if (interactive()) {
  View(cell_norm) # View the contents of the normalized predictors
  View(cell_norm_data) # View the contents of the newly formed complete dataset
}
```
Partitioning the data again in train and test for KNN - with normalized values
```{r}
# Logical marker per row, splitting the normalized dataset 70-30 on Churn.
cell_split_KNN <- sample.split(Y = cell_norm_data$Churn, SplitRatio = 0.7)

# Rows flagged TRUE form the training set, the rest the test set.
cell_train_KNN <- subset(cell_norm_data, subset = cell_split_KNN)
cell_test_KNN <- subset(cell_norm_data, subset = !cell_split_KNN)

# Distribution of the dependent variable in each partition.
prop.table(table(cell_train_KNN$Churn))
prop.table(table(cell_test_KNN$Churn))
```
Building KNN - classifier
```{r}
library(class)
# Starting with k = 19. Column 1 supplies the class labels; all remaining
# columns are used as predictors.
cell_pred_KNN <- knn(
  train = cell_train_KNN[, -1],
  test = cell_test_KNN[, -1],
  cl = cell_train_KNN[, 1],
  k = 19
)
# Confusion matrix for the KNN model: actual labels against predicted labels.
pred_KNN <- table(cell_test_KNN[, 1], cell_pred_KNN)
pred_KNN
# Accuracy of the KNN model: the diagonal's share of all test rows.
sum(diag(pred_KNN) / sum(pred_KNN))
```
Re-iterate the KNN model with different inputs of k
```{r}
# Same classifier as before, re-run from a fixed seed with a different k.
set.seed(1000)
# With k = 13 we get best accuracy.
cell_pred_KNN <- knn(
  train = cell_train_KNN[, -1],
  test = cell_test_KNN[, -1],
  cl = cell_train_KNN[, 1],
  k = 13
)
# Confusion matrix for the KNN model: actual labels against predicted labels.
pred_KNN <- table(cell_test_KNN[, 1], cell_pred_KNN)
pred_KNN
# Accuracy of the KNN model: the diagonal's share of all test rows.
sum(diag(pred_KNN) / sum(pred_KNN))
```
KNN-model evaluation metrics
```{r}
# Evaluation metrics for the KNN model.
# The counts are taken from the KNN predictions rather than typed in by hand,
# and the denominators use the KNN test set (the previous code divided by the
# row count of the logistic-regression test set).
# Assumes Churn is coded 0/1 with 1 as the positive class - confirm against the data.
# Fixing the factor levels keeps the table 2 x 2 even if one class is never predicted.
knn_cm <- table(
  factor(cell_test_KNN[, 1], levels = c(0, 1)),
  factor(cell_pred_KNN, levels = c("0", "1"))
)
knn_tn <- knn_cm[1, 1] # actual 0, predicted 0
knn_fp <- knn_cm[1, 2] # actual 0, predicted 1
knn_fn <- knn_cm[2, 1] # actual 1, predicted 0
knn_tp <- knn_cm[2, 2] # actual 1, predicted 1
(knn_tn + knn_tp) / nrow(cell_test_KNN) # Accuracy of the model
(knn_fn + knn_fp) / nrow(cell_test_KNN) # Classification error of the model
knn_tp / (knn_tp + knn_fn) # Sensitivity of the model OR True positive rate
knn_tn / (knn_tn + knn_fp) # Specificity of the model OR True negative rate
knn_fp / (knn_fp + knn_tn) # False positive rate
knn_fn / (knn_fn + knn_tp) # False negative rate
knn_tp / (knn_tp + knn_fp) # Precision of the model OR Positive predicted value
```
KS-plot, concordance, Gini for KNN
```{r}
# KS plot and concordance need numeric scores, but knn() returns a factor of
# class labels, on which ordering comparisons are not meaningful.
# The k = 13 classifier is therefore re-run with `prob = TRUE`, which attaches
# the winning class's share of the votes as attribute "prob"; that share is
# turned into a score for the positive class.
# Assumes the positive class is labelled 1 - confirm against the data.
set.seed(1000)
knn_votes <- knn(
  train = cell_train_KNN[, -1],
  test = cell_test_KNN[, -1],
  cl = cell_train_KNN[, 1],
  k = 13,
  prob = TRUE
)
winning_share <- attr(knn_votes, "prob")
knn_churn_score <- ifelse(knn_votes == "1", winning_share, 1 - winning_share)
ks_plot(actuals = cell_test_KNN$Churn, predictedScores = knn_churn_score)
Concordance(actuals = cell_test_KNN$Churn, predictedScores = knn_churn_score)
```
4. Naive Bayes
```{r}
library(e1071)
#set.seed(1000)
# Fit a Naive Bayes classifier of Churn on all other columns of the normalized
# training data, then predict a class for every test row.
cell_NB <- naiveBayes(Churn ~ ., data = cell_train_KNN)
pred_NB <- predict(cell_NB, cell_test_KNN, type = "class")
# View() opens a data viewer and is meant for interactive use only, so it is
# skipped when the document is rendered in a non-interactive session.
if (interactive()) {
  View(pred_NB)
}
# Confusion matrix (actual labels against predicted labels) and accuracy.
cmat_NB <- table(cell_test_KNN[, 1], pred_NB)
sum(diag(cmat_NB) / sum(cmat_NB))
```