-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project_7.Rmd
271 lines (240 loc) · 11.3 KB
/
Project_7.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
---
title: "Web & Social Media Analytics"
output: html_notebook
---
Import the required libraries
```{r}
library(tm) # Library for text mining
#install the required libraries
#install.packages("twitteR")
library(twitteR) # For accessing Twitter API
#install.packages("ROAuth")
library(ROAuth) # For fetching tweets of Twitter
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
#install.packages("sentimentr")
library(sentimentr)
library(RCurl)
library(syuzhet)
library(RColorBrewer)
```
Importing the dataset
```{r}
# Read the dataset from the working directory; the first CSV row supplies the column names.
# `TRUE` is written out because the shorthand `T` is an ordinary variable that can be reassigned.
web_data <- read.csv("Dataset.csv", header = TRUE)
head(web_data) # Preview the first few rows of the dataset
str(web_data) # Show each column's type and a sample of its values
# Optional missing-value handling, intentionally left disabled:
# web_data[web_data==""] <- NA # Convert the empty strings in the dataset to NA
#library(DataExplorer)
#plot_missing(web_data) # Identify missing values from the dataset
#web_data <- na.omit(web_data) # Remove observations where data is absent
```
Creating and manipulating the text in the corpus
```{r}
# Build a volatile corpus from the description column only:
# the column is wrapped as a vector source and then handed to VCorpus().
corpus <- web_data$description %>%
  VectorSource() %>%
  VCorpus()
```
Manipulating the text in the corpus
```{r}
# Clean the corpus as one pipeline; every stage receives the output of the previous one.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>% # lower-case all text
  tm_map(content_transformer(removePunctuation)) %>% # drop punctuation
  tm_map(content_transformer(removeWords), c("and", "the", stopwords("english"))) %>% # drop stopwords and articles
  tm_map(content_transformer(stripWhitespace)) %>% # collapse extra whitespace
  tm_map(content_transformer(stemDocument)) # stem the remaining words

# Eight colours from the ColorBrewer "Set1" palette, shared by both word clouds
pal <- brewer.pal(n = 8, name = "Set1")

# Word clouds of the cleaned corpus, capped at 50 and then 100 words
wordcloud(corpus, max.words = 50, colors = pal)
wordcloud(corpus, max.words = 100, colors = pal)
```
Creating a Document Term Matrix (DTM)
```{r}
# Build the document-term matrix from the cleaned corpus;
# the surrounding parentheses make the assignment print its value.
(dtm <- DocumentTermMatrix(corpus))
```
Create a dataframe with the required variables
```{r}
# Densify the document-term matrix and turn it into a data frame of term columns
df <- dtm %>%
  as.matrix() %>%
  as.data.frame()
# Make every term a syntactically valid column name
colnames(df) <- make.names(colnames(df))
# Attach the dependent variable taken from the raw dataset
df$deal <- web_data$deal
# Count the observations per deal outcome
table(df$deal)
```
Build a CART model
```{r}
library(rpart)
library(rpart.plot)
# Fit a classification tree: deal is the response, every other column of df is a predictor
web_cart <- rpart(
  deal ~ .,
  data = df,
  method = "class"
)
# Draw the fitted tree twice: with prp() (extra = 2) and with rpart.plot() defaults
prp(web_cart, extra = 2)
rpart.plot(web_cart)
```
Evaluate the performance of the CART model
```{r}
# Evaluate the first CART model on the data it was trained on (in-sample).
# predict() for rpart takes `newdata`; the previous `data = df` was not a real
# argument and was silently ignored.
pred_cart <- predict(web_cart, newdata = df, type = "class")

# Confusion matrix: rows are the actual deal values, columns the predicted classes.
# The indexing below assumes row 2 is the positive class and the columns are
# labelled "FALSE" and "TRUE" -- TODO confirm against the printed table.
first_cart <- table(df$deal, pred_cart)
first_cart

accuracy_cart <- sum(diag(first_cart)) / sum(first_cart) # Share of correct predictions
error_cart <- 1 - accuracy_cart # Classification error
sensitivity_cart <- first_cart[2, "TRUE"] / (first_cart[2, "TRUE"] + first_cart[2, "FALSE"]) # True positive rate
specificity_cart <- first_cart[1, "FALSE"] / (first_cart[1, "FALSE"] + first_cart[1, "TRUE"]) # True negative rate
precision_cart <- first_cart[2, "TRUE"] / (first_cart[2, "TRUE"] + first_cart[1, "TRUE"]) # Correct share of predicted positives
prevalance_cart <- (first_cart[2, "FALSE"] + first_cart[2, "TRUE"]) / sum(first_cart) # Share of actual positives

# Print each metric
accuracy_cart
error_cart
sensitivity_cart
specificity_cart
precision_cart
prevalance_cart
```
Build a Random Forest model
```{r}
library(randomForest)
# Fix the RNG state so the forest is identical on every run
set.seed(123)
# Random forest with deal as the response and all remaining columns of df as predictors
web_rf <- randomForest(
  deal ~ .,
  data = df
)
```
Evaluate the performance of the RF model
```{r}
# Evaluate the first random forest model.
# predict() for randomForest takes `newdata`; the previous `data = df` was not a
# real argument and was silently ignored. Without `newdata` the out-of-bag
# predictions are returned, and that behaviour is kept here.
pred_RF <- predict(web_rf)

# Confusion matrix: rows are the actual deal values, columns whether the
# prediction reaches the 0.5 cutoff. Fixing the factor levels keeps both the
# "FALSE" and "TRUE" columns even when only one class is ever predicted.
# The indexing below assumes row 2 is the positive class -- TODO confirm.
first_RF <- table(df$deal, factor(pred_RF >= 0.5, levels = c(FALSE, TRUE)))
first_RF

accuracy_RF <- sum(diag(first_RF)) / sum(first_RF) # Share of correct predictions
error_RF <- 1 - accuracy_RF # Classification error
sensitivity_RF <- first_RF[2, "TRUE"] / (first_RF[2, "TRUE"] + first_RF[2, "FALSE"]) # True positive rate
specificity_RF <- first_RF[1, "FALSE"] / (first_RF[1, "FALSE"] + first_RF[1, "TRUE"]) # True negative rate
precision_RF <- first_RF[2, "TRUE"] / (first_RF[2, "TRUE"] + first_RF[1, "TRUE"]) # Correct share of predicted positives
prevalance_RF <- (first_RF[2, "FALSE"] + first_RF[2, "TRUE"]) / sum(first_RF) # Share of actual positives

# Print each metric
accuracy_RF
error_RF
sensitivity_RF
specificity_RF
precision_RF
prevalance_RF

# Variable importance plot for the RF model (importance measure type 2)
varImpPlot(web_rf, main = 'Variable Importance Plot: Shark Tank', type = 2)
```
Build a Logistic Regression model
```{r}
set.seed(123) # Kept from the original chunk; glm() itself does not use the RNG
# Logistic regression with deal as the response and all other columns of df as predictors.
# family = binomial is required: without it glm() fits a gaussian (ordinary linear) model.
web_LR <- glm(deal ~ ., data = df, family = binomial)
```
Evaluate the performance of Logistic Regression model
```{r}
# In-sample fitted probabilities. type = "response" puts the predictions on the
# probability scale so that the 0.5 cutoff is meaningful; the default is the
# log-odds scale. The previous `data = df` was not a real predict() argument
# and was silently ignored, so it is dropped.
pred_LR <- predict(web_LR, type = "response")

# Confusion matrix: rows are the actual deal values, columns whether the
# predicted probability exceeds 0.5. Fixing the factor levels keeps both the
# "FALSE" and "TRUE" columns even when only one class is ever predicted.
# The indexing below assumes row 2 is the positive class -- TODO confirm.
first_LR <- table(df$deal, factor(pred_LR > 0.5, levels = c(FALSE, TRUE)))
first_LR

accuracy_LR <- sum(diag(first_LR)) / sum(first_LR) # Share of correct predictions
error_LR <- 1 - accuracy_LR # Classification error
sensitivity_LR <- first_LR[2, "TRUE"] / (first_LR[2, "TRUE"] + first_LR[2, "FALSE"]) # True positive rate
specificity_LR <- first_LR[1, "FALSE"] / (first_LR[1, "FALSE"] + first_LR[1, "TRUE"]) # True negative rate
precision_LR <- first_LR[2, "TRUE"] / (first_LR[2, "TRUE"] + first_LR[1, "TRUE"]) # Correct share of predicted positives
prevalance_LR <- (first_LR[2, "FALSE"] + first_LR[2, "TRUE"]) / sum(first_LR) # Share of actual positives

# Print each metric
accuracy_LR
error_LR
sensitivity_LR
specificity_LR
precision_LR
prevalance_LR
```
Binding the data frame with ratio variable
```{r}
# New predictor: the amount asked for divided by the valuation, row by row
df$ratio <- web_data$askedFor / web_data$valuation
```
Re-run CART model
```{r}
# Refit the classification tree now that df also carries the ratio column
web_cart_R <- rpart(
  deal ~ .,
  data = df,
  method = "class"
)
# Draw the refitted tree with prp() (extra = 2) and with rpart.plot() defaults
prp(web_cart_R, extra = 2)
rpart.plot(web_cart_R)
```
Re-Evaluate performance of the CART model
```{r}
# Evaluate the CART model that includes the ratio variable, on its training data (in-sample).
# predict() for rpart takes `newdata`; the previous `data = df` was not a real
# argument and was silently ignored.
pred_cart_R <- predict(web_cart_R, newdata = df, type = "class")

# Confusion matrix: rows are the actual deal values, columns the predicted classes.
# The indexing below assumes row 2 is the positive class and the columns are
# labelled "FALSE" and "TRUE" -- TODO confirm against the printed table.
final_cart <- table(df$deal, pred_cart_R)
final_cart

accuracy_cart_R <- sum(diag(final_cart)) / sum(final_cart) # Share of correct predictions
error_cart_R <- 1 - accuracy_cart_R # Classification error
sensitivity_cart_R <- final_cart[2, "TRUE"] / (final_cart[2, "TRUE"] + final_cart[2, "FALSE"]) # True positive rate
specificity_cart_R <- final_cart[1, "FALSE"] / (final_cart[1, "FALSE"] + final_cart[1, "TRUE"]) # True negative rate
precision_cart_R <- final_cart[2, "TRUE"] / (final_cart[2, "TRUE"] + final_cart[1, "TRUE"]) # Correct share of predicted positives
prevalance_cart_R <- (final_cart[2, "FALSE"] + final_cart[2, "TRUE"]) / sum(final_cart) # Share of actual positives

# Print each metric
accuracy_cart_R
error_cart_R
sensitivity_cart_R
specificity_cart_R
precision_cart_R
prevalance_cart_R
```
Re-run RF model
```{r}
# Seed the RNG exactly as the first random forest fit does, so this chunk
# produces the same forest whenever it is run, including on its own.
set.seed(123)
# Random forest refit on df, which now also contains the ratio variable
web_rf_R <- randomForest(deal ~ ., data = df)
```
Re-Evaluate performance of the RF model
```{r}
# Evaluate the random forest model that includes the ratio variable.
# predict() for randomForest takes `newdata`; the previous `data = df` was not a
# real argument and was silently ignored. Without `newdata` the out-of-bag
# predictions are returned, and that behaviour is kept here.
pred_RF_R <- predict(web_rf_R)

# Confusion matrix: rows are the actual deal values, columns whether the
# prediction reaches the 0.5 cutoff. Fixing the factor levels keeps both the
# "FALSE" and "TRUE" columns even when only one class is ever predicted.
# The indexing below assumes row 2 is the positive class -- TODO confirm.
final_RF <- table(df$deal, factor(pred_RF_R >= 0.5, levels = c(FALSE, TRUE)))
final_RF

accuracy_RF_R <- sum(diag(final_RF)) / sum(final_RF) # Share of correct predictions
error_RF_R <- 1 - accuracy_RF_R # Classification error
sensitivity_RF_R <- final_RF[2, "TRUE"] / (final_RF[2, "TRUE"] + final_RF[2, "FALSE"]) # True positive rate
specificity_RF_R <- final_RF[1, "FALSE"] / (final_RF[1, "FALSE"] + final_RF[1, "TRUE"]) # True negative rate
precision_RF_R <- final_RF[2, "TRUE"] / (final_RF[2, "TRUE"] + final_RF[1, "TRUE"]) # Correct share of predicted positives
prevalance_RF_R <- (final_RF[2, "FALSE"] + final_RF[2, "TRUE"]) / sum(final_RF) # Share of actual positives

# Print each metric
accuracy_RF_R
error_RF_R
sensitivity_RF_R
specificity_RF_R
precision_RF_R
prevalance_RF_R

# Variable importance plot for the RF model with the ratio variable (importance measure type 2)
varImpPlot(web_rf_R, main = 'Variable Importance Plot with Ratio: Shark Tank', type = 2)
```
Re-Run LR model
```{r}
# Logistic regression refit on df, which now also contains the ratio variable.
# family = binomial is required: without it glm() fits a gaussian (ordinary linear) model.
web_LR_R <- glm(deal ~ ., data = df, family = binomial)
```
Evaluate the performance of Logistic Regression model
```{r}
# In-sample fitted probabilities. type = "response" puts the predictions on the
# probability scale so that the 0.5 cutoff is meaningful; the default is the
# log-odds scale. The previous `data = df` was not a real predict() argument
# and was silently ignored, so it is dropped.
pred_LR_R <- predict(web_LR_R, type = "response")

# Confusion matrix: rows are the actual deal values, columns whether the
# predicted probability exceeds 0.5. Fixing the factor levels keeps both the
# "FALSE" and "TRUE" columns even when only one class is ever predicted.
# The indexing below assumes row 2 is the positive class -- TODO confirm.
final_LR <- table(df$deal, factor(pred_LR_R > 0.5, levels = c(FALSE, TRUE)))
final_LR

accuracy_LR_R <- sum(diag(final_LR)) / sum(final_LR) # Share of correct predictions
# The error must come from this model's own accuracy; it was previously
# computed from accuracy_LR, the accuracy of the model without the ratio variable.
error_LR_R <- 1 - accuracy_LR_R
sensitivity_LR_R <- final_LR[2, "TRUE"] / (final_LR[2, "TRUE"] + final_LR[2, "FALSE"]) # True positive rate
specificity_LR_R <- final_LR[1, "FALSE"] / (final_LR[1, "FALSE"] + final_LR[1, "TRUE"]) # True negative rate
precision_LR_R <- final_LR[2, "TRUE"] / (final_LR[2, "TRUE"] + final_LR[1, "TRUE"]) # Correct share of predicted positives
prevalance_LR_R <- (final_LR[2, "FALSE"] + final_LR[2, "TRUE"]) / sum(final_LR) # Share of actual positives

# Print each metric
accuracy_LR_R
error_LR_R
sensitivity_LR_R
specificity_LR_R
precision_LR_R
prevalance_LR_R
```
Summary of Models Before and After
```{r}
library(formattable)
library(data.table)

# Labels shared by all six summary tables, in the order the metrics are combined
metric_labels <- c("Accuracy", "Error", "Sensitivity", "Specificity", "Precision", "Prevalance")

# CART: metrics before and after the ratio variable was added
CART_Before <- as.table(c(
  accuracy_cart, error_cart, sensitivity_cart,
  specificity_cart, precision_cart, prevalance_cart
))
names(CART_Before) <- metric_labels
CART_After <- as.table(c(
  accuracy_cart_R, error_cart_R, sensitivity_cart_R,
  specificity_cart_R, precision_cart_R, prevalance_cart_R
))
names(CART_After) <- metric_labels
# Print the CART comparison
CART_Before
CART_After

# Random forest: metrics before and after the ratio variable was added
RF_Before <- as.table(c(
  accuracy_RF, error_RF, sensitivity_RF,
  specificity_RF, precision_RF, prevalance_RF
))
names(RF_Before) <- metric_labels
RF_After <- as.table(c(
  accuracy_RF_R, error_RF_R, sensitivity_RF_R,
  specificity_RF_R, precision_RF_R, prevalance_RF_R
))
names(RF_After) <- metric_labels
# Print the RF comparison
RF_Before
RF_After

# Logistic regression: metrics before and after the ratio variable was added
LR_Before <- as.table(c(
  accuracy_LR, error_LR, sensitivity_LR,
  specificity_LR, precision_LR, prevalance_LR
))
names(LR_Before) <- metric_labels
LR_After <- as.table(c(
  accuracy_LR_R, error_LR_R, sensitivity_LR_R,
  specificity_LR_R, precision_LR_R, prevalance_LR_R
))
names(LR_After) <- metric_labels
# Print the LR comparison
LR_Before
LR_After
```