-
Notifications
You must be signed in to change notification settings - Fork 0
/
A3.R
181 lines (147 loc) · 7.72 KB
/
A3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
library(naivebayes)
library(foreign)
library("dbscan")
library(reshape2)
library(ggplot2)
#install.packages("plot3D")
library("plot3D")
library(plotly)
# Load the Stamps data set ----
# NOTE(review): the working directory is hard-coded to one user's machine;
# the script only runs elsewhere after this path is adjusted.
getwd()
setwd("C:\\Users\\nick.moellers\\OneDrive - James Cook University\\GraduateDiploma\\MA5810-DataMining\\A3")
Stamps <- read.table(
  file = "Stamps_withoutdupl_09.csv",
  header = FALSE,
  sep = ",",
  dec = "."
)
# 9 predictors (V1 to V9) and class labels (V10)
summary(Stamps)
# Columns 1-9: the predictors (V1 to V9)
PB_Predictors <- Stamps[, seq_len(9)]
# Column 10: class labels (V10), recoded so that
# inliers (class "no") = 0 and outliers (class "yes") = 1
PB_class <- ifelse(Stamps[, 10] == "no", 0, 1)
# Min-max rescale a numeric vector to the interval [0, 1].
#
# Args:
#   x:     A numeric vector.
#   na.rm: If TRUE, missing values are ignored when finding the range and
#          stay NA in the result. Defaults to FALSE, in which case any NA
#          in `x` makes the whole result NA.
#
# Returns:
#   A vector the same length as `x`. A vector whose values are all equal is
#   mapped to zeros rather than NaN (0 / 0).
normalise <- function(x, na.rm = FALSE) {
  # Nothing to scale: avoid the min/max warnings on empty input
  if (length(x) == 0L || (na.rm && all(is.na(x)))) {
    return(x)
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Constant input: dividing by a zero range would give NaN
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Build a copy of the predictors in which every column has been passed
# through normalise()
PB_Predictors.normal <- PB_Predictors
PB_Predictors.normal[] <- lapply(PB_Predictors, normalise)
summary(PB_Predictors.normal)
#naive_bayes(version)
# Activity 1: Principal Component Analysis ----
# 1. Perform Principal Component Analysis (PCA) on the Stamps data in the
#    9-dimensional space of the numerical predictors.
# scale. = TRUE scales each predictor to unit variance before the analysis.
pcaStamps <- prcomp(PB_Predictors, scale. = TRUE)
pcaStamps
# Proportion of Variance Explained (PVE) for each resulting principal
# component: each component's variance divided by the total variance.
(PVE <- (pcaStamps$sdev^2) / sum(pcaStamps$sdev^2))
# Plot the accumulated sum of PVE for the first m components as a function
# of m.
cumPVE <- cumsum(PVE)
# ggplot() replaces the deprecated qplot(); the x axis follows the number of
# components instead of a hard-coded 1:9.
plotCumPVE <- ggplot(
  data.frame(component = seq_along(cumPVE), cum_pve = cumPVE),
  aes(x = component, y = cum_pve)
) +
  geom_point() +
  geom_line() +
  xlab("Principal Component") +
  ylab(NULL) +
  ggtitle("Cumulative Sum PVE Plot") +
  ylim(0, 1)
#grid.arrange(PVEplot, cumPVE, ncol = 2)
plotCumPVE
#cumsum(PVE)
#(a) How many components do we need to explain 90% or more of the total variance?
##We need 6 components to explain 93% of the total variance. If we had only 5 components, it would be 87%.
#(b) How much of the total variance is explained by the first three components?
##68.89% is explained by the first three components.
# 2. Plot a 3D scatter-plot of the Stamps data as represented by the first
#    three principal components computed in the previous item
#    (x = PC1, y = PC2, z = PC3).
## Get answer from Stuart
# Scores on the first three principal components
pcaStamps.data3D <- as.data.frame(pcaStamps$x[, c("PC1", "PC2", "PC3")])
# Class labels as a factor so plotly colours the points by class
PB_class <- as.factor(PB_class)
scatter.data <- data.frame(
  pcaStamps.data3D,
  PB_class = PB_class,
  check.names = FALSE
)

# Plotly version: interactive plot coloured by class label
plotlyResult <- plot_ly(
  data = scatter.data,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~PB_class
)
plotlyResult

# scatter3D version: the same points drawn from three viewing angles
plotResult <- scatter3D(
  x = pcaStamps.data3D$PC1,
  y = pcaStamps.data3D$PC2,
  z = pcaStamps.data3D$PC3,
  phi = 40, theta = 40
)
plotResult2 <- scatter3D(
  x = pcaStamps.data3D$PC1,
  y = pcaStamps.data3D$PC2,
  z = pcaStamps.data3D$PC3,
  phi = 20, theta = 80
)
plotResult3 <- scatter3D(
  x = pcaStamps.data3D$PC1,
  y = pcaStamps.data3D$PC2,
  z = pcaStamps.data3D$PC3,
  phi = 0, theta = 10
)
#do the outliers (forged stamps) look easy to detect in an unsupervised
#way, assuming that the 3D visualisation of the data via PCA is a reasonable representation of the data in
#full space? How about in a supervised way? Why? Justify your answers.
## The outliers look easy to detect in an unsupervised way: the majority of stamps are clustered closely together, whereas those outside the cluster
## vary significantly in value. Anything outside an acceptable range is therefore an outlier; that is, its values deviate from the normal generating mechanism of the cluster.
## In a supervised way it would also be easy to identify forged stamps, as they would generally be very similar to those marked as forged in the training data.
#Activity 2: Unsupervised outlier detection
# Perform unsupervised outlier detection on the Stamps data in the 9-dimensional space of the numerical predictors (PB_Predictors),
# using KNN Outlier with different values of the parameter k (at least the following three: k = 5, 25, 100).
# For each k, produce the same 3D PCA visualisation of the data as in Activity 1 (PCA),
# - use the resulting KNN Outlier Scores as a continuous, diverging colour scale.
# -- for each k, produce a second plot
# --- RED: top-31 outliers according to the KNN Outlier Scores
# --- BLACK: other points
# Do these plots give you any insights on the values of k that look more or less
# appropriate from an unsupervised perspective (ignoring the class labels)? Justify your answer.
library(dbscan)
k <- 5 # KNN parameter
# KNN distance (outlier score) computation: distance from each observation
# to its k-th nearest neighbour.
KNN_Outlier <- kNNdist(x = PB_Predictors, k = k)
# Depending on the installed dbscan version, kNNdist() returns either a
# matrix with one column per neighbour or just the vector of k-th neighbour
# distances; indexing a vector with [, k] would fail, so only index a matrix.
if (is.matrix(KNN_Outlier)) {
  KNN_Outlier <- KNN_Outlier[, k]
}
# See ?kNNdist for details (the interactive help() call was removed so the
# script can run unattended).
# The following code sorts the observations according to their KNN outlier
# scores and displays the top `top_n` outliers along with their scores:
top_n <- 31 # No. of top outliers to be displayed
rank_KNN_Outlier <- order(KNN_Outlier, decreasing = TRUE) # Sorting (descending)
KNN_Result <- data.frame(
  ID = rank_KNN_Outlier,
  score = KNN_Outlier[rank_KNN_Outlier]
)
head(KNN_Result, top_n)
KNN_Result
# Repeat the KNN outlier scoring for every candidate value of k
kay <- c(5, 25, 100)
n <- length(kay)
top_n <- 31 # No. of top outliers to be displayed
for (var in seq_along(kay)) {
  # Use the single k for this iteration; passing the whole vector `kay`
  # as `k` (and as a column index) scored nothing per-k.
  k_val <- kay[var]
  print(k_val)
  KNN_Outlier <- kNNdist(x = PB_Predictors, k = k_val)
  # kNNdist() may return a matrix (one column per neighbour) or a plain
  # vector depending on the dbscan version; keep the k-th neighbour distance.
  if (is.matrix(KNN_Outlier)) {
    KNN_Outlier <- KNN_Outlier[, k_val]
  }
  rank_KNN_Outlier <- order(KNN_Outlier, decreasing = TRUE) # Sorting (descending)
  KNN_Result <- data.frame(
    ID = rank_KNN_Outlier,
    score = KNN_Outlier[rank_KNN_Outlier]
  )
  print(head(KNN_Result, 3))
  #KNN_Result
  # Title each plot with its k so the three plots can be told apart
  plot(KNN_Result, main = paste("KNN outlier scores, k =", k_val))
}
# Note: you have to use the knn.cv function, then a helper function called rocplot (or something like that)
# - it's not in a package; it's one you have to define yourself.
# Basically, just copy and paste it from the course notes,
# then use the output of the knn.cv function as input to the rocplot function,
# which will give you the AUC for each k. I did NOT plot the ROC curves, because they weren't asked for. I gave the AUC for each k (which I'm pretty sure is what they're asking for).
#Activity 3
# In this third activity you are asked to:
# 1. Perform supervised classification of the Stamps data, using a KNN classifier with the same values of k
# as used in Activity 2 (unsupervised outlier detection). For each classifier (that is, each value of k),
# compute the Area Under the Curve ROC (AUC-ROC) in a Leave-One-Out Cross-Validation (LOOCV)
# scheme.
library(class)
library(ROCR)
# Plot the ROC curve for a set of scores and return the area under it.
#
# Args:
#   pred:  Scores or probabilities, one per observation, passed to
#          ROCR's prediction().
#   truth: The true class labels, one per observation.
#
# Returns:
#   The Area Under the ROC Curve (AUC). The ROC curve is drawn as a side
#   effect.
rocplot <- function(pred, truth) {
  predobj <- prediction(pred, truth)
  ROC <- performance(predobj, "tpr", "fpr")
  plot(ROC) # Plot the ROC curve
  auc <- performance(predobj, measure = "auc")
  # The AUC value is held in the y.values slot of the performance object
  # (the original line was garbled into an invalid expression).
  auc <- auc@y.values[[1]]
  auc # Return the Area Under the Curve ROC
}
# Supervised KNN classification under leave-one-out cross-validation
# (knn.cv), scored by AUC-ROC for each value of k.
kay <- c(5, 25, 100)
n <- length(kay)
AUC <- numeric(n) # One AUC per k, preallocated
names(AUC) <- paste0("k=", kay)
for (var in seq_along(kay)) {
  # prob = TRUE is required: without it the "prob" attribute read below
  # does not exist.
  Pred_class <- knn.cv(
    train = PB_Predictors,
    cl = PB_class,
    k = kay[var],
    prob = TRUE
  )
  Pred_prob <- attr(Pred_class, "prob")
  # "prob" is the vote share of the winning class; convert it so it is always
  # the probability of the outlier class, which is coded as 1 in PB_class.
  Pred_prob <- ifelse(Pred_class == "1", Pred_prob, 1 - Pred_prob)
  AUC[var] <- rocplot(pred = Pred_prob, truth = PB_class)
}
AUC
# 2. Compare the resulting (supervised) KNN classification performance for each value of k, against the
# classification performance obtained in an unsupervised way by the KNN Outlier method with the same
# value of k. Notice that, if we rescale the KNN Outlier Scores (obtained in Activity 2 (unsupervised outlier
# detection)) into the [0, 1] interval, these scores can be interpreted as outlier probabilities, which can
# then be compared with the class labels (ground truth) in PB_class to compute an AUC-ROC value. This
# way, for each value of k (k = 5, 25, 100), the AUC-ROC of the supervised KNN classifier can be compared with the
# AUC-ROC of KNN Outlier as an unsupervised classifier. Compare the performances of the supervised
# versus unsupervised classifiers and discuss the results. For example, recalling that the supervised
# method makes use of the class labels, whereas the unsupervised method doesn't, what can you
# conclude considering there are applications where class labels are not available?