# finalcode.py
# Apply PCA to the Optdigits dataset, keep the principal components that explain 90% of the variance, cluster with K-means (both with and without PCA), and report the accuracy.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random as rd
optdigits_df = pd.read_csv('optdigits_csv.csv')  # Load the dataset from the csv file
data_features = optdigits_df.iloc[:, :64]  # The first 64 columns contain the pixel values
data_labels = optdigits_df.iloc[:, 64]  # The last column contains the digit label for each row
first_image = data_features.iloc[0, :].to_numpy().reshape(8, 8)  # Reshape the first row into the original 8x8 image and visualise it
plt.imshow(first_image)  # Display the image
print("Label of this first image:", int(data_labels.iloc[0]))  # Print the label associated with the first image
b=optdigits_df.describe() #Describe the dataset; its per-column maximum is used below to plot a bar graph that visualises the range of values the dataset covers
trans=b.T
fig=plt.figure(figsize=(20,10))
plt.xlabel('Inputs')
plt.ylabel('Maximum')
plt.title('Maximum of Inputs')
print(trans.describe())
trans['max'].plot(kind='bar')
b=optdigits_df.describe() #Here we are describing the dataset and then taking the mean of the columns and plotting it in a bar graph. This helps visualise the mean values across all the columns in our dataset
trans=b.T
fig=plt.figure(figsize=(20,10))
plt.xlabel('Inputs')
plt.ylabel('Mean')
plt.title('Mean of inputs')
print(trans.describe())
trans['mean'].plot(kind='bar')
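# Optional (a sketch): view the per-pixel variance as an 8x8 image. The border
# pixels carry almost no variance, which is part of why PCA can later discard
# many of the 64 dimensions without losing much information.
pixel_variance = data_features.var(axis=0).to_numpy().reshape(8, 8)
plt.figure(figsize=(4, 4))
plt.imshow(pixel_variance)
plt.colorbar()
plt.title('Per-pixel variance')
plt.show()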
def test_train_splitter(reduced_data_features):
    train, test = train_test_split(reduced_data_features, test_size=0.20, random_state=1)
    division = train.shape[1] - 1  # index of the label column
    train_without_label = train.iloc[:, 0:division].values
    train_with_label = train.iloc[:, :].values
    test_data = test.iloc[:, 0:division].values
    test_label = test.iloc[:, division].values
    return train_with_label, train_without_label, test_data, test_label
def centroid_initialization(train_with_label, train_without_label):
    i = 0
    Centroids = np.array([]).reshape(no_of_features, 0)
    while i < 10:
        rand = rd.randint(0, m - 1)
        # A random point is drawn and its label is compared with i, which runs from 0 to 9.
        # If the label matches i, that point is added to the centroids and i is incremented.
        # This ensures that cluster i starts from an image whose label is i,
        # so cluster 0 will most probably correspond to digit 0, and so on.
        if train_with_label[rand, no_of_features] == i:
            Centroids = np.c_[Centroids, train_without_label[rand]]
            i = i + 1
    return Centroids
def visualize_50_digits_in_cluster(Output):
    for j in range(0, 10):
        plt.figure(figsize=(30, 10))
        print('Cluster ' + str(j))
        print(str(Output[j].shape[1]) + " elements")
        # Show at most 50 digits per cluster (fewer if the cluster is smaller)
        for i in range(0, min(50, Output[j].shape[1])):
            plt.subplot(5, 10, i + 1)
            image = Output[j][0:no_of_features, i]
            image = image.reshape(8, 8)
            plt.imshow(image, cmap='gray')
            plt.axis('off')
        plt.show()
def print_number_in_all_cluster(Output):
    for i in range(0, 10):
        print("Cluster " + str(i))
        print("Number of elements in this cluster: " + str(Output[i].shape[1]))
        print(Output[i][no_of_features, :])  # the true labels of the points assigned to this cluster
def measuring_accuracy(Centroids, test_data, test_label):
    # Using the centroids learnt from the training data, assign each test point to the
    # centroid with the smallest Euclidean distance.
    number_of_test_data = test_data.shape[0]
    EuclidianDistance = np.array([]).reshape(number_of_test_data, 0)
    for k in range(K):
        tempDist = np.sum((test_data - Centroids[:, k]) ** 2, axis=1)
        EuclidianDistance = np.c_[EuclidianDistance, tempDist]
    predicted_value = np.argmin(EuclidianDistance, axis=1)  # cluster to which each test point is assigned
    number_of_correct_prediction = 0
    for i in range(0, predicted_value.shape[0]):
        if predicted_value[i] == test_label[i]:  # comparing the cluster index with the actual label counts the correct predictions
            number_of_correct_prediction = number_of_correct_prediction + 1
    accuracy = number_of_correct_prediction / number_of_test_data * 100
    print('Accuracy on the test dataset:')
    print(accuracy)
def kmeans_algorithm(train_with_label, train_without_label, Centroids):
    num_iter = 100  # fixed number of iterations (no explicit convergence check)
    Output = {}  # stores, for every cluster, the points assigned to it (with their labels)
    for n in range(num_iter):
        EuclidianDistance = np.array([]).reshape(m, 0)
        for k in range(K):
            tempDist = np.sum((train_without_label - Centroids[:, k]) ** 2, axis=1)
            EuclidianDistance = np.c_[EuclidianDistance, tempDist]
        C = np.argmin(EuclidianDistance, axis=1)  # cluster of each point, based on the shortest distance to a centroid
        # Recompute the cluster means
        Z = {}
        for k in range(K):
            Z[k] = np.array([]).reshape(no_of_features, 0)
            Output[k] = np.array([]).reshape(no_of_features + 1, 0)
        for i in range(m):
            Z[C[i]] = np.c_[Z[C[i]], train_without_label[i]]
            Output[C[i]] = np.c_[Output[C[i]], train_with_label[i]]
        for k in range(K):
            Z[k] = Z[k].T
        for k in range(K):
            Centroids[:, k] = np.mean(Z[k], axis=0)
    return Output
train_with_label,train_without_label,test_data,test_label = test_train_splitter(optdigits_df)
m = train_without_label.shape[0] #no. of points in training dataset
no_of_features = train_without_label.shape[1] #number of features
K=10 #number of clusters
Centroids = centroid_initialization(train_with_label,train_without_label)
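# Optional (a minimal sketch): plot the ten initial centroids. Because
# centroid_initialization picks a training image whose label equals the cluster
# index, this should show the digits 0-9 in order. This is only meaningful here,
# before PCA, where each centroid still has 64 pixel features.
plt.figure(figsize=(10, 2))
for k in range(K):
    plt.subplot(1, 10, k + 1)
    plt.imshow(Centroids[:, k].reshape(8, 8), cmap='gray')
    plt.axis('off')
plt.show()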
Output = kmeans_algorithm(train_with_label,train_without_label,Centroids)
visualize_50_digits_in_cluster(Output)
print_number_in_all_cluster(Output)
#This accuracy is measured without applying PCA, by running K-means directly on the raw 64-dimensional features
measuring_accuracy(Centroids,test_data,test_label)
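# Optional cross-check (a sketch, assuming scikit-learn's KMeans is an acceptable
# reference, not part of the original pipeline): fit sklearn's KMeans seeded with
# our digit-ordered centroids, so cluster k still corresponds to digit k, and
# report the same label-matching accuracy on the test set.
from sklearn.cluster import KMeans
sk_km = KMeans(n_clusters=K, init=Centroids.T, n_init=1)
sk_km.fit(train_without_label)
sk_accuracy = np.mean(sk_km.predict(test_data) == test_label) * 100
print('Reference sklearn KMeans accuracy on the test set:', sk_accuracy)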
#Now apply PCA to the dataset to find the principal components that explain 90% of the variance
def normalising_function(X):
    X_standard = StandardScaler().fit_transform(X)  # Scale the dataset so every column has the same range, which helps the later computations
    X_mean = np.mean(X_standard, axis=0)  # Mean of each column (close to zero after standardisation)
    X_cov = (X_standard - X_mean).T.dot(X_standard - X_mean) / (X_standard.shape[0] - 1)  # Covariance matrix
    return X_standard, X_mean, X_cov  # Return the calculated matrices
def Principal_Component_Analysis(X):
    X_standard, X_mean, X_cov = normalising_function(X)  # Standardised data, its mean and its covariance matrix
    X_standard = X_standard - X_standard.mean(axis=0)  # Subtract the mean of each column from the dataset
    fig = plt.figure(figsize=(10, 10))
    sns.heatmap(pd.DataFrame(X_cov))  # Heatmap of the covariance matrix
    plt.show()
    eigenvalues, eigenvectors = np.linalg.eig(X_cov)  # Eigenvalues and eigenvectors of the covariance matrix
    # Pair every eigenvalue with its corresponding eigenvector
    unsorted_eigen_value_vector_pair = [(np.abs(eigenvalues[i]), eigenvectors[:, i]) for i in range(len(eigenvalues))]
    # Sort the pairs in decreasing order of eigenvalue
    sorted_eigen_value_vector_pair = sorted(unsorted_eigen_value_vector_pair, reverse=True, key=lambda x: x[0])
    # Heatmap of the leading eigenvectors: this helps visualise which eigenvector
    # components are more prominent, since they show up with greater intensity
    fig = plt.figure(figsize=(15, 4))
    sns.heatmap(pd.DataFrame([pair[1] for pair in sorted_eigen_value_vector_pair[0:21]]), annot=False, cmap='coolwarm', vmin=-0.5, vmax=0.5)
    plt.ylabel("Ranked Eigen Values")
    plt.xlabel("Eigen Vector Components")
    plt.show()
    # Variance explained by each eigenvalue
    lam_sum = sum(eigenvalues)
    explained_variance = [(lam_k / lam_sum) for lam_k in sorted(eigenvalues, reverse=True)]
    # Scree graph of the individual explained variance ratios
    plt.figure(figsize=(6, 4))
    plt.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center', label=r'Individual explained variance $\lambda_{k}$')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Ranked Principal Components')
    plt.title('Scree Graph')
    plt.legend(loc='best')
    plt.tight_layout()
    # Cumulative explained variance: as required, roughly 90% of the variance is explained by around 30-35 components
    fig = plt.figure(figsize=(6, 4))
    ax1 = fig.add_subplot(111)
    ax1.plot(np.cumsum(explained_variance))
    ax1.set_ylim([0, 1.0])
    ax1.set_xlabel('Number of Principal Components')
    ax1.set_ylabel('Cumulative explained variance')
    ax1.set_title('Explained Variance')
    plt.show()
    # Print the component indices together with the cumulative variance they explain,
    # to confirm that the first 32 eigenvalue-vector pairs explain about 90% of the variance
    print("\nCumulative variance explained by the components:")
    print([(j, np.cumsum(explained_variance)[j]) for j in range(len(explained_variance[:64]))])
    print("\nChoosing 5 principal components explains", np.cumsum(explained_variance)[4] * 100, "% of the variance")
    print("Choosing 25 principal components explains", np.cumsum(explained_variance)[24] * 100, "% of the variance")
    print("Choosing 32 principal components explains", np.cumsum(explained_variance)[31] * 100, "% of the variance\n")
    # Build the projection matrix W from the 32 most important eigenvectors;
    # multiplying the dataset by W projects it onto these principal components and reduces the number of features
    matW = np.hstack([pair[1].reshape(64, 1) for pair in sorted_eigen_value_vector_pair[0:32]])
    print("\nShape of the projection matrix W:", matW.shape)
    # Project the data and return the reduced feature matrix
    Z = X_standard.dot(matW)
    print("Shape of the returned features:", Z.shape)
    return Z
reduced_data_features=Principal_Component_Analysis(data_features)
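# Optional cross-check (a sketch, assuming sklearn's PCA is an acceptable
# reference): PCA(n_components=0.90) keeps the smallest number of components
# whose cumulative explained variance reaches 90%, which should roughly agree
# with the 32 components chosen above (the data is standardised first to match
# the covariance computation used in Principal_Component_Analysis).
from sklearn.decomposition import PCA
pca_check = PCA(n_components=0.90)
pca_check.fit(StandardScaler().fit_transform(data_features))
print('sklearn PCA keeps', pca_check.n_components_, 'components to explain 90% of the variance')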
pca_reduced_dataset = pd.DataFrame(reduced_data_features)
pca_reduced_dataset['label'] =data_labels
train_with_label,train_without_label,test_data,test_label = test_train_splitter(pca_reduced_dataset)
m = train_without_label.shape[0] #no. of points in training dataset
no_of_features = train_without_label.shape[1] #number of features
K=10 #number of clusters
Centroids = centroid_initialization(train_with_label,train_without_label)
Output = kmeans_algorithm(train_with_label,train_without_label,Centroids)
print_number_in_all_cluster(Output)
#This is the accuracy obtained after applying PCA to the dataset and then running K-means on the reduced features
measuring_accuracy(Centroids,test_data,test_label)
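# Optional summary (a sketch, not part of the original pipeline): per-cluster
# purity on the PCA-reduced training data, i.e. the fraction of points in each
# cluster whose true label matches the cluster index. This complements the single
# test-set accuracy figure printed above.
for k in range(K):
    labels_in_cluster = Output[k][no_of_features, :]
    if labels_in_cluster.size > 0:
        purity = np.mean(labels_in_cluster == k) * 100
    else:
        purity = 0.0
    print('Cluster %d purity: %.1f%%' % (k, purity))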