-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkMeans.py
90 lines (70 loc) · 3 KB
/
kMeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
# Seed NumPy's global random state so the random draws made below are repeatable.
np.random.seed(42)
# Load the Iris table; the path is resolved relative to the current working directory.
data = pd.read_csv('../datasets/iris/Iris.csv')
# Columns of the table that are used as clustering features.
feature_set = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
features = data[feature_set]
# Number of clusters (k) to fit.
clusters=3
def init_centeroids(clusters, features):
    """Draw random integer-valued starting centroids for k-means.

    For every feature column, ``clusters`` integers are drawn uniformly from
    ``[1, int(column maximum))`` using NumPy's global random state, so
    seeding ``np.random`` beforehand makes the result repeatable.

    Args:
        clusters: Number of centroids to generate.
        features: 2-D table of samples (rows) by features (columns); a pandas
            DataFrame or anything ``np.asarray`` can turn into a float matrix.

    Returns:
        Float array of shape ``(clusters, n_features)``.

    Raises:
        ValueError: Propagated from ``np.random.randint`` when the integer
            part of a column maximum is not greater than 1, because the draw
            range would then be empty.
    """
    # Column maxima taken through NumPy: this avoids indexing a
    # string-labelled pandas Series with an integer position.
    feature_max = np.asarray(features, dtype=float).max(axis=0)
    centeroids = np.zeros((clusters, feature_max.shape[0]))
    for idx, upper in enumerate(feature_max):
        # Cast the bound explicitly so randint always receives an integer.
        centeroids[:, idx] = np.random.randint(1, int(upper), clusters)
    return centeroids
def compute_distance(features, clusters, centeroids):
    """Return the Euclidean distance from every sample to every centroid.

    Args:
        features: 2-D samples-by-features table (DataFrame or ndarray).
        clusters: Number of centroids to measure against; rows
            ``0 .. clusters - 1`` of ``centeroids`` are used.
        centeroids: 2-D array with one centroid per row.

    Returns:
        Array of shape ``(n_samples, clusters)`` whose entry ``[s, c]`` is the
        distance between sample ``s`` and centroid ``c``.
    """
    n_samples = features.shape[0]
    distance = np.zeros((n_samples, clusters))
    for column in range(clusters):
        center = centeroids[column, :]
        offsets = features - center
        # One row-wise norm per centroid fills that centroid's column.
        distance[:, column] = norm(offsets, axis=1)
    return distance
def within_cluter_sse(features, centeroids, labels):
    """Return the within-cluster sum of squared errors.

    Each sample contributes the squared Euclidean distance to the centroid of
    the cluster it is labelled with. The number of clusters is taken from
    ``centeroids`` itself rather than from a module-level variable, so the
    function only depends on its arguments.

    Args:
        features: 2-D samples-by-features table (DataFrame or ndarray).
        centeroids: 2-D array with one centroid per row.
        labels: Per-sample cluster index; samples whose label does not match
            any centroid row contribute nothing.

    Returns:
        The summed squared distances as a NumPy float.
    """
    points = np.asarray(features, dtype=float)
    centers = np.asarray(centeroids, dtype=float)
    labels = np.asarray(labels)
    total = np.float64(0.0)
    for k in range(centers.shape[0]):
        residuals = points[labels == k] - centers[k]
        # Squared residuals, not plain distances: this is what makes it an SSE.
        total += np.sum(residuals ** 2)
    return total
def get_labels(distances):
    """Pick, for every row of ``distances``, the column holding the minimum.

    Args:
        distances: 2-D array with one row per sample and one column per
            centroid.

    Returns:
        1-D integer array with the index of the nearest centroid per sample.
    """
    nearest = np.argmin(distances, axis=1)
    return nearest
def update_centeroid(features, distances, clusters, labels):
    """Recompute each centroid as the mean of the samples assigned to it.

    A cluster that received no samples would otherwise average an empty
    selection and turn into NaN, which then contaminates every later distance
    computation. Such clusters are instead moved onto the samples that lie
    farthest from their nearest centroid.

    Args:
        features: 2-D samples-by-features table (DataFrame or ndarray).
        distances: ``(n_samples, clusters)`` sample-to-centroid distances.
            Only consulted when an empty cluster has to be relocated; may be
            ``None``, in which case the distance to the overall mean is used.
        clusters: Number of centroids to produce.
        labels: Per-sample cluster index.

    Returns:
        Float array of shape ``(clusters, n_features)``. Rows of empty
        clusters that cannot be relocated (more empty clusters than samples)
        are left at zero.
    """
    labels = np.asarray(labels)
    centeroids = np.zeros([clusters, features.shape[1]])
    empty = []
    for k in range(clusters):
        members = labels == k
        if members.any():
            centeroids[k, :] = np.mean(features[members], axis=0)
        else:
            empty.append(k)

    if empty:
        points = np.asarray(features, dtype=float)
        if distances is None:
            spread = np.linalg.norm(points - points.mean(axis=0), axis=1)
        else:
            # Distance of every sample to its closest centroid.
            spread = np.min(np.asarray(distances, dtype=float), axis=1)
        # Worst-fitted samples first; each empty cluster takes a distinct one.
        farthest_first = np.argsort(spread)[::-1]
        for k, row in zip(empty, farthest_first):
            centeroids[k, :] = points[row]
    return centeroids
def centeroid_distance_change(old, new):
    """Return the element-wise absolute difference between two centroid sets.

    Args:
        old: Centroid array from the previous iteration.
        new: Centroid array from the current iteration.

    Returns:
        Array of the same shape holding ``|new - old|`` per coordinate.
    """
    shift = new - old
    return np.abs(shift)
def plot_data(features, centeroids, labels=None):
    """Scatter-plot the first two feature columns together with the centroids.

    The current figure is cleared, the samples are drawn, the centroids are
    drawn on top as large stars, and the figure is shown.

    Args:
        features: DataFrame of samples; only columns 0 and 1 are plotted.
        centeroids: Array with one centroid per row; only coordinates 0 and 1
            are plotted.
        labels: Optional per-sample cluster indices. When ``None`` all samples
            are drawn in black, otherwise each sample takes the colour of its
            cluster.
    """
    plt.clf()
    centeroids = np.asarray(centeroids)
    # One random RGB colour per centroid, drawn from NumPy's global random
    # state; sized from the centroids themselves instead of a global.
    colormap = np.random.rand(len(centeroids), 3)
    x_values = features.iloc[:, 0]
    y_values = features.iloc[:, 1]
    if labels is None:
        plt.scatter(x_values, y_values, color='k')
    else:
        # A single call with one RGB row per sample replaces a call per point.
        plt.scatter(x_values, y_values, color=colormap[np.asarray(labels)])
    plt.scatter(centeroids[:, 0], centeroids[:, 1], color=colormap,
                marker='*', s=200, edgecolor='k')
    plt.show()
def fit(features, clusters, sensivity = 0.001, max_iter=300):
    """Run k-means on ``features`` and return the final centroids.

    Starting from random centroids, the loop alternates between assigning
    every sample to its nearest centroid and moving each centroid to the mean
    of its samples. It stops once no centroid coordinate moved by more than
    ``sensivity`` or after ``max_iter`` iterations, whichever comes first.
    The state is plotted before the first iteration, after every third
    non-converged iteration, and once more at the end.

    Args:
        features: DataFrame of samples (rows) by features (columns).
        clusters: Number of clusters to fit.
        sensivity: Largest per-coordinate centroid movement that still counts
            as converged.
        max_iter: Upper bound on the number of iterations, so the loop cannot
            run forever when the centroids never settle.

    Returns:
        Array of shape ``(clusters, n_features)`` with the most recently
        computed centroids.
    """
    centeroids = init_centeroids(clusters, features)
    plot_data(features, centeroids)
    labels = None
    for iteration in range(max_iter):
        distances = compute_distance(features, clusters, centeroids)
        labels = get_labels(distances)
        centeroids_new = update_centeroid(features, distances, clusters, labels)
        shift = centeroid_distance_change(centeroids, centeroids_new)
        # Keep the freshest estimate before deciding whether to stop, so the
        # returned centroids are never one step behind.
        centeroids = centeroids_new
        # Every coordinate must be within tolerance; comparing the boolean
        # result of .any() with the tolerance would ignore the tolerance.
        if np.all(shift <= sensivity):
            print('finished')
            break
        if iteration % 3 == 0:
            plot_data(features, centeroids, labels)
    plot_data(features, centeroids, labels)
    return centeroids
# Run the clustering on the loaded features; the returned centroids are discarded.
fit(features, clusters)