-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathk_means.py
47 lines (44 loc) · 2.01 KB
/
k_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# dataSet样本点,k 簇的个数
# disMeas距离量度,默认为欧几里得距离
# createCent,初始点的选取
import numpy
def loadDataSet(fileName): #general function to parse tab -delimited floats
dataMat = [] #assume last column is target value
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fltLine = map(float,curLine) #map all elements to float()
dataMat.append(fltLine)
return dataMat
def distEclud(vecA, vecB):
return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)
def randCent(dataSet, k):
n = shape(dataSet)[1]
centroids = mat(zeros((k,n)))#create centroid mat
for j in range(n):#create random cluster centers, within bounds of each dimension
minJ = min(dataSet[:,j])
rangeJ = float(max(dataSet[:,j]) - minJ)
centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
m = dataSet.shape[0] #样本数
clusterAssment = mat(zeros((m,2))) #m*2的矩阵
centroids = createCent(dataSet, k) #初始化k个中心
clusterChanged = True
while clusterChanged: #当聚类不再变化
clusterChanged = False
for i in range(m):
minDist = inf; minIndex = -1
for j in range(k): #找到最近的质心
distJI = distMeas(centroids[j,:],dataSet[i,:])
if distJI < minDist:
minDist = distJI; minIndex = j
if clusterAssment[i,0] != minIndex: clusterChanged = True
# 第1列为所属质心,第2列为距离
clusterAssment[i,:] = minIndex,minDist**2
print(centroids)
# 更改质心位置
for cent in range(k):
ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]
centroids[cent,:] = mean(ptsInClust, axis=0)
return centroids, clusterAssment