AutoNorm and K-means

jimenbian · Jul 11, 2014 · 8e78c76 · 8e78c76
1 parent 2b5206a
commit 8e78c76
Show file tree

Hide file tree

Showing 6 changed files with 1,206 additions and 0 deletions.
diff --git a/AutoNormal/AutoNorm.py b/AutoNormal/AutoNorm.py
@@ -0,0 +1,84 @@
+
+from __future__ import division
+def GetAverage(mat):
+
+    n=len(mat)
+    m= width(mat) 
+    num = [0]*m
+    for j in range(0,m): 
+           for i in mat:
+              num[j]=num[j]+i[j]           
+           num[j]=num[j]/n   
+    return num
+
+def width(lst):
+    i=0
+    for j in lst[0]:
+       i=i+1
+    return i
+
+def GetVar(average,mat):    
+    ListMat=[]
+    for i in mat:    
+        ListMat.append(list(map(lambda x: x[0]-x[1], zip(average, i))))
+
+    n=len(ListMat)
+    m= width(ListMat) 
+    num = [0]*m
+    for j in range(0,m): 
+        for i in ListMat:
+                  num[j]=num[j]+(i[j]*i[j])       
+        num[j]=num[j]/n   
+    return num 
+
+def DenoisMat(mat):
+    average=GetAverage(mat)
+    variance=GetVar(average,mat)
+    section=list(map(lambda x: x[0]+x[1], zip(average, variance)))    
+
+    n=len(mat)
+    m= width(mat) 
+    num = [0]*m
+    denoisMat=[]    
+    for i in mat:
+        for j in range(0,m):
+               if i[j]>section[j]:
+                     i[j]=section[j]
+        denoisMat.append(i)  
+    return denoisMat                
+
+def AutoNorm(mat):   
+    n=len(mat)
+    m= width(mat)     
+    MinNum=[9999999999]*m
+    MaxNum = [0]*m    
+    for i in mat:
+        for j in range(0,m):
+            if i[j]>MaxNum[j]:
+                MaxNum[j]=i[j]
+
+    for p in mat:     
+        for q in range(0,m):
+            if p[q]<=MinNum[q]:
+                    MinNum[q]=p[q]  
+
+    section=list(map(lambda x: x[0]-x[1], zip(MaxNum, MinNum)))
+    print section
+    NormMat=[]
+
+    for k in mat:     
+
+          distance=list(map(lambda x: x[0]-x[1], zip(k, MinNum)))
+          value=list(map(lambda x: x[0]/x[1], zip(distance,section)))
+          NormMat.append(value)           
+    return NormMat        
+
+if __name__=='__main__':
+    mat=[[1,42,512],[4,5,6],[7,8,9],[2,2,2],[2,10,5]]
+    a=GetAverage(mat)
+    b=GetVar(a,mat)
+    print a,
+    print DenoisMat(mat)
+
+#     print list(map(lambda x: x[0]-x[1], zip(v2, v1))) 
+    print AutoNorm(mat)
diff --git a/K-means/K-means/.project b/K-means/K-means/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>K-means</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>
diff --git a/K-means/K-means/.pydevproject b/K-means/K-means/.pydevproject
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+
+<pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/K-means/src</path>
+</pydev_pathproperty>
+</pydev_project>
diff --git a/K-means/K-means/src/Test.py b/K-means/K-means/src/Test.py
@@ -0,0 +1,95 @@
+'''
+@author: hakuri
+'''
+from numpy import *
+import matplotlib.pyplot as plt
+def loadDataSet(fileName):      #general function to parse tab -delimited floats
+    dataMat = []                #assume last column is target value
+    fr = open(fileName)
+    for line in fr.readlines():
+        curLine = line.strip().split('\t')
+        fltLine = map(float,curLine) #map all elements to float()
+        dataMat.append(fltLine)
+    return dataMat
+
+def distEclud(vecA, vecB):
+    return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)
+
+def randCent(dataSet, k):
+    n = shape(dataSet)[1]
+    centroids = mat(zeros((k,n)))#create centroid mat
+    for j in range(n):#create random cluster centers, within bounds of each dimension
+        minJ = min(array(dataSet)[:,j])
+
+        rangeJ = float(max(array(dataSet)[:,j]) - minJ)
+        centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
+
+    return centroids
+
+def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
+    m = shape(dataSet)[0]
+    clusterAssment = mat(zeros((m,2)))#create mat to assign data points                                       #to a centroid, also holds SE of each point
+    centroids = createCent(dataSet, k)
+    clusterChanged = True
+    while clusterChanged:
+        clusterChanged = False
+        for i in range(m):#for each data point assign it to the closest centroid
+            minDist = inf; minIndex = -1
+            for j in range(k):
+                distJI = distMeas(array(centroids)[j,:],array(dataSet)[i,:])
+                if distJI < minDist:
+                    minDist = distJI; minIndex = j
+            if clusterAssment[i,0] != minIndex: clusterChanged = True
+            clusterAssment[i,:] = minIndex,minDist**2
+        print centroids
+#         print nonzero(array(clusterAssment)[:,0]
+        for cent in range(k):#recalculate centroids
+                ptsInClust = dataSet[nonzero(array(clusterAssment)[:,0]==cent)[0][0]]#get all the point in this cluster
+
+                centroids[cent,:] = mean(ptsInClust, axis=0) #assign centroid to mean 
+    id=nonzero(array(clusterAssment)[:,0]==cent)[0] 
+    return centroids, clusterAssment,id
+
+def plotBestFit(dataSet,id,centroids):  
+
+    dataArr = array(dataSet)
+    cent=array(centroids)
+    n = shape(dataArr)[0] 
+    n1=shape(cent)[0]
+    xcord1 = []; ycord1 = []
+    xcord2 = []; ycord2 = []
+    xcord3=[];ycord3=[]
+    j=0
+    for i in range(n):
+        if j in id:
+            xcord1.append(dataArr[i,0]); ycord1.append(dataArr[i,1])
+        else:
+            xcord2.append(dataArr[i,0]); ycord2.append(dataArr[i,1])
+        j=j+1 
+    for k in range(n1):
+          xcord3.append(cent[k,0]);ycord3.append(cent[k,1])    
+
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
+    ax.scatter(xcord2, ycord2, s=30, c='green')
+    ax.scatter(xcord3, ycord3, s=50, c='black')
+
+    plt.xlabel('X1'); plt.ylabel('X2');
+    plt.show()    
+
+
+if __name__=='__main__':
+    dataSet=loadDataSet('/Users/hakuri/Desktop/testSet.txt')
+# #     print randCent(dataSet,2)
+#      print dataSet
+#      
+#      print  kMeans(dataSet,2)
+    a=[]
+    b=[]
+    a, b,id=kMeans(dataSet,2)
+    plotBestFit(dataSet,id,a)
+
+
+
+
diff --git a/K-means/effect.png b/K-means/effect.png