-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclustering.py
More file actions
150 lines (113 loc) · 5.09 KB
/
clustering.py
File metadata and controls
150 lines (113 loc) · 5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
BS1819-1617
Data Structures and Algorithms
Group Assignment-Part 3
Team 3:
- Ahmad Bilal Aslam
- Chris Ying
- Christina Lefkothea Tatli
- Joh B
- Kelvin Goh
- Selly Salkha
"""
import sys
import networkx as nx
import matplotlib.pyplot as plt
##### START OF PART 3 - clustering algorithm #####
##### Add all pairwise correlation, into a list and sort them in ascending order
def SortEdges(correlationTable):
comp=correlationTable.columns
#Initialize the list of edges where each element is a tuple (weight,source,destination)
corrTable=[]
for i in range(0,correlationTable.shape[0]-1):
for j in range(i+1,correlationTable.shape[0]):
corrTable.append((printCor(correlationTable, comp[i],comp[j],compData)[2],comp[i],comp[j]))
corrTable=sorted(corrTable, key=lambda x:x[0])
return corrTable
corrTable=SortEdges(correlationTable)
#Divide the graph into clusters, based on edge weight(correlation)
def clustering(corrTable,k,companies):
#Initialize a dictionary called nodePointers where the keys are each node, with the corresponding value the node itself
nodePointers=dict([ (p, p) for p in companies ])
#Initialize graph G for visualization
G = nx.Graph()
#Repeat the algorithm k times
for i in range(1,k+1):
#Pick the highest-weight edge ("greedy" choice)
edge=corrTable[-i]
source=edge[1]
dest=edge[2]
#Merge the sets containing the source and the destination of the edge
while nodePointers[source]!=source:
source=nodePointers[source]
while nodePointers[dest]!=dest:
dest=nodePointers[dest]
#Update the dictionary
nodePointers[source]=dest
#Test if source and destination nodes are not aldready in the G graph and add them
if source not in G.nodes():
G.add_node(source)
if dest not in G.nodes():
G.add_node(dest)
#Test if the edge (source,dest) is not aldready in the G graph and add it
if (source,dest) not in G.edges():
G.add_edge(source,dest)
return nodePointers,G
nodePointer,G=clustering(corrTable,20,correlationTable.columns)
#Recover the sets, creating a dictionary with indices of the clusters/sets as keys and nodes in each cluster as values of each element of the dictionary
def clusterRecover(nodePointer):
nodeList={}
flag=1
#Initialize a dictionary with all different nodes as keys and value 0 if they hadn't been visited,value 1 if visited
visited={k:0 for k in nodePointer.keys()}
for i in nodePointer.keys():
#For each node, if it and its neighbor is not visited, add the node and the neighbor to a new cluster
if visited[nodePointer[i]]==0 and visited[i]==0:
nodeList[flag]=[]
nodeList[flag].append(i)
visited[i]=flag
if nodePointer[i]!=i:
nodeList[flag].append(nodePointer[i])
visited[nodePointer[i]]=flag
flag+=1
#Else If the neighbor is visited, but the node is not visited, add the node to the cluster of the neighbor
elif visited[nodePointer[i]]!=0 and visited[i]==0:
nodeList[visited[nodePointer[i]]].append(i)
visited[i]=visited[nodePointer[i]]
#Else if the node is visited and the neighbor is not visited, add the neighbor to the cluster of the node
elif visited[nodePointer[i]]==0 and visited[i]!=0:
nodeList[visited[i]].append(nodePointer[i])
visited[nodePointer[i]]=visited[i]
#Else if node and its neighbor are in different clusters,merge them
else:
if visited[nodePointer[i]]!=visited[i]:
temp=visited[nodePointer[i]]
for j in nodeList[visited[nodePointer[i]]]:
print(j)
nodeList[visited[i]].append(j)
visited[j]=visited[i]
del nodeList[temp]
return nodeList
nodeList=clusterRecover(nodePointer)
#Create a list of different colors of size n expressed in color codes
def get_spaced_colors(n):
max_value = 16581375 #255**3
interval = int(max_value / n)
colors = [hex(I)[2:].zfill(6) for I in range(0, max_value, interval)]
colors_=[(int(i[:2], 16), int(i[2:4], 16), int(i[4:], 16)) for i in colors]
return ['#%02x%02x%02x' % i for i in colors_[0:n]]
colors=get_spaced_colors(len(nodeList))
#Assign a color to each node based on each cluster
def nodecoloring(nodeList,colors):
z={}
flag=0
for i in nodeList.values():
color=colors[flag]
for j in i:
z[j]=color
flag+=1
return z
z=nodecoloring(nodeList,colors)
nx.draw_shell(G, node_color=[z[node] for node in G] ,with_labels = True)
plt.show()