-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvisualize_developing.py
228 lines (185 loc) · 9.85 KB
/
visualize_developing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
## Author: Scott Emmons
## Purpose: To visualize results of the clustering analysis workflow.
## Date: February 17, 2014
import argparse
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
####################
# Global variables #
####################
def handleArgs(argv=None):
    """Parse command-line arguments into the module-level ``args`` namespace.

    Parameters:
        argv - optional list of argument strings; defaults to sys.argv[1:].
               (Added for testability; omitting it preserves old behavior.)

    Returns:
        the parsed argparse.Namespace (also stored in the global ``args``)

    Exits (via parser.error) when -f/--files, --names, and --nodes do not
    all have the same number of entries.
    """
    parser = argparse.ArgumentParser(description="Visualize the summary statistics of the clustering analysis workflow")
    parser.add_argument("-f", "--files", nargs="+", required=True, help="the raw data files to visualize", dest="raw_data_file_paths")
    parser.add_argument("--names", nargs="+", required=True, help="the name of each raw data file, in corresponding order", dest="raw_data_file_names")
    parser.add_argument("--nodes", nargs="+", type=int, required=True, help="the number of nodes in each of the indicated raw data files, in corresponding order", dest="raw_data_nodes")
    parser.add_argument("--working_dir", required=True, help="path to a benchmark", dest="benchmark_dir")
    global args
    args = parser.parse_args(argv)
    # BUG FIX: the original chained comparison (a != b != c) never compared
    # the first and third lengths; require all three to be equal explicitly,
    # and report via parser.error instead of a bare assert (asserts are
    # stripped under python -O).
    if not (len(args.raw_data_file_paths) == len(args.raw_data_file_names) == len(args.raw_data_nodes)):
        parser.error('the length of -f (--files) must match that of --names and --nodes')
    return args
def makeDirIfNeeded(directory):
    """Create ``directory`` if it does not already exist (best effort).

    Uses os.makedirs so missing parent directories are created too
    (os.mkdir raised OSError when a parent was absent). An OSError from an
    already-existing directory is deliberately swallowed so callers can use
    this as an idempotent "ensure directory" helper.
    """
    try:
        os.makedirs(directory)
    except OSError:
        # Directory already exists (or cannot be created); callers treat
        # directory creation as best-effort, so ignore either way.
        pass
def compileData(file_paths, destination_path, headers = True):
    """Write data from multiple data files into one file.

    Parameters:
        file_paths - iterable of paths to the source data files, in order
        destination_path - path of the combined output file
        headers - when True, the first line of each file after the first is
                  skipped so the combined file keeps exactly one header row;
                  when False, the first line of every file is skipped

    Side effects:
        creates the destination directory if needed, writes the combined
        file, and prints a status message listing the compiled files.
    """
    # BUG FIX: derive the directory to create from destination_path itself
    # instead of relying on the module-level out_directory global.
    dest_dir = os.path.dirname(destination_path)
    if dest_dir:
        try:
            os.makedirs(dest_dir)
        except OSError:
            pass  # already exists
    write_header = headers
    # BUG FIX: open in text mode ('w', not 'wb') since we copy text lines,
    # and use context managers so files are closed even on error.
    with open(destination_path, 'w') as destination:
        for path in file_paths:
            with open(path, 'r') as source:
                # Skip column names for all but first data file
                if not write_header:
                    source.readline()
                else:
                    write_header = False
                for line in source:
                    destination.write(line)
    status_message = '\nSuccessfully compiled data from file(s)'
    for path in file_paths:
        status_message = status_message + ' ' + path + ','
    print(status_message[:-1])  # drop the trailing comma
def runIndividualVisualization(file_path, file_name):
    """
    Visualize the individual metrics in file_path, using file_name to name the output.

    Dependencies:
        pandas Python library (matplotlib is used for figure handling)

    Input variables:
        file_path - a string specifying the path to the file containing the data to visualize
        file_name - a string specifying the name of the file to use for function output naming

    Output:
        writes one boxplot PDF per algorithm and one per metric, plus a CSV
        of per-(metric, algorithm) means, under out_directory/file_name/

    NOTE(review): relies on the module-level ``out_directory`` global being
    set before this function is called (done in the __main__ block).
    """
    # Create a data frame of the data stored in the file.
    # The CSV is expected to contain at least the columns 'name' (algorithm),
    # 'metric', and 'value'.
    df = pd.read_csv(file_path)
    # For each clustering algorithm, visualize performance by metric
    algorithms = df['name'].unique()
    for algorithm in algorithms:
        algorithm_frame = df[df['name'] == algorithm][['metric', 'value']]
        # One box per metric; also draw the mean as a line in each box
        boxplot = algorithm_frame.boxplot(by="metric", showmeans=True, meanline=True)
        # Enlarge the title, axis labels, and tick labels uniformly
        for item in ([boxplot.title, boxplot.xaxis.label, boxplot.yaxis.label] + boxplot.get_xticklabels() + boxplot.get_yticklabels()):
            item.set_fontsize(15)
        boxplot.set_title(algorithm + ' Clustering Metric Values', fontsize=15)
        boxplot.set_xlabel('')
        boxplot.tick_params(axis="x", labelsize=5)
        # y-limits padded around [0, 1]; presumably metric values lie in
        # that range — TODO confirm against the data files
        boxplot.set_ylim(-0.1, 1.1)
        legend = boxplot.legend(loc = 'best', prop={'size':15})
        # plt.setp(legend.get_title(),fontsize=18)
        plt.xticks(rotation=30)
        figure = boxplot.get_figure()
        figure.suptitle(file_name)
        makeDirIfNeeded(out_directory + file_name + '/')
        # Sanitize the algorithm name into a file-system-friendly file name
        figure.savefig(out_directory + file_name + '/' + algorithm.lower().replace(' ', '_').replace('.', '_').replace('-', '_') + '_boxplot' + '.pdf')
        plt.close()
    # Persist the per-(metric, algorithm) means alongside the plots.
    # NOTE(review): if ``algorithms`` is empty the output directory was never
    # created above and this to_csv would fail — verify inputs are non-empty.
    grouped=df.groupby(['metric','name']).mean()
    grouped.to_csv(out_directory + file_name + '/grouped_metrics.csv')
    # For each evaluation metric, visualize performance by clustering algorithm
    metrics = df['metric'].unique()
    for metric in metrics:
        metric_frame = df[df['metric'] == metric][['name', 'value']]
        # One box per algorithm for this metric
        boxplot = metric_frame.boxplot(by="name", showmeans=True, meanline=True)
        for item in ([boxplot.title, boxplot.xaxis.label, boxplot.yaxis.label] + boxplot.get_xticklabels() + boxplot.get_yticklabels()):
            item.set_fontsize(15)
        boxplot.set_title(metric + ' Performance by Algorithm', fontsize=15)
        boxplot.set_xlabel('')
        boxplot.set_ylim(-0.1, 1.1)
        legend = boxplot.legend(loc = 'best', prop={'size':15})
        # plt.setp(legend.get_title(),fontsize=15)
        plt.xticks(rotation=20)
        figure = boxplot.get_figure()
        figure.suptitle(file_name)
        makeDirIfNeeded(out_directory + file_name + '/')
        # Sanitize the metric name into a file-system-friendly file name
        figure.savefig(out_directory + file_name + '/' + metric.lower().replace(' ', '_').replace('.', '_').replace('-', '_') + '_boxplot' + '.pdf')
        plt.close()
    print('\nSuccessfully ran individual visualization on data file ' + file_path)
def runCombinedVisualization(combined_data, write_subdirectory = ''):
    """Visualize how clustering performance scales with network size n.

    Parameters:
        combined_data - path to a CSV with at least the columns 'name'
                        (algorithm), 'metric', 'n', and 'value'
        write_subdirectory - subdirectory of out_directory for the output

    Output:
        one line-graph PDF per algorithm (mean metric score vs. n), one per
        metric (mean algorithm score vs. n), and a CSV of grouped means.

    NOTE(review): relies on the module-level ``out_directory`` global being
    set before this function is called (done in the __main__ block).
    """
    # Create a data frame of the data stored in the file
    df = pd.read_csv(combined_data)
    # Ensure the output directory exists once, up front (the original only
    # created it inside the plotting loops, so an empty algorithm set left
    # the grouped-means to_csv below pointing at a missing directory).
    makeDirIfNeeded(out_directory + write_subdirectory)
    # For each clustering algorithm, visualize performance by metric
    algorithms = df['name'].unique()
    for algorithm in algorithms:
        algorithm_frame = df[df['name'] == algorithm][['n', 'metric', 'value']]
        # Build {metric: Series of mean value indexed by n} for the plot.
        # FIX: accumulate means in a plain dict instead of the deprecated
        # Series.append (removed in pandas 2.0).
        d = {}
        for metric in algorithm_frame['metric'].unique():
            metric_match = algorithm_frame['metric'] == metric
            means = {}
            for n in algorithm_frame['n'].unique():
                n_match = algorithm_frame['n'] == n
                means[n] = algorithm_frame[metric_match & n_match]['value'].mean()
            d[metric] = pd.Series(means)
        # Create and visualize the dataframe
        vis_frame = pd.DataFrame(d)
        linegraph = vis_frame.plot(marker='^')
        for item in ([linegraph.title, linegraph.xaxis.label, linegraph.yaxis.label] + linegraph.get_xticklabels() + linegraph.get_yticklabels()):
            item.set_fontsize(18)
        linegraph.set_title(algorithm + ' clustering metric scores', fontsize=18)
        # Pad both axes slightly so the end markers are not clipped
        linegraph.set_xlim(linegraph.get_xlim()[0] * 0.90, linegraph.get_xlim()[1] * 1.1)
        linegraph.set_ylim(linegraph.get_ylim()[1] * -0.10, linegraph.get_ylim()[1] * 1.1)
        figure = linegraph.get_figure()
        figure.tight_layout()
        # Sanitize the algorithm name into a file-system-friendly file name
        figure.savefig(out_directory + write_subdirectory + '/' + algorithm.lower().replace(' ', '_').replace('.', '_').replace('-', '_') + '_linegraph' + '.pdf')
        plt.close()
    # Persist the per-(metric, algorithm, n) means alongside the plots
    grouped = df.groupby(['metric','name','n']).mean()
    grouped.to_csv(out_directory + write_subdirectory + '/grouped_combined_metrics.csv')
    # For each evaluation metric, visualize performance by clustering algorithm
    metrics = df['metric'].unique()
    for metric in metrics:
        metric_frame = df[df['metric'] == metric][['n', 'name', 'value']]
        # Build {algorithm: Series of mean value indexed by n}
        d = {}
        for algorithm in metric_frame['name'].unique():
            algorithm_match = metric_frame['name'] == algorithm
            means = {}
            for n in metric_frame['n'].unique():
                n_match = metric_frame['n'] == n
                means[n] = metric_frame[algorithm_match & n_match]['value'].mean()
            d[algorithm] = pd.Series(means)
        # Create and visualize the dataframe
        vis_frame = pd.DataFrame(d)
        linegraph = vis_frame.plot(marker='^')
        for item in ([linegraph.title, linegraph.xaxis.label, linegraph.yaxis.label] + linegraph.get_xticklabels() + linegraph.get_yticklabels()):
            item.set_fontsize(18)
        linegraph.set_title(metric + ' performance by algorithm', fontsize=18)
        linegraph.set_xlim(linegraph.get_xlim()[0] * 0.90, linegraph.get_xlim()[1] * 1.1)
        linegraph.set_ylim(-0.1, 1.1)
        figure = linegraph.get_figure()
        figure.tight_layout()
        # FIX: removed a duplicated .lower().lower() call from the original
        figure.savefig(out_directory + write_subdirectory + '/' + metric.lower().replace(' ', '_').replace('.', '_').replace('-', '_') + '_linegraph' + '.pdf')
        plt.close()
    print('\nSuccessfully ran combined visualization on data file ' + combined_data)
if __name__ == "__main__":
# Parse command-line arguments
handleArgs()
out_directory = args.benchmark_dir + 'generated_visualizations/'
compiled_data = out_directory + 'compiled_data.csv'
# Compile raw data file paths into single file for integrated analysis
compileData(args.raw_data_file_paths, compiled_data)
# Iterate over each raw data file and create individual visualizations
for i in xrange(len(args.raw_data_file_paths)):
file_path = args.raw_data_file_paths[i]
file_name = args.raw_data_file_names[i]
nodes = args.raw_data_nodes[i]
runIndividualVisualization(file_path, file_name)
# Visualize combined data to show how algorithms scale with n
runCombinedVisualization(compiled_data, write_subdirectory = 'combined')