-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvisualize.py
208 lines (168 loc) · 8.4 KB
/
visualize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
## Author: Scott Emmons
## Purpose: To visualize results of the clustering analysis workflow.
## Date: February 17, 2014
import argparse
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
##########################
# Module-level constants #
##########################
# Relative directory under which the script writes its output
out_directory = 'generated_visualizations/'
# Path of the CSV file that holds the data merged from all raw data files
compiled_data = '%scompiled_data.csv' % out_directory
def handleArgs(argv=None):
    """
    Parse and validate the command-line input arguments.
    The parsed namespace is stored in the module-level variable args and is
    also returned.
    Input variables:
        argv - optional list of argument strings to parse; when None,
               argparse reads the arguments from sys.argv
    Output:
        the argparse namespace with the attributes raw_data_file_paths,
        raw_data_file_names and raw_data_nodes
    Raises:
        SystemExit (raised by argparse) when a required argument is missing
        or when the three argument lists differ in length
    """
    global args
    # Collect user input of raw data files to analyze, their corresponding names, and the number of nodes that corresponds to each file
    parser = argparse.ArgumentParser(description="Visualize the summary statistics of the clustering analysis workflow")
    parser.add_argument("-f", "--files", nargs="+", required=True, help="the raw data files to visualize", dest="raw_data_file_paths")
    parser.add_argument("--names", nargs="+", required=True, help="the name of each raw data file, in corresponding order", dest="raw_data_file_names")
    parser.add_argument("--nodes", nargs="+", type=int, required=True, help="the number of nodes in each of the indicated raw data files, in corresponding order", dest="raw_data_nodes")
    args = parser.parse_args(argv)
    # Ensure that the given raw data files, their names, and corresponding number of nodes are equal in length.
    # Each length is compared explicitly: a chained "a != b != c" only means "a != b and b != c"
    # and therefore misses most mismatches.
    expected = len(args.raw_data_file_paths)
    if len(args.raw_data_file_names) != expected or len(args.raw_data_nodes) != expected:
        # parser.error prints the usage text plus this message and exits
        parser.error('the length of -f (--files) must match that of --names and --nodes')
    return args
def makeDirIfNeeded(directory):
    """
    Create directory, including any missing parent directories, unless it already exists.
    Input variables:
        directory - a string specifying the path of the directory to create
    Raises:
        OSError if the directory does not exist and cannot be created
    """
    try:
        os.makedirs(directory)
    except OSError:
        # An already existing directory is the expected case and is ignored;
        # any other failure (permissions, a regular file in the way, ...)
        # is re-raised instead of being silently swallowed
        if not os.path.isdir(directory):
            raise
def compileData(file_paths, destination_path, headers = True):
    """
    Write data from multiple data files into one file.
    Input variables:
        file_paths - a list of strings specifying the paths of the files to merge, in order
        destination_path - a string specifying the path of the merged file to write
        headers - when True, the first line of the first file is kept and the first
                  line of every later file is skipped; when False, the first line
                  of every file is skipped
    Output:
        writes the merged file to destination_path and prints a status message
    """
    makeDirIfNeeded(out_directory)
    skip_first_line = not headers
    # Text mode on both ends so the lines read are the same type as the lines written
    with open(destination_path, 'w') as destination:
        for path in file_paths:
            with open(path, 'r') as source:
                # Skip column names for all but first data file
                if skip_first_line:
                    source.readline()
                skip_first_line = True
                line = ''
                for line in source:
                    destination.write(line)
                # Terminate an unfinished last line so that the first row of the
                # next file does not get appended to it
                if line and not line.endswith('\n'):
                    destination.write('\n')
    print('\nSuccessfully compiled data from file(s) ' + ', '.join(file_paths))
def runIndividualVisualization(file_path, file_name):
    """
    Visualize the individual metrics in file_path, using file_name to name the output.
    Dependencies:
        pandas Python library
        matplotlib Python library
    Input variables:
        file_path - a string specifying the path to the file containing the data to visualize;
                    the columns 'name', 'metric' and 'value' are read from it
        file_name - a string specifying the name of the file to use for function output naming
    Output:
        writes, into the directory out_directory + file_name, one boxplot pdf per
        clustering algorithm, one boxplot pdf per metric, and grouped_metrics.csv
        with the mean per (metric, name) group
    """
    # Create a data frame of the data stored in the file
    df = pd.read_csv(file_path)
    # Create the output directory once, up front, so that grouped_metrics.csv
    # can be written even when no figure is produced
    write_directory = out_directory + file_name + '/'
    makeDirIfNeeded(write_directory)
    # For each clustering algorithm, visualize performance by metric
    for algorithm in df['name'].unique():
        algorithm_frame = df[df['name'] == algorithm][['metric', 'value']]
        boxplot = algorithm_frame.boxplot(by="metric", showmeans=True, meanline=True)
        boxplot.set_title(algorithm + ' Clustering Metric Values')
        boxplot.set_xlabel('')
        boxplot.tick_params(axis="x", labelsize=5)
        # Fixed y-range; presumably the metric values lie in [0, 1] - TODO confirm
        boxplot.set_ylim(-0.1, 1.1)
        figure = boxplot.get_figure()
        figure.suptitle(file_name)
        figure.savefig(write_directory + algorithm + '_boxplot' + '.pdf')
        # Close this specific figure rather than whichever one is current
        plt.close(figure)
    grouped = df.groupby(['metric', 'name']).mean()
    grouped.to_csv(write_directory + 'grouped_metrics.csv')
    # For each evaluation metric, visualize performance by clustering algorithm
    for metric in df['metric'].unique():
        metric_frame = df[df['metric'] == metric][['name', 'value']]
        boxplot = metric_frame.boxplot(by="name", showmeans=True, meanline=True)
        boxplot.set_title(metric + ' Performance by Algorithm')
        boxplot.set_xlabel('')
        boxplot.set_ylim(-0.1, 1.1)
        figure = boxplot.get_figure()
        figure.suptitle(file_name)
        figure.savefig(write_directory + metric.lower().replace(' ', '_') + '_boxplot' + '.pdf')
        plt.close(figure)
    print('\nSuccessfully ran individual visualization on data file ' + file_path)
def _meanValueTable(frame, column):
    """Return a data frame of the mean 'value' per 'n' (rows) and per unique entry of frame[column] (columns)."""
    n_values = frame['n'].unique()
    table = {}
    for key in frame[column].unique():
        key_match = frame[column] == key
        # One mean per n; a combination without rows yields NaN
        means = [frame[key_match & (frame['n'] == n)]['value'].mean() for n in n_values]
        table[key] = pd.Series(means, index=n_values)
    return pd.DataFrame(table)


def _saveLineGraph(linegraph, path):
    """Write the figure that owns the axes linegraph to path and close it."""
    figure = linegraph.get_figure()
    figure.tight_layout()
    figure.savefig(path)
    plt.close(figure)


def runCombinedVisualization(combined_data, write_subdirectory = ''):
    """
    Visualize how the mean metric values change with n in the combined data file.
    Dependencies:
        pandas Python library
        matplotlib Python library
    Input variables:
        combined_data - a string specifying the path to the csv file to visualize;
                        the columns 'name', 'metric', 'n' and 'value' are read from it
        write_subdirectory - a string specifying the subdirectory of out_directory
                             that receives the output
    Output:
        writes, into out_directory + write_subdirectory, one line graph pdf per
        clustering algorithm, one line graph pdf per metric, and
        grouped_combined_metrics.csv with the mean per (metric, name, n) group
    """
    # Create a data frame of the data stored in the file
    df = pd.read_csv(combined_data)
    # Create the output directory once, up front, so that the csv summary
    # can be written even when no figure is produced
    write_directory = out_directory + write_subdirectory
    makeDirIfNeeded(write_directory)
    # For each clustering algorithm, visualize performance by metric
    for algorithm in df['name'].unique():
        algorithm_frame = df[df['name'] == algorithm][['n', 'metric', 'value']]
        linegraph = _meanValueTable(algorithm_frame, 'metric').plot(marker='^')
        linegraph.set_title(algorithm + ' Clustering Metric Values')
        # Widen the automatically chosen axis limits so markers do not sit on the border
        x_low, x_high = linegraph.get_xlim()
        linegraph.set_xlim(x_low * 0.90, x_high * 1.1)
        # NOTE(review): the y-range here scales with the data, while the per-metric
        # graphs below use a fixed (-0.1, 1.1) - confirm that the difference is intended
        y_high = linegraph.get_ylim()[1]
        linegraph.set_ylim(y_high * -0.10, y_high * 1.1)
        _saveLineGraph(linegraph, write_directory + '/' + algorithm.lower().replace(' ', '_') + '_linegraph' + '.pdf')
    grouped = df.groupby(['metric', 'name', 'n']).mean()
    grouped.to_csv(write_directory + '/grouped_combined_metrics.csv')
    # For each evaluation metric, visualize performance by clustering algorithm
    for metric in df['metric'].unique():
        metric_frame = df[df['metric'] == metric][['n', 'name', 'value']]
        linegraph = _meanValueTable(metric_frame, 'name').plot(marker='^')
        linegraph.set_title(metric + ' Performance by Algorithm')
        x_low, x_high = linegraph.get_xlim()
        linegraph.set_xlim(x_low * 0.90, x_high * 1.1)
        linegraph.set_ylim(-0.1, 1.1)
        _saveLineGraph(linegraph, write_directory + '/' + metric.lower().replace(' ', '_') + '_linegraph' + '.pdf')
    print('\nSuccessfully ran combined visualization on data file ' + combined_data)
if __name__ == "__main__":
    # Parse command-line arguments; handleArgs stores them in the global args
    handleArgs()
    # Compile raw data file paths into single file for integrated analysis
    compileData(args.raw_data_file_paths, compiled_data)
    # Iterate over each raw data file and create individual visualizations.
    # The name is looked up by position, so a missing name raises an IndexError
    # instead of silently skipping the file.
    for i, file_path in enumerate(args.raw_data_file_paths):
        runIndividualVisualization(file_path, args.raw_data_file_names[i])
    # Visualize combined data to show how algorithms scale with n
    runCombinedVisualization(compiled_data, write_subdirectory = 'combined')