# vim: set encoding=utf-8
"""Plot tools: Reusable utilities for working with matplotlib plots.
Contents:
cb_colors Easily distinguishable color sets
cb_color_cycle Color cycle for the above, to use in plots
Functions:
plot_smoothed Plot a time series smoothed with a cosine kernel
plot_acorr Plot the autocorrelation of a timeseries
thin_points Thin data points that are too close to each other
thin_transformed Thin data points in a transformed (e.g. log) space
scatter_thin_points Plot a set of thinned data points with weight info
scatter_mark_outliers
Mark outliers at the edges of a scatterplot
scatter_outliers_size
Make a scatterplot with sizes and outliers
Copyright © 2018 Max Veit.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

from __future__ import unicode_literals, print_function, division

import logging

import matplotlib as mpl
from matplotlib import pyplot
import numpy as np
from scipy import signal
from scipy.spatial import KDTree


logger = logging.getLogger(__name__)


# Some colours for easily distinguishable and colourblind-friendly coding
# Use the 4th colour if necessary; may not be colourblind-friendly though
# from colorbrewer.org
cb_colors = {}
cb_colors['green'] = '#1b9e77'
cb_colors['orange'] = '#d95f02'
cb_colors['purple'] = '#7570b3'
cb_colors['pink'] = '#e7298a'
cb_color_cycle = mpl.rcsetup.cycler(
    'c', [cb_colors[key] for key in ['purple', 'orange', 'green', 'pink']])
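

# Example usage (illustrative sketch; the _example_* helper and the curves
# it plots are made up purely for demonstration): draw a few lines using
# the colourblind-friendly colours defined above.
def _example_cb_colors():
    x = np.linspace(0.0, 2.0 * np.pi, 200)
    fig, ax = pyplot.subplots()
    for i, name in enumerate(['purple', 'orange', 'green', 'pink']):
        ax.plot(x, np.sin(x + 0.5 * i), color=cb_colors[name], label=name)
    ax.legend()
    return fig, ax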


def plot_smoothed(time, data, avg_period, ax=None, **plot_args):
    """Plot a time series smoothed with a cosine kernel

    Parameters:
        time        Array of equally spaced time points
        data        Array of data to plot at those times
        avg_period  Number of time points to average over
                    (width of the averaging kernel)
        ax          Axes to plot into (default: current/new Axes)
        plot_args   Any additional arguments to pass to pyplot.plot()
    """
    logger.info("Averaging period: {:g} fs".format(
        avg_period * (time[1] - time[0])))
    #krnl = np.ones(avg_period) / avg_period
    # The cosine window is provided by scipy.signal.windows in current SciPy
    krnl = signal.windows.cosine(avg_period)
    krnl = krnl / np.sum(krnl)
    data_ma = np.convolve(data, krnl, mode='valid')
    if ax is None:
        ax = pyplot.gca()
    # Integer division so the slice endpoints stay ints under true division
    ax.plot(time[avg_period//2:-(avg_period//2 - 1)], data_ma, **plot_args)
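

# Example usage (illustrative sketch; the synthetic noisy sine and the
# _example_* helper are not part of the module): smooth a signal sampled
# on an equally spaced time grid.
def _example_plot_smoothed():
    time = np.linspace(0.0, 100.0, 2001)
    data = np.sin(0.3 * time) + 0.5 * np.random.randn(time.size)
    fig, ax = pyplot.subplots()
    ax.plot(time, data, alpha=0.3, label='raw')
    plot_smoothed(time, data, avg_period=50, ax=ax, label='smoothed')
    ax.legend()
    return fig, ax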


def plot_acorr(time, data, step_start, sample_dt, corr_guess, axes=None,
               **plot_args):
    """Plot the autocorrelation of a timeseries and show its integral

    The integral is useful for estimating the error on the mean of a
    correlated timeseries; see Sokal's lecture on Monte Carlo methods for
    more information.

    Parameters:
        time        Time values of the timeseries
                    Must be sequential and equally spaced
        data        Data of the timeseries
        step_start  Step (array index) at which to start computing
                    averages and correlation
        sample_dt   Time between two successive samples
        corr_guess  Guess for the correlation time of the series

    Optional parameters:
        axes        Matplotlib axes to plot into. Leaving it as None will
                    plot into the current axes (or create a new one if none
                    exists yet)

    Remaining keyword arguments are passed to the plot() command used to
    create the autocorrelation trace.

    Returns:
        mean        Mean of data in range (after `step_start`)
        serr        Standard error on mean
                    (accounting for autocorrelation)
        n_eff       Number of effective independent samples
        corr_time   Integrated correlation time
    """
    logger.info("Starting from time {:.2f}".format(time[step_start]))
    data_mean = np.mean(data[step_start:])
    data_zeromean = data[step_start:] - np.mean(data[step_start:])
    n = len(data_zeromean)
    dataacorr = (signal.correlate(data_zeromean, data_zeromean, 'same')
                 / (n - np.abs(np.arange(n) - n//2)))
    window_width = int(corr_guess * 6.0 / sample_dt)
    acorr_int = 0.5 * np.sum(
        dataacorr[n//2 - window_width : n//2 + window_width + 1]) * sample_dt
    corr_time = acorr_int / dataacorr[n//2]
    serr_mean = np.sqrt(2.0 * acorr_int / n / sample_dt)
    n_eff = n / 2 / corr_time * sample_dt
    if axes is None:
        axes = pyplot.gca()
    axes.plot(time[step_start:] - time[n//2 + step_start],
              dataacorr)
    axes.grid()
    axes.axvspan(-1.*time[window_width], time[window_width], fc='k', alpha=0.2,
                 label="Integration region")
    axes.axvspan(-1.*corr_time, corr_time, fc='r', alpha=0.3,
                 label="Correlation time")
    axes.set_xlim(-12.0 * corr_guess, 12.0 * corr_guess)
    return data_mean, serr_mean, n_eff, corr_time
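

# Example usage (illustrative sketch; the AR(1) test series is synthetic and
# corr_guess=10.0 is just a rough prior): estimate the mean and its error
# for a correlated timeseries.
def _example_plot_acorr():
    sample_dt = 1.0
    n_steps = 20000
    time = np.arange(n_steps) * sample_dt
    # AR(1) process with a correlation time of roughly 10 steps
    phi = np.exp(-1.0 / 10.0)
    noise = np.random.randn(n_steps)
    data = np.empty(n_steps)
    data[0] = noise[0]
    for i in range(1, n_steps):
        data[i] = phi * data[i - 1] + noise[i]
    fig, ax = pyplot.subplots()
    mean, serr, n_eff, corr_time = plot_acorr(
        time, data, step_start=1000, sample_dt=sample_dt, corr_guess=10.0,
        axes=ax)
    ax.legend()
    return mean, serr, n_eff, corr_time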


def thin_points(data, r=None, nmax=1, density=None, len_scale=0.01):
    """Thin a set of data points down to some target density.

    Uses the "poisonous seedling" thinning algorithm: Pick a point at
    random, and if it has more than a certain number of neighbours
    within some predefined radius, randomly remove neighbours until that
    number is reached. Repeat in turn for all the remaining points.

    If the data has an extreme aspect ratio (either the ratio or its
    reciprocal greater than about 1.5), or is to be plotted with one or
    both axes on a log scale, the function thin_transformed() will usually
    produce better results (NB only for data in 2-D space).

    Parameters:
        data        Data matrix, size (N,d) - N is num points, d is problem
                    dimensionality

    Specify either:
        density     Target density, maximum density in thinned set
    with optional:
        len_scale   Length scale of density averaging, as a fraction of the
                    range in the data (maximum range along any dimension).
                    Default 0.01.
    or:
        r           Absolute length scale of distance averaging (radius of
                    sphere within which neighbours are counted)
    with optional:
        nmax        Maximum number of neighbours for a point (default 1)

    Note that the default of nmax=1 ensures that no two points are
    closer than r, thinning points to a maximum (approximately uniform)
    density of 1/r^dim. This mode of use seems to produce the best
    results in practice.

    NB density is only implemented in two dimensions (i.e. d == 2).
    """
    dim = data.shape[1]
    if density is None and r is None:
        raise ValueError("Must specify r if not using density")
    elif density is not None and dim != 2:
        raise NotImplementedError("Density not implemented for dimension != 2")
    elif density is not None:
        # Choose nmax and r such that r is about 1/100 of the range of
        # the data, but make nmax an integer.
        data_range = np.max(np.max(data, axis=0) - np.min(data, axis=0))
        nmax = int(np.ceil(density * np.pi * (data_range * len_scale)**2))
        r = np.sqrt(nmax / (density * np.pi))
    # Technically the leafsize should be equal to the ratio of volumes of the
    # d-cube to the d-sphere times nmax, but I think this is good enough.
    neightree = KDTree(data, leafsize=max(20, int(nmax) * 2))
    deleted = np.zeros((data.shape[0], ), dtype=bool)
    point_weight = np.ones((data.shape[0], ), dtype=np.float32)
    idces = np.arange(data.shape[0])
    np.random.shuffle(idces)
    for data_idx in idces:
        if deleted[data_idx]:
            continue
        # TODO adjust eps for efficiency?
        neighbours = neightree.query_ball_point(data[data_idx,:], r, eps=0.0)
        neighbours.remove(data_idx)
        if nmax == 1:
            del_idces = neighbours
        else:
            # Delete just enough distinct neighbours to reach the target;
            # sample without replacement so no index is picked twice.
            n_del = max(0, len(neighbours) - nmax + 1)
            del_idces = list(np.random.choice(neighbours, n_del,
                                              replace=False))
        deleted[del_idces] = True
        point_weight[data_idx] += np.sum(point_weight[del_idces])
        point_weight[del_idces] = 0
    return data[~deleted,:], point_weight[~deleted]
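

# Example usage (illustrative sketch with synthetic data): thin a dense 2-D
# Gaussian cloud so that no two kept points are closer than r = 0.1.
def _example_thin_points():
    data = np.random.randn(5000, 2)
    data_thin, weights = thin_points(data, r=0.1, nmax=1)
    logger.info("Kept {:d} of {:d} points".format(
        data_thin.shape[0], data.shape[0]))
    return data_thin, weights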


def thin_transformed(data_full, equalize_aspect=True, aspect=None,
                     do_y_log=False, do_x_log=False, **thin_params):
    """Apply the thinning algorithm in a transformed 2-D data space

    Data points are always returned in the original (untransformed)
    space.

    Parameters:
        equalize_aspect Equalize the aspect ratio (decided by the
                        range of the data, or aspect if given)
                        before thinning
        aspect          Manually specify aspect ratio (width / height)
                        of the data (or log-data if thinning in log space)
        do_y_log        Take the log of the data (y-coordinates)
                        before thinning
        do_x_log        Take the log of the data (x-coordinates)
                        before thinning

    Other keyword arguments are passed on to thin_points(). Note in
    particular that the 'r' parameter is in X axis units.
    """
    if data_full.shape[1] != 2:
        raise ValueError("This function is designed to work in 2-D space; "
                         "got data.shape[1] == {:d} instead.".format(
                             data_full.shape[1]))
    data_trans = data_full.copy()
    if do_y_log:
        data_trans[:,1] = np.log(data_trans[:,1])
    if do_x_log:
        data_trans[:,0] = np.log(data_trans[:,0])
    if equalize_aspect and (aspect is None):
        data_range = np.max(data_trans, axis=0) - np.min(data_trans, axis=0)
        aspect = data_range[0] / data_range[1]
    if equalize_aspect:
        data_trans[:,1] = data_trans[:,1] * aspect
    data_thin, weights = thin_points(data_trans, **thin_params)
    # Undo the transformations
    if equalize_aspect:
        data_thin[:,1] = data_thin[:,1] / aspect
    if do_x_log:
        data_thin[:,0] = np.exp(data_thin[:,0])
    if do_y_log:
        data_thin[:,1] = np.exp(data_thin[:,1])
    return data_thin, weights
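

# Example usage (illustrative sketch with synthetic lognormal data): thin
# points that will be plotted with a logarithmic y axis; here the 'r' value
# is in (transformed) x-axis units.
def _example_thin_transformed():
    x = np.random.randn(5000)
    y = np.exp(2.0 * np.random.randn(5000))
    data = np.column_stack([x, y])
    data_thin, weights = thin_transformed(data, do_y_log=True, r=0.1)
    fig, ax = pyplot.subplots()
    ax.scatter(data_thin[:, 0], data_thin[:, 1], s=2)
    ax.set_yscale('log')
    return fig, ax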


def scatter_thin_points(data_thin, weights, method='size', ax=None,
                        **plot_args):
    """Make a scatterplot with a thinned set of points

    Parameters:
        data_thin   Thinned data, e.g. from thin_points()
        weights     Weights of the thinned data, e.g. from thin_points()
                    (i.e. how many original points each thinned point
                    represents)
        method      How to represent the weights; options are 'size',
                    'color', and 'alpha'
        ax          Existing axis to plot into, if any

    Any remaining keyword arguments are passed to pyplot.scatter().
    """
    if ax is None:
        ax = pyplot.gca()
    if method == 'size':
        # TODO An automatic determination of the base size based on the
        # actual plot size (in pixels) might be nice...
        s_scale = plot_args.pop('s', 2)
        # Retain the border (linewidth) to make it easier to see points' sizes
        pl = ax.scatter(data_thin[:,0], data_thin[:,1],
                        s=(weights * s_scale), **plot_args)
    elif method == 'color':
        plot_args.pop('c', None)
        pl = ax.scatter(data_thin[:,0], data_thin[:,1], linewidth=0, c=weights,
                        **plot_args)
    elif method == 'alpha':
        alpha_min = 0.0
        c_base = plot_args.pop('c', 'k')
        c_base = mpl.colors.to_rgba(c_base)
        c_array = np.zeros((len(weights), 4))
        c_array[:,:] = c_base
        c_array[:,3] = 1.0 + (alpha_min - 1.0) * np.exp(-1.0 * weights)
        pl = ax.scatter(data_thin[:,0], data_thin[:,1], linewidth=0, c=c_array,
                        **plot_args)
    else:
        raise ValueError("Unknown representation method " + method)
    return pl
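

# Example usage (illustrative sketch with synthetic data): thin a point
# cloud and show how many original points each survivor represents by
# scaling the marker size.
def _example_scatter_thin_points():
    data = np.random.randn(5000, 2)
    data_thin, weights = thin_points(data, r=0.1)
    fig, ax = pyplot.subplots()
    scatter_thin_points(data_thin, weights, method='size', ax=ax, s=2)
    return fig, ax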


def scatter_mark_outliers(xs, ys, xlim, ylim, ax=None, **plot_args):
    """Mark outliers at the edges of a scatterplot

    Parameters:
        xs      List of x-coordinates of points, passed to pyplot.scatter()
        ys      List of y-coordinates of points, passed to pyplot.scatter()
        xlim    Limits of x-axis, as a tuple (xmin, xmax)
        ylim    Limits of y-axis, as a tuple (ymin, ymax)
        ax      Existing axis to plot into, if any

    Any other keyword arguments are passed to pyplot.scatter().

    The outliers are marked at the edge of the graph as large red dots,
    by default size 40, color 'r'. If a label was present, the outliers
    will be labelled the same, with " (outliers)" appended.

    Returns the collection of scatterplots - (top, bottom, left, right).
    """
    if ax is None:
        ax = pyplot.gca()
    outl_top = xs[(ys > ylim[1]) & (xs >= xlim[0]) & (xs <= xlim[1])]
    outl_bot = xs[(ys < ylim[0]) & (xs >= xlim[0]) & (xs <= xlim[1])]
    # Let outl_left and outl_right handle the (literal) corner cases
    outl_left = np.array(ys[(xs < xlim[0])])
    outl_right = np.array(ys[(xs > xlim[1])])
    outl_left[outl_left < ylim[0]] = ylim[0]
    outl_left[outl_left > ylim[1]] = ylim[1]
    outl_right[outl_right < ylim[0]] = ylim[0]
    outl_right[outl_right > ylim[1]] = ylim[1]
    if 'linewidth' not in plot_args:
        plot_args['linewidth'] = 0
    if 'c' not in plot_args:
        plot_args['c'] = 'r'
    if 's' not in plot_args:
        plot_args['s'] = 40
    plot_args['clip_on'] = False
    plot_args['label'] = (plot_args['label'] + ' (outliers)'
                          if 'label' in plot_args else '_')
    op_t = ax.scatter(outl_top, ylim[1] * np.ones_like(outl_top), **plot_args)
    plot_args['label'] = '_'
    op_b = ax.scatter(outl_bot, ylim[0] * np.ones_like(outl_bot), **plot_args)
    op_l = ax.scatter(
        xlim[0] * np.ones_like(outl_left), outl_left, **plot_args)
    op_r = ax.scatter(
        xlim[1] * np.ones_like(outl_right), outl_right, **plot_args)
    # Not typically necessary, but just ensure the outlier markers end
    # up at the edge of the plot
    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)
    return op_t, op_b, op_l, op_r
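

# Example usage (illustrative sketch with synthetic heavy-tailed data): plot
# the points inside fixed axis limits and mark everything outside them at
# the plot edges.
def _example_scatter_mark_outliers():
    xs = np.random.standard_cauchy(500)
    ys = np.random.standard_cauchy(500)
    xlim, ylim = (-5.0, 5.0), (-5.0, 5.0)
    inside = ((xs >= xlim[0]) & (xs <= xlim[1])
              & (ys >= ylim[0]) & (ys <= ylim[1]))
    fig, ax = pyplot.subplots()
    ax.scatter(xs[inside], ys[inside], s=10, label='data')
    scatter_mark_outliers(xs, ys, xlim, ylim, ax=ax, label='data')
    ax.legend()
    return fig, ax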


def scatter_outliers_size(data_thin, weights, xlim, ylim, ax=None, c_out='r',
                          s_out=40, **plot_args):
    """Make a scatterplot with outliers, encoding the given weights as size.

    NB This doesn't change the size of the outlier markers - it might be
    possible by modifying the original scatter_mark_outliers() function, but
    does it even make sense to encode the weights of outliers?

    Parameters:
        data_thin   Thinned data, e.g. from thin_points()
        weights     Weights of the thinned data, e.g. from thin_points()
                    (i.e. how many original points each thinned point
                    represents)
        xlim        Limits of x-axis, as a tuple (xmin, xmax)
        ylim        Limits of y-axis, as a tuple (ymin, ymax)
        ax          Existing axis to plot into, if any
        c_out       Colour to use to mark outliers, default 'r'
        s_out       Size for outlier markers, default 40 (px)

    Any remaining keyword arguments only modify the properties of the
    central (non-outlier) scatterplot.

    Returns the original plot as well as the four outlier scatterplots -
    (orig, top, bottom, left, right)
    """
    pl = scatter_thin_points(data_thin, weights, ax=ax, **plot_args)
    outps = scatter_mark_outliers(
        *data_thin.T, xlim=xlim, ylim=ylim, ax=ax, c=c_out, s=s_out)
    return (pl,) + outps
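

# Example usage (illustrative sketch with synthetic data): combine thinning
# with outlier marking - a dense Gaussian core plus a few far-flung points.
def _example_scatter_outliers_size():
    core = np.random.randn(5000, 2)
    tails = 20.0 * np.random.randn(20, 2)
    data = np.vstack([core, tails])
    data_thin, weights = thin_points(data, r=0.1)
    xlim, ylim = (-4.0, 4.0), (-4.0, 4.0)
    fig, ax = pyplot.subplots()
    scatter_outliers_size(data_thin, weights, xlim, ylim, ax=ax, s=2)
    return fig, ax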