-
Notifications
You must be signed in to change notification settings - Fork 5
/
util.py
433 lines (349 loc) · 15.5 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
from __future__ import division
import datetime
__author__ = 'James Robert Lloyd'
__description__ = 'Miscellaneous utility functions'
import os
import shutil
from glob import glob
import tempfile
import string
import random
import matplotlib.pyplot as pl
import numpy as np
import scipy.io
from sklearn.cross_validation import KFold
import psutil
from constants import SAVE_DIR
import signal
from automl_lib.data_manager import DataManager
from agent import TerminationEx
import time
def move_make(outdir):
    """Guarantee that `outdir` exists and is empty.

    A missing directory is simply created.  A directory that already holds
    something is renamed to outdir + '_' + the current date and time, and a
    fresh directory is made in its place.  An existing empty directory is left
    as it is.
    """
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
        return
    if not os.listdir(outdir):
        # Already empty - nothing to archive
        return
    stamp = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    archived = outdir + '_' + stamp
    os.rename(outdir, archived)
    os.makedirs(outdir)
def move_make_file(flnm):
    """Make sure an empty file exists at `flnm`.

    If something already exists at `flnm` it is first renamed to
    flnm + '_' + the current date and time.  Otherwise the parent directory is
    created when it is missing.  Finally the file is touched (opened in append
    mode and closed).
    """
    outdir = os.path.dirname(flnm)
    if os.path.exists(flnm):
        the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
        os.rename(flnm, flnm + '_' + the_date)
    elif outdir and not os.path.isdir(outdir):
        # `outdir` is '' for a bare file name (file in the current directory);
        # os.makedirs('') would raise, so only create real parent directories
        os.makedirs(outdir)
    open(flnm, 'a').close()
# def init_temp_dir():
# if os.path.isdir(TEMP_DIR):
# shutil.rmtree(TEMP_DIR)
# os.mkdir(TEMP_DIR)
def signal_handler(signum, frame):
    """Handler for the signals used to pause, resume and terminate a process.

    SIGTSTP - sleep in signal.pause() until another handled signal arrives
    SIGCONT - do nothing; the handler only has to exist
    SIGTERM - raise TerminationEx
    Any other signal number is ignored.  `frame` is not used.
    """
    # Compare against the named constants rather than literal numbers: the
    # numeric values differ between platforms.  getattr keeps the handler
    # usable where the job-control signals are not defined at all.
    if signum == getattr(signal, 'SIGTSTP', None):
        # signal.pause puts *only* the calling thread to sleep, rather than stopping all threads
        signal.pause()
    elif signum == getattr(signal, 'SIGCONT', None):
        # as long as there exists a handler for sigcont then signal.pause will return
        pass
    elif signum == signal.SIGTERM:
        raise TerminationEx
def mkstemp_safe(directory, suffix):
    """Create a temporary file inside `directory` and return only its path.

    tempfile.mkstemp also hands back an open OS-level descriptor; it is closed
    straight away so that it cannot leak.
    """
    descriptor, path = tempfile.mkstemp(suffix=suffix, dir=directory)
    os.close(descriptor)
    return path
def callback_1d(model, bounds, info, x, index, ftrue):
    """
    Redraw figure 1 with three panels: the model posterior (with the latest
    selection and the current recommendation), the acquisition function, and
    the value of the recommendation over the iterations.
    """
    xmin, xmax = bounds[0]
    grid = np.linspace(xmin, xmax, 500)                     # 500 point evaluation grid
    grid_column = grid[:, None]                             # same grid as a column

    acquisition = index(grid_column)                        # acquisition on the grid
    post_mean, post_var = model.posterior(grid_column)      # presumably mean and variance - confirm against model
    half_width = 2 * np.sqrt(post_var)
    lower = post_mean - half_width
    upper = post_mean + half_width

    # Plotting range taken from the band only
    # FIXME - remember observed function values
    ymin, ymax = lower.min(), upper.max()
    ymin = ymin - 0.2 * (ymax - ymin)
    ymax = ymax + 0.2 * (ymax - ymin)                       # NB uses the already lowered ymin

    line_style = {'lw': 2, 'alpha': 0.5}                    # shared by every line drawn below

    figure = pl.figure(1)
    figure.clf()

    # Top left - posterior
    pl.subplot(221)
    pl.plot(grid_column, post_mean, 'b-', **line_style)
    pl.fill_between(grid, lower, upper, color='b', alpha=0.1)
    pl.scatter(info['x'], info['y'], marker='o', facecolor='none', zorder=3)
    pl.axvline(x, color='r', **line_style)                  # latest selection
    pl.axvline(info[-1]['xbest'], color='g', **line_style)  # current recommendation
    pl.axis((xmin, xmax, ymin, ymax))
    pl.ylabel('posterior')

    # Bottom left - acquisition
    pl.subplot(223)
    pl.fill_between(grid, acquisition.min(), acquisition, color='r', alpha=0.1)
    pl.axis('tight')
    pl.axvline(x, color='r', **line_style)                  # latest selection
    pl.xlabel('input')
    pl.ylabel('acquisition')

    # Top right - true function at each recommendation so far
    pl.subplot(222)
    pl.plot(ftrue(info['xbest']), 'g', **line_style)
    pl.axis((0, len(info['xbest']), ymin, ymax))
    pl.xlabel('iterations')
    pl.ylabel('value of recommendation')

    pl.draw()
    pl.show(block=False)
def mkdir(d):
    """Create directory `d`, including missing parents, unless the path already exists.

    Nothing happens (and nothing is raised) when `d` is already present.
    """
    try:
        os.makedirs(d)
    except OSError:
        # Checking for existence before creating is racy when several processes
        # create the same folder; instead attempt the creation and only treat
        # a path that is still missing afterwards as a real failure
        if not os.path.exists(d):
            raise
def ls(filename):
    """Return the paths matching the glob pattern `filename` as a sorted list."""
    matches = glob(filename)
    matches.sort()
    return matches
def colorbrew(i):
    """Nice colors taken from http://colorbrewer2.org/ by David Duvenaud March 2012

    Returns the colour for index `i` as an (r, g, b) tuple of floats in the
    [0, 1] range.  The index wraps round the ten colour table.
    """
    # Plain decimal literals: a leading zero (055, 077, ...) is read as octal by
    # Python 2, which silently changes the colour, and is a syntax error in Python 3
    rgbs = ((228, 26, 28),
            (55, 126, 184),
            (77, 175, 74),
            (152, 78, 163),
            (255, 127, 0),
            (255, 255, 51),
            (166, 86, 40),
            (247, 129, 191),
            (153, 153, 153),
            (0, 0, 0))
    # Pick the colour corresponding to index - wrapping round
    red, green, blue = rgbs[i % len(rgbs)]
    # Convert to [0, 1] range; 255.0 keeps this a true division on its own
    return (red / 255.0, green / 255.0, blue / 255.0)
def convert_automl_into_automl_folds(folder, save_folder_root, n_folds=5,
                                     random_state=0, usage='testing'):
    """Convert a dataset in automl format into several folds of automl format

    The training data of the dataset in `folder` is split with a shuffled
    KFold.  Fold number k (counting from 1) is written to
    save_folder_root + '_fold_%02d' % k / <dataset name>, holding the
    train/test data, the train/test solutions, a '_public.info' file and a
    copy of the original '_feat.type' file.

    When `usage` is not None it overrides the 'usage' entry of the info.
    """
    # Load data
    input_dir, basename = os.path.split(folder)
    D = DataManager(basename, input_dir, replace_missing=True, filter_features=True)
    X = D.data['X_train']
    y = D.data['Y_train']
    info = D.info
    if usage is not None:
        info['usage'] = usage
    # Now split into folds and save
    folds = KFold(n=X.shape[0], n_folds=n_folds, shuffle=True, random_state=random_state)
    for (fold, (train_index, test_index)) in enumerate(folds):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_folder = os.path.join(save_folder_root + '_fold_%02d' % (fold + 1), basename)
        mkdir(fold_folder)

        def fold_path(suffix):
            # Every file of a fold is named <dataset name> + suffix
            return os.path.join(fold_folder, basename + suffix)

        fmt = '%f'
        np.savetxt(fold_path('_train.data'), X_train, fmt=fmt, delimiter=' ')
        np.savetxt(fold_path('_test.data'), X_test, fmt=fmt, delimiter=' ')
        # Binary targets are written as integers
        if info['task'] == 'binary.classification':
            fmt = '%d'
        np.savetxt(fold_path('_train.solution'), y_train, fmt=fmt, delimiter=' ')
        np.savetxt(fold_path('_test.solution'), y_test, fmt=fmt, delimiter=' ')
        info['train_num'] = X_train.shape[0]
        info['test_num'] = X_test.shape[0]
        with open(fold_path('_public.info'), 'w') as info_file:
            # items() rather than iteritems() so this runs on Python 2 and 3
            for (key, value) in info.items():
                info_file.write('%s = %s\n' % (key, value))
        shutil.copy(os.path.join(folder, basename + '_feat.type'), fold_path('_feat.type'))
def convert_automl_into_automl_folds_folder(input_folder, save_folder_root, *args, **kwargs):
    """Split every dataset folder found directly inside `input_folder` into folds.

    Entries are visited in sorted name order; anything that is not a directory
    is skipped.  Extra arguments are passed on to convert_automl_into_automl_folds.
    """
    for entry in sorted(os.listdir(input_folder)):
        dataset_folder = os.path.join(input_folder, entry)
        if not os.path.isdir(dataset_folder):
            continue
        print('Processing ' + entry)
        convert_automl_into_automl_folds(dataset_folder, save_folder_root, *args, **kwargs)
def convert_mat_into_automl_folds(filename, save_folder_root, time_budget=300, n_folds=5, input_type='Numerical',
                                  random_state=0, metric='auc_metric', usage='testing', task='binary.classification',
                                  target_type='Binary'):
    """Convert a dataset in .mat format into several folds of automl format

    The .mat file must provide the arrays 'X' and 'y'; the dataset name is the
    file name without its extension.  The rows are split with a shuffled KFold
    and fold number k (counting from 1) is written to
    save_folder_root + '_fold_%02d' % k / <dataset name>, holding the
    train/test data, the train/test solutions, a '_public.info' file and a
    '_feat.type' file with `input_type` repeated once per column of X.

    For task == 'binary.classification' the targets are recoded so that the
    largest value becomes 1 and every smaller value becomes 0.

    Raises Exception when input_type == 'infer' (not supported yet).
    """
    # Load data
    data = scipy.io.loadmat(filename)
    X = data['X']
    y = data['y']
    data_name = os.path.splitext(os.path.split(filename)[-1])[0]
    # Convert data if appropriate
    if task == 'binary.classification':
        y_max = y.max()
        # Work out both groups BEFORE overwriting anything: writing the 1s first
        # and then testing `y < y_max` would also match the fresh 1s whenever
        # y_max > 1 (e.g. labels 1 and 2) and turn every target into 0
        is_largest = y == y_max
        is_smaller = y < y_max
        y[is_largest] = 1
        y[is_smaller] = 0
    # If input_type is 'infer' we now infer input types
    if input_type == 'infer':
        raise Exception('I do not know how to infer input types yet')
    input_type_list = [input_type] * X.shape[1]
    # Create info dictionary
    # TODO - some of these defaults need to be changed
    info = dict(usage=usage, name=data_name, task=task, target_type=target_type,
                feat_type='Numerical', metric=metric, feat_num=X.shape[1],
                target_num=1, label_num=0, has_categorical=0, has_missing=0, is_sparse=0,
                time_budget=time_budget, valid_num=0)
    # Now split into folds and save
    folds = KFold(n=X.shape[0], n_folds=n_folds, shuffle=True, random_state=random_state)
    for (fold, (train_index, test_index)) in enumerate(folds):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_folder = os.path.join(save_folder_root + '_fold_%02d' % (fold + 1), data_name)
        mkdir(fold_folder)

        def fold_path(suffix):
            # Every file of a fold is named <dataset name> + suffix
            return os.path.join(fold_folder, data_name + suffix)

        fmt = '%f'
        np.savetxt(fold_path('_train.data'), X_train, fmt=fmt, delimiter=' ')
        np.savetxt(fold_path('_test.data'), X_test, fmt=fmt, delimiter=' ')
        # Binary targets are written as integers
        if task == 'binary.classification':
            fmt = '%d'
        np.savetxt(fold_path('_train.solution'), y_train, fmt=fmt, delimiter=' ')
        np.savetxt(fold_path('_test.solution'), y_test, fmt=fmt, delimiter=' ')
        info['train_num'] = X_train.shape[0]
        info['test_num'] = X_test.shape[0]
        with open(fold_path('_public.info'), 'w') as info_file:
            # items() rather than iteritems() so this runs on Python 2 and 3
            for (key, value) in info.items():
                info_file.write('%s = %s\n' % (key, value))
        with open(fold_path('_feat.type'), 'w') as feature_file:
            for feat_type in input_type_list:
                feature_file.write('%s\n' % feat_type)
def convert_mat_into_automl_folds_folder(mat_folder, save_folder_root, *args, **kwargs):
    """Split every '.mat' file found directly inside `mat_folder` into folds.

    Files are visited in sorted name order; names not ending in '.mat' are
    skipped.  Extra arguments are passed on to convert_mat_into_automl_folds.
    """
    for entry in sorted(os.listdir(mat_folder)):
        if not entry.endswith('.mat'):
            continue
        print('Processing ' + entry)
        mat_path = os.path.join(mat_folder, entry)
        convert_mat_into_automl_folds(mat_path, save_folder_root, *args, **kwargs)
def create_synthetic_classification_problems(mat_folder, save_folder_root, synth_kwargs_list):
    """Placeholder - not implemented.  Accepts its arguments and does nothing."""
    return None
def VmB(pid, VmKey):
    """Read one memory figure of process `pid` from /proc/<pid>/status, in bytes.

    `VmKey` is the label to look for, colon included (e.g. 'VmRSS:').

    Returns -1 when the status file cannot be read (non-Linux system, or no
    such process) and 0.0 when the entry is missing or cannot be parsed.
    """
    scale = {'kB': 1024.0, 'mB': 1024.0 * 1024.0,
             'KB': 1024.0, 'MB': 1024.0 * 1024.0}
    status_path = '/proc/%d/status' % pid
    try:
        # `with` closes the file even when the read itself fails
        with open(status_path) as status_file:
            status = status_file.read()
    except (IOError, OSError):
        return -1  # non-Linux?
    # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
    start = status.find(VmKey)
    if start < 0:
        return 0.0  # no such entry for this process
    fields = status[start:].split(None, 3)  # whitespace
    if len(fields) < 3:
        return 0.0  # invalid format?
    # convert Vm value to bytes
    try:
        return float(fields[1]) * scale[fields[2]]
    except (ValueError, KeyError):
        return 0.0  # value is not a number, or unit is not a known one
def memory_usage(pid, since=0.0):
    """Return the 'VmSize:' figure of process `pid` in bytes, minus `since`."""
    current = VmB(pid, 'VmSize:')
    return current - since
def resident_memory_usage(pid, since=0.0):
    """Return the 'VmRSS:' (resident) figure of process `pid` in bytes, minus `since`."""
    current = VmB(pid, 'VmRSS:')
    return current - since
def random_string(N=20):
    """Return a string of N characters drawn from A-Z, a-z and 0-9 with random.SystemRandom."""
    alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    picker = random.SystemRandom()
    characters = [picker.choice(alphabet) for _ in range(N)]
    return ''.join(characters)
def random_temp_file_name(suffix='.file', dirnm=SAVE_DIR, N=20):
    """Return a path inside `dirnm` made of N random characters followed by `suffix`.

    Only the name is built - no file is created.
    """
    base_name = random_string(N=N)
    return os.path.join(dirnm, base_name + suffix)
def pack_list(a_list):
    """Return a new list with the elements of `a_list` that are not None, order kept."""
    kept = []
    for item in a_list:
        if item is None:
            continue
        kept.append(item)
    return kept
def merge_dicts(*dict_args):
    """
    Build one new dict out of any number of dicts (shallow copy).
    When a key occurs more than once the value from the later dict wins.
    The arguments themselves are not modified.
    """
    merged = dict()
    for mapping in dict_args:
        # update() overwrites earlier values, which gives later dicts precedence
        merged.update(mapping)
    return merged
def dict_of_lists_to_arrays(a_dict):
    """Replace every value of `a_dict` by np.array(value) and return the dict.

    The conversion happens in place: the returned object is `a_dict` itself.
    """
    # Loop over a snapshot of the keys: this avoids the Python 2 only
    # iteritems() and does not rebind values while walking a live view
    for key in list(a_dict):
        a_dict[key] = np.array(a_dict[key])
    return a_dict
def identity(i):
    """Return the argument unchanged."""
    unchanged = i
    return unchanged
def classes_to_1_of_k(classes, k):
    """Return a (classes.shape[0], k) array of zeros with a 1 at [i, classes[i]] for every row i."""
    n_rows = classes.shape[0]
    encoded = np.zeros((n_rows, k))
    # Iterating a 2D array yields views of its rows, so writing to `row`
    # writes into `encoded`
    for row, label in zip(encoded, classes):
        row[label] = 1
    return encoded
def append_1_of_k_encoded_data(D):
    """Add 1 of k encodings of the targets to a data manager object, for scoring purposes.

    For each of 'Y_train', 'Y_valid' and 'Y_test' present in D.data, the
    encoding is stored next to it under the same key + '_1_of_k'.  The number
    of columns is D.info['label_num'].
    """
    n_labels = D.info['label_num']
    key_pairs = (('Y_train', 'Y_train_1_of_k'),
                 ('Y_valid', 'Y_valid_1_of_k'),
                 ('Y_test', 'Y_test_1_of_k'))
    for source_key, encoded_key in key_pairs:
        if source_key not in D.data:
            continue
        D.data[encoded_key] = classes_to_1_of_k(classes=D.data[source_key], k=n_labels)
def load_1_of_k_data(folder, basename, D):
    """Load the solution files of a dataset into D.data as int8 arrays.

    For each of 'Y_train', 'Y_valid' and 'Y_test' present in D.data, the file
    folder/basename/basename + '_<split>.solution' is read with np.loadtxt and
    stored under the same key + '_1_of_k'.
    """
    dataset_folder = os.path.join(folder, basename)
    splits = (('Y_train', 'Y_train_1_of_k', '_train.solution'),
              ('Y_valid', 'Y_valid_1_of_k', '_valid.solution'),
              ('Y_test', 'Y_test_1_of_k', '_test.solution'))
    for source_key, encoded_key, file_suffix in splits:
        if source_key not in D.data:
            continue
        solution_path = os.path.join(dataset_folder, basename + file_suffix)
        D.data[encoded_key] = np.loadtxt(solution_path, dtype=np.int8)
def murder_family(pid=None, pgid=None, killall=False, sig=signal.SIGTERM):
    """Send signal to all processes in the process group pgid, apart from pid.
    If killall is true, signal to pid as well.

    pid defaults to the current process and pgid to the process group of pid.
    Processes that have gone away or may not be accessed are skipped silently.

    SIGTERM - die nicely
    SIGKILL - die right now
    SIGSTOP - uncatchable stop signal - might break queues
    SIGTSTP - makes main thread sleep (with signal.pause()) until sigcont received
    SIGCONT - start anything that is paused or stopped"""
    if pid is None:
        pid = os.getpid()  # pid of current process
    if pgid is None:
        pgid = os.getpgid(pid)  # process group of pid
    # OSError is what os.getpgid raises for a process that has just exited
    skippable = (psutil.NoSuchProcess, psutil.AccessDenied, IOError, OSError)
    for ps in psutil.process_iter():
        if ps.pid == pid:
            continue
        try:
            # The group lookup belongs inside the try: a process listed by
            # process_iter may exit before it is inspected, and that must not
            # abort the sweep over the remaining processes
            if os.getpgid(ps.pid) == pgid:
                ps.send_signal(sig)
        except skippable:
            pass
    if killall is True:
        try:
            ps = psutil.Process(pid=pid)
            ps.send_signal(sig)
        except skippable:
            pass
def ensure_2d(an_array):
    """Return `an_array` as an array with at least two dimensions.

    When the first dimension of the result has length 1 the array is
    transposed, so that flat input comes back as a column.
    """
    as_2d = np.array(an_array, ndmin=2)
    n_leading = as_2d.shape[0]
    return as_2d.T if n_leading == 1 else as_2d
def csv_file_size(file_name, delimiter=' '):
    """Return (n_row, n_col) of a delimited text file without loading it.

    n_row is the number of lines.  n_col is the number of non-empty fields on
    the first line after splitting it on `delimiter`; it is 0 for an empty
    file or a blank first line.
    """
    import sys  # local import so the function does not rely on a file level import

    # 'U' asks Python 2 for universal newlines; Python 3 text mode applies
    # them by default and rejects the flag from version 3.11 on
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    n_row = 0
    n_col = 0
    # One pass: peek at the first line, then keep counting
    with open(file_name, mode) as data_file:
        for line in data_file:
            if n_row == 0:
                # Drop the line ending first - after a trailing delimiter it
                # would otherwise be counted as one more (non-empty) field
                fields = line.rstrip('\r\n').split(delimiter)
                # Strip elements of any empty values
                n_col = len([field for field in fields if field != ''])
            n_row += 1
    return n_row, n_col
class NotAnArray(object):
    """Stand-in for an array kept on disk: remembers the file and its size only"""

    def __init__(self, file_name, delimiter=' '):
        # Measure the file without loading it
        n_row, n_col = csv_file_size(file_name=file_name, delimiter=delimiter)
        self.n_row = n_row
        self.n_col = n_col
        # Where the data lives
        self.file_name = file_name

    @property
    def shape(self):
        """The (n_row, n_col) pair recorded at construction"""
        return (self.n_row, self.n_col)
def is_sorted(a_list):
    """Return True when every element of `a_list` is <= the element after it.

    Empty and single element input counts as sorted.  Any iterable is
    accepted, not only sequences supporting len() and indexing.
    """
    # Walk neighbouring pairs with a plain iterator: no xrange (Python 2 only)
    # and no indexing
    items = iter(a_list)
    try:
        previous = next(items)
    except StopIteration:
        return True
    for current in items:
        if not previous <= current:
            return False
        previous = current
    return True
def waste_cpu_time(a_time):
    """Spin in a busy loop until `a_time` seconds have passed on the CPU clock.

    Returns None.  Nothing useful is computed - the point is to burn time.
    """
    # time.clock was removed in Python 3.8; time.process_time is used where it
    # exists, with time.clock as the fallback for Python 2
    cpu_clock = getattr(time, 'process_time', None) or time.clock
    deadline = cpu_clock() + a_time
    while cpu_clock() < deadline:
        pass
if __name__ == '__main__':
    # Run as a script: split every dataset folder of phase 1 into folds
    convert_automl_into_automl_folds_folder(input_folder='../data/phase_1',
                                            save_folder_root='../data/phase_1_cv')