-
Notifications
You must be signed in to change notification settings - Fork 0
/
metamodel.py
668 lines (515 loc) · 19.1 KB
/
metamodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
from __future__ import division, print_function
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
__all__ = ["MetaModelClassifier", "MetaModelRegressor"]
# TODO: 1. Test one-hot encoding with classifiers in MetaModelClassifier
# 2. Test MetaModelRegressor --> Very basic tests performed
class MetaModelClassifier(object):
    """
    Builds a two-level meta-model (stacked generalization) for classification problems.

    Level 1 models are trained with k-fold cross-validation to produce out-of-fold
    predicted probabilities (meta-features). The level 2 model is then trained on the
    original features augmented with those meta-features.

    Parameters
    ----------
    level1 : list
        List of instantiated classification models; each must implement .predict_proba()
    level2 : class
        Instantiated classification model used as the level 2 (meta) learner
    k : int (default = 5)
        Number of folds for cross-validation training to generate meta-features
    shuffle : boolean (default = False)
        Whether to shuffle the samples before cross-validation
    standardize : boolean (default = True)
        Whether to standardize inputs
    level1_needs_one_hot : list
        List of booleans indicating which level 1 models need one-hot-encoded labels
        (usually neural networks)
    level2_needs_one_hot : boolean
        Whether the level 2 model needs one-hot-encoded labels
    verbose : boolean (default = False)
        Whether to display an output summary during training

    Returns
    -------
    self : instance of MetaModelClassifier
    """
    def __init__(self, level1 = None, level2 = None, k = 5, shuffle = False, standardize = True,
                 level1_needs_one_hot = None, level2_needs_one_hot = None, verbose = False):
        # Accept either a single model or a list of models for level 1
        if not isinstance(level1, list):
            level1 = [level1]
        for model in level1:
            assert hasattr(model, 'predict_proba'), "%s does not have method .predict_proba()" % (model)
        self.level1 = level1
        self.level2 = level2
        self.k = k
        self.shuffle = shuffle
        self.standardize = standardize
        self.verbose = verbose

        # Number of level 1 models
        self.n_level1 = len(level1)

        # Check for models that need y transformed into a one-hot representation
        # (identity comparison with None, not ==)
        if level1_needs_one_hot is None:
            level1_needs_one_hot = self.n_level1 * [0]
        else:
            if not isinstance(level1_needs_one_hot, list):
                level1_needs_one_hot = [level1_needs_one_hot]
            assert len(level1_needs_one_hot) == self.n_level1, "level1_needs_one_hot length (%d) is not the same as the number of level 1 models (%d)" % (len(level1_needs_one_hot), self.n_level1)
        self.level1_needs_one_hot = level1_needs_one_hot

        if level2_needs_one_hot is None:
            level2_needs_one_hot = 0
        self.level2_needs_one_hot = level2_needs_one_hot

        # Flag indicating whether the models have been trained
        self.trained = False

    def _get_folds(self, X = None, y = None):
        """
        Gets the cross-validation splitter used to generate train/test indices.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from training data
        y : 1d array-like
            Array of labels from training data

        Returns
        -------
        fold_generator : cross-validation splitter
            Object whose .split(X, y) yields train/test index pairs
        """
        # Stratified folds preserve class proportions across splits
        return StratifiedKFold(n_splits = self.k, shuffle = self.shuffle)

    @staticmethod
    def _one_hot(y = None):
        """
        Create a one-hot encoded representation of y.

        Parameters
        ----------
        y : 1d array-like
            Array of labels from training data

        Returns
        -------
        y_onehot : 2d array-like
            Dense one-hot encoded form of y
        """
        # BUG FIX: the previous code returned enc.fit(y), which is the fitted
        # encoder object rather than the encoded labels. OneHotEncoder also
        # requires a 2d input, and fit_transform returns a sparse matrix, so
        # reshape to a column vector and densify the result.
        y = np.asarray(y).reshape(-1, 1)
        return OneHotEncoder().fit_transform(y).toarray()

    def _standardize_train(self, X = None):
        """
        Standardize training features to have mean 0 and variance 1.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from training data

        Returns
        -------
        X_norm : 2d array-like
            Standardized feature matrix of training data
        scaler : instance of preprocessing class
            Fitted transformation object that can be reused on testing data
        """
        scaler = StandardScaler().fit(X)
        return scaler.transform(X), scaler

    def _standardize_test(self, X = None, scaler = None):
        """
        Standardize testing features using a scaler fit on training data.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from testing data
        scaler : instance of preprocessing class
            Scaler previously fit on training data

        Returns
        -------
        X_norm : 2d array-like
            Standardized feature matrix of testing data
        """
        return scaler.transform(X)

    def fit(self, X = None, y = None):
        """
        Trains the two-level meta-model using k-fold cross-validation to get
        meta-features from the level 1 models.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from training data
        y : 1d array-like
            Array of labels from training data

        Returns
        -------
        None
            Trains the MetaModel so that the .predict() method is callable
        """
        if self.verbose:
            print('\nTraining: {0}-Fold CV to Generate Meta-Features from Level 1 Models...'.format(self.k))

        # Reshape y
        y = y.ravel()

        # Generate indices for cross-validation and preallocate data structures
        fold_generator = self._get_folds(X, y)           # CV splitter
        X_meta = []                                      # Out-of-fold meta-features
        level1_acc = np.zeros((self.n_level1, self.k))   # Level 1 accuracies across folds
        classes = np.unique(y)                           # Unique class labels
        fold = 0                                         # Fold counter

        ##########################################################################
        ####################### GENERATE META-FEATURES ###########################
        ##########################################################################
        # Cross-validation loop for level 1 models to generate meta-features
        for train_idx, test_idx in fold_generator.split(X, y):
            if self.verbose:
                print('\n\tFold:', fold + 1)

            # Temporary storage of meta-features for all level 1 models in this split
            meta = []
            for i in range(self.n_level1):  # BUG FIX: xrange is a NameError on Python 3
                # Standardize training and test data within the current CV split if specified
                if self.standardize:
                    X_train, scaler = self._standardize_train(X[train_idx])
                    X_test = self._standardize_test(X[test_idx], scaler)
                else:
                    X_train, X_test = X[train_idx], X[test_idx]

                # One-hot encode the training labels only if the ith model requires it
                if self.level1_needs_one_hot[i]:
                    y_train = self._one_hot(y[train_idx])
                else:
                    y_train = y[train_idx]

                # Train the ith level 1 model on this split, predict probabilities,
                # store them as meta-features, and record the fold accuracy
                self.level1[i].fit(X_train, y_train)
                y_probs = self.level1[i].predict_proba(X_test)
                meta.append(y_probs)
                # BUG FIX: score against the raw integer labels; the previous code
                # scored against the (possibly one-hot encoded) test labels, which
                # breaks the label comparison inside .score()
                level1_acc[i, fold] = self.score(y[test_idx], y_probs)

            # Horizontally concatenate meta-features for this split, store, advance fold
            meta = np.hstack(meta)
            X_meta.append(meta)
            fold += 1

        ##########################################################################
        ################## RETRAIN LEVEL 1 MODELS ON ALL DATA ####################
        ##########################################################################
        if self.verbose:
            print('\nTraining Level 1 Models on all data...')

        # Standardize if needed; keep the scaler for use at prediction time
        if self.standardize:
            X_all, self.scalers1 = self._standardize_train(X)
        else:
            X_all = X.copy()

        # Retrain level 1 models on all data
        for i in range(self.n_level1):
            if self.verbose:
                print('\n\tModel {0}'.format(i+1))
            if self.level1_needs_one_hot[i]:
                self.level1[i].fit(X_all, self._one_hot(y))
            else:
                self.level1[i].fit(X_all, y)

        ##########################################################################
        ######################### TRAIN LEVEL 2 MODEL ############################
        ##########################################################################
        # Stack out-of-fold meta-features and augment the original feature matrix
        X_meta = np.vstack(X_meta)
        X_aug = np.hstack((X, X_meta))

        # BUG FIX: previously y_all leaked out of the level 1 retraining loop, so the
        # level 2 model could receive one-hot labels even when level2_needs_one_hot
        # was false. Derive the level 2 targets from y explicitly here.
        if self.level2_needs_one_hot:
            y_all = self._one_hot(y)
        else:
            y_all = y

        if self.verbose:
            print('\nTraining Level 2 Model on all data...')
        self.level2.fit(X_aug, y_all)

        # Mark the ensemble as trained so that .predict() is callable
        self.trained = True

        if self.verbose:
            print('{:-^60}'.format(''))
            print('{:^60}'.format('META-MODEL SUMMARY'))
            print('{:-^60}\n'.format(''))
            standardize_str = 'True' if self.standardize else 'False'
            shuffle_str = 'True' if self.shuffle else 'False'
            print('{0:<25}{1:<25}'.format('Samples:', X.shape[0]))
            print('{0:<25}{1:<25}'.format('Classes:', len(classes)))
            print('{0:<25}{1:<25}'.format('Level 1 Models:', self.n_level1))
            print('{0:<25}{1:<25}'.format('Features:', X.shape[1]))
            print('{0:<25}{1:<25}'.format('Meta-Features:', X_meta.shape[1]))
            print('{0:<25}{1:<25}'.format('Folds:', self.k))
            print('{0:<25}{1:<25}'.format('Standardize:', standardize_str))
            print('{0:<25}{1:<25}'.format('Shuffle:', shuffle_str))
            print('\n -- Average Classification Accuracy : Level 1 Models --\n')
            for i in range(self.n_level1):
                # Average accuracy across folds
                print('\tModel', i+1, ':', np.mean(level1_acc[i, :]))

    def predict(self, X = None):
        """
        Predict labels from a set of test features.

        Parameters
        ----------
        X : 2d array-like numpy array
            Feature matrix from testing data

        Returns
        -------
        y_hat : 1d array-like
            Predicted labels from the meta-model
        """
        # Make sure models were trained first
        assert self.trained, "Error: Need to call .fit() method to train models before calling .predict() method"

        # Ensure that X has 2 dimensions
        X = np.atleast_2d(X)

        # Standardize testing data with the scaler saved during training, if specified
        if self.standardize:
            X = self._standardize_test(X, self.scalers1)

        # Collect meta-features (predicted probabilities) from every level 1 model
        X_meta = [self.level1[i].predict_proba(X) for i in range(self.n_level1)]

        # Concatenate meta-features and augment the feature matrix
        X_meta = np.hstack(X_meta)
        X_aug = np.hstack((X, X_meta))

        # Level 2 model makes the final prediction
        if self.verbose:
            print('\nTesting: Making Predictions For Level 2 Model...\n')
        return self.level2.predict(X_aug)

    @staticmethod
    def score(y_test = None, y_probs = None):
        """
        Calculate classification accuracy.

        Parameters
        ----------
        y_test : 1d array-like
            Array of ground truth labels
        y_probs : 2d array-like
            Array of predicted class probabilities

        Returns
        -------
        metric : float
            Classification accuracy (fraction of argmax predictions matching y_test)
        """
        y_predict = np.argmax(y_probs, axis = 1)
        return np.mean(y_test.ravel() == y_predict.ravel())
class MetaModelRegressor(object):
    """
    Builds a two-level meta-model (stacked generalization) for regression problems.

    Level 1 models are trained with k-fold cross-validation to produce out-of-fold
    predictions (meta-features). The level 2 model is then trained on the original
    features augmented with those meta-features.

    Parameters
    ----------
    level1 : list
        List of instantiated regression models; each must implement .predict()
    level2 : class
        Instantiated regression model used as the level 2 (meta) learner
    k : int (default = 5)
        Number of folds for cross-validation training to generate meta-features
    shuffle : boolean (default = False)
        Whether to shuffle the samples before cross-validation
    standardize : boolean (default = True)
        Whether to standardize inputs
    verbose : boolean (default = False)
        Whether to display an output summary during training

    Returns
    -------
    self : instance of MetaModelRegressor
    """
    def __init__(self, level1 = None, level2 = None, k = 5, shuffle = False, standardize = True,
                 verbose = False):
        # Accept either a single model or a list of models for level 1
        if not isinstance(level1, list):
            level1 = [level1]
        for model in level1:
            assert hasattr(model, 'predict'), "%s does not have method .predict()" % (model)
        self.level1 = level1
        self.level2 = level2
        self.k = k
        self.shuffle = shuffle
        self.standardize = standardize
        self.verbose = verbose

        # Number of level 1 models
        self.n_level1 = len(level1)

        # Flag indicating whether the models have been trained
        self.trained = False

    def _get_folds(self, X = None):
        """
        Gets the cross-validation splitter used to generate train/test indices.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from training data

        Returns
        -------
        fold_generator : cross-validation splitter
            Object whose .split(X) yields train/test index pairs
        """
        return KFold(n_splits = self.k, shuffle = self.shuffle)

    def _standardize_train(self, X = None):
        """
        Standardize training features to have mean 0 and variance 1.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from training data

        Returns
        -------
        X_norm : 2d array-like
            Standardized feature matrix of training data
        scaler : instance of preprocessing class
            Fitted transformation object that can be reused on testing data
        """
        scaler = StandardScaler().fit(X)
        return scaler.transform(X), scaler

    def _standardize_test(self, X = None, scaler = None):
        """
        Standardize testing features using a scaler fit on training data.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from testing data
        scaler : instance of preprocessing class
            Scaler previously fit on training data

        Returns
        -------
        X_norm : 2d array-like
            Standardized feature matrix of testing data
        """
        return scaler.transform(X)

    def fit(self, X = None, y = None):
        """
        Trains the two-level meta-model using k-fold cross-validation to get
        meta-features from the level 1 models.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix from training data
        y : 1d array-like
            Array of targets from training data

        Returns
        -------
        None
            Trains the MetaModel so that the .predict() method is callable
        """
        if self.verbose:
            print('\nTraining: {0}-Fold CV to Generate Meta-Features from Level 1 Models...'.format(self.k))

        # Reshape y
        y = y.ravel()

        # Generate indices for cross-validation and preallocate data structures
        fold_generator = self._get_folds(X)              # CV splitter
        X_meta = []                                      # Out-of-fold meta-features
        level1_mse = np.zeros((self.n_level1, self.k))   # Level 1 MSE across folds
        fold = 0                                         # Fold counter

        ##########################################################################
        ####################### GENERATE META-FEATURES ###########################
        ##########################################################################
        # Cross-validation loop for level 1 models to generate meta-features
        for train_idx, test_idx in fold_generator.split(X):
            if self.verbose:
                print('\n\tFold:', fold + 1)

            # Temporary storage of meta-features for all level 1 models in this split
            meta = []
            for i in range(self.n_level1):  # BUG FIX: xrange is a NameError on Python 3
                # Standardize training and test data within the current CV split if specified
                if self.standardize:
                    X_train, scaler = self._standardize_train(X[train_idx])
                    X_test = self._standardize_test(X[test_idx], scaler)
                else:
                    X_train, X_test = X[train_idx], X[test_idx]

                # Targets for the current CV split
                y_train, y_test = y[train_idx], y[test_idx]

                # Train the ith level 1 model on this split, predict, store the
                # predictions as a meta-feature column, and record the fold MSE
                self.level1[i].fit(X_train, y_train)
                y_hat = self.level1[i].predict(X_test)
                meta.append(y_hat.reshape(-1, 1))
                level1_mse[i, fold] = self.score(y_test, y_hat)

            # Horizontally concatenate meta-features for this split, store, advance fold
            meta = np.hstack(meta)
            X_meta.append(meta)
            fold += 1

        ##########################################################################
        ################## RETRAIN LEVEL 1 MODELS ON ALL DATA ####################
        ##########################################################################
        if self.verbose:
            print('\nTraining Level 1 Models on all data...')

        # Standardize if needed; keep the scaler for use at prediction time
        if self.standardize:
            X_all, self.scalers1 = self._standardize_train(X)
        else:
            X_all = X.copy()

        # Retrain level 1 models on all data
        for i in range(self.n_level1):
            if self.verbose:
                print('\n\tModel {0}'.format(i+1))
            self.level1[i].fit(X_all, y)

        ##########################################################################
        ######################### TRAIN LEVEL 2 MODEL ############################
        ##########################################################################
        # Stack out-of-fold meta-features and augment the original feature matrix
        X_meta = np.vstack(X_meta)
        X_aug = np.hstack((X, X_meta))

        if self.verbose:
            print('\nTraining Level 2 Model on all data...')
        self.level2.fit(X_aug, y)

        # Mark the ensemble as trained so that .predict() is callable
        self.trained = True

        if self.verbose:
            print('{:-^60}'.format(''))
            print('{:^60}'.format('META-MODEL SUMMARY'))
            print('{:-^60}\n'.format(''))
            standardize_str = 'True' if self.standardize else 'False'
            shuffle_str = 'True' if self.shuffle else 'False'
            print('{0:<25}{1:<25}'.format('Samples:', X.shape[0]))
            print('{0:<25}{1:<25}'.format('Level 1 Models:', self.n_level1))
            print('{0:<25}{1:<25}'.format('Features:', X.shape[1]))
            print('{0:<25}{1:<25}'.format('Meta-Features:', X_meta.shape[1]))
            print('{0:<25}{1:<25}'.format('Folds:', self.k))
            print('{0:<25}{1:<25}'.format('Standardize:', standardize_str))
            print('{0:<25}{1:<25}'.format('Shuffle:', shuffle_str))
            print('\n -- Average Mean Squared Error : Level 1 Models --\n')
            for i in range(self.n_level1):
                # Average MSE across folds
                print('\tModel', i+1, ':', np.mean(level1_mse[i, :]))

    def predict(self, X = None):
        """
        Predict targets from a set of test features.

        Parameters
        ----------
        X : 2d array-like numpy array
            Feature matrix from testing data

        Returns
        -------
        y_hat : 1d array-like
            Predicted targets from the meta-model
        """
        # Make sure models were trained first
        assert self.trained, "Error: Need to call .fit() method to train models before calling .predict() method"

        # Ensure that X has 2 dimensions
        X = np.atleast_2d(X)

        # Standardize testing data with the scaler saved during training, if specified
        if self.standardize:
            X = self._standardize_test(X, self.scalers1)

        # Collect meta-features (one prediction column per level 1 model)
        X_meta = [self.level1[i].predict(X).reshape(-1, 1) for i in range(self.n_level1)]

        # Concatenate meta-features and augment the feature matrix
        X_meta = np.hstack(X_meta)
        X_aug = np.hstack((X, X_meta))

        # Level 2 model makes the final prediction
        if self.verbose:
            print('\nTesting: Making Predictions For Level 2 Model...\n')
        return self.level2.predict(X_aug)

    @staticmethod
    def score(y_test = None, y_predict = None):
        """
        Calculate regression error using mean squared error.

        Parameters
        ----------
        y_test : 1d array-like
            Array of ground truth targets
        y_predict : 1d array-like
            Array of predicted targets

        Returns
        -------
        metric : float
            Mean squared error
        """
        # BUG FIX: the previous code computed np.mean(np.sum(diff)**2), which sums
        # the residuals BEFORE squaring — errors of opposite sign cancel out and the
        # outer mean is a no-op on a scalar. MSE is the mean of the squared residuals.
        diff = y_test.ravel() - y_predict.ravel()
        return np.mean(diff ** 2)