From 3b9ae6395cf48a882123a99287864e190d79fe41 Mon Sep 17 00:00:00 2001 From: tata-antares Date: Wed, 9 Nov 2016 19:18:06 +0300 Subject: [PATCH] add folding reweighter (code and test) for issue https://github.com/arogozhnikov/hep_ml/issues/28 --- hep_ml/reweight.py | 103 ++++++++++++++++++++++++++++++++++++++++- tests/test_reweight.py | 58 +++++++++++++++-------- 2 files changed, 141 insertions(+), 20 deletions(-) diff --git a/hep_ml/reweight.py b/hep_ml/reweight.py index 1d4a257..8ad3dab 100644 --- a/hep_ml/reweight.py +++ b/hep_ml/reweight.py @@ -40,6 +40,9 @@ from __future__ import division, print_function, absolute_import from sklearn.base import BaseEstimator +from sklearn.cross_validation import KFold, check_random_state +from sklearn import clone + from scipy.ndimage import gaussian_filter import numpy @@ -47,7 +50,7 @@ from . import gradientboosting as gb from . import losses -__author__ = 'Alex Rogozhnikov' +__author__ = 'Alex Rogozhnikov, Tatiana Likhomanenko' __all__ = ['BinsReweighter', 'GBReweighter'] @@ -251,3 +254,101 @@ def predict_weights(self, original, original_weight=None): original, original_weight = self._normalize_input(original, original_weight) multipliers = numpy.exp(self.gb.decision_function(original)) return multipliers * original_weight + + +class FoldingReweighter(BaseEstimator, ReweighterMixin): + def __init__(self, base_reweighter, n_folds=2, random_state=None): + """ + This meta-regressor implements folding algorithm over reweighter: + + * training data is splitted into n equal parts; + + * we train n reweighters, each one is trained using n-1 folds + + To build unbiased predictions for data, pass the **same** dataset (with same order of events) + as in training to `predict_weights`, in which case + a reweighter will be used to predict each event that the reweighter didn't use it during training. + To use information from not one, but several reweighters during predictions, + provide appropriate voting function. Examples of voting function: + >>> voting = lambda x: numpy.mean(x, axis=0) + >>> voting = lambda x: numpy.median(x, axis=0) + + :param base_reweighter: base reweighter object + :type base_reweighter: ReweighterMixin + :param n_folds: number of folds + :param random_state: random state for reproducibility + :type random_state: None or int or RandomState + """ + self.n_folds = n_folds + self.random_state = random_state + self.base_reweighter = base_reweighter + self.reweighters = [] + self._random_number = None + self.train_length = None + + def _get_folds_column(self, length): + """ + Return special column with indices of folds for all events. + """ + if self._random_number is None: + self._random_number = check_random_state(self.random_state).randint(0, 100000) + folds_column = numpy.zeros(length) + for fold_number, (_, folds_indices) in enumerate( + KFold(length, self.n_folds, shuffle=True, random_state=self._random_number)): + folds_column[folds_indices] = fold_number + return folds_column + + def fit(self, original, target, original_weight, target_weight): + """ + Prepare reweighting formula by training sequence of trees. + + :param original: values from original distribution, array-like of shape [n_samples, n_features] + :param target: values from target distribution, array-like of shape [n_samples, n_features] + :param original_weight: weights for samples of original distributions + :param target_weight: weights for samples of original distributions + :return: self + """ + folds_original = self._get_folds_column(len(original)) + folds_target = self._get_folds_column(len(target)) + for _ in range(self.n_folds): + self.reweighters.append(clone(self.base_reweighter)) + + original = numpy.array(original) + target = numpy.array(target) + for i in range(self.n_folds): + self.reweighters[i].fit(original[folds_original == i, :], target[folds_target == i, :], + original_weight[folds_original == i], target_weight[folds_target == i]) + self.train_length = len(original) + return self + + def predict_weights(self, original, original_weight=None, vote_function=None): + """ + Returns corrected weights. Result is computed as original_weight * reweighter_multipliers. + + :param original: values from original distribution of shape [n_samples, n_features] + :param original_weight: weights of samples before reweighting. + :return: numpy.array of shape [n_samples] with new weights. + :param vote_function: if using averaging over predictions of folds, this function shall be passed. + For instance: lambda x: numpy.mean(x, axis=0), which means averaging result over all folds. + Another useful option is lambda x: numpy.median(x, axis=0) + """ + if vote_function is not None: + print('KFold prediction with voting function') + results = [] + for reweighter in self.reweighters: + results.append(reweighter.predict_weights(original, original_weight=original_weight)) + # results: [n_classifiers, n_samples], reduction over 0th axis + results = numpy.array(results) + return vote_function(results) + else: + if len(original) != self.train_length: + print('KFold prediction using random reweighter (length of data passed not equal to length of train)') + else: + print('KFold prediction using folds column') + folds_original = self._get_folds_column(len(original)) + new_original_weight = numpy.zeros(len(original)) + original = numpy.array(original) + for i in range(self.n_folds): + new_original_weight[folds_original == i] = self.reweighters[i].predict_weights( + original[folds_original == i, :], original_weight=original_weight[folds_original == i]) + return new_original_weight diff --git a/tests/test_reweight.py b/tests/test_reweight.py index 576e12a..1054c5f 100644 --- a/tests/test_reweight.py +++ b/tests/test_reweight.py @@ -2,7 +2,7 @@ import numpy -from hep_ml.reweight import BinsReweighter, GBReweighter +from hep_ml.reweight import BinsReweighter, GBReweighter, FoldingReweighter from hep_ml.metrics_utils import ks_2samp_weighted __author__ = 'Alex Rogozhnikov' @@ -16,7 +16,7 @@ def weighted_covar(data, weights): return numpy.einsum('ij, ik, i -> jk', data, data, weights) -def check_reweighter(n_dimensions, n_samples, reweighter): +def check_reweighter(n_dimensions, n_samples, reweighter, folding=False): mean_original = numpy.random.normal(size=n_dimensions) cov_original = numpy.diag([1.] * n_dimensions) @@ -30,26 +30,36 @@ def check_reweighter(n_dimensions, n_samples, reweighter): target_weight = numpy.ones(n_samples) reweighter.fit(original, target, original_weight=original_weight, target_weight=target_weight) - new_weights = reweighter.predict_weights(original, original_weight=original_weight) + new_weights_array = [] + new_weights_array.append(reweighter.predict_weights(original, original_weight=original_weight)) + if folding: + def mean_vote(x): + return numpy.mean(x, axis=0) - av_orig = numpy.average(original, weights=original_weight, axis=0) - print('WAS', av_orig) - av_now = numpy.average(original, weights=new_weights, axis=0) - print('NOW:', av_now) - av_ideal = numpy.average(target, weights=target_weight, axis=0) - print('IDEAL:', av_ideal) + new_weights_array.append(reweighter.predict_weights(original, original_weight=original_weight, + vote_function=mean_vote)) + + for new_weights in new_weights_array: + av_orig = numpy.average(original, weights=original_weight, axis=0) + print('WAS', av_orig) + av_now = numpy.average(original, weights=new_weights, axis=0) + print('NOW:', av_now) + av_ideal = numpy.average(target, weights=target_weight, axis=0) + print('IDEAL:', av_ideal) + + print('COVARIATION') + print('WAS', weighted_covar(original, original_weight)) + print('NOW', weighted_covar(original, new_weights)) + print('IDEAL', weighted_covar(target, target_weight)) + + assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'deviation is wrong' + for dim in range(n_dimensions): + diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight) + diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight) + print('KS', diff1, diff2) + assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2) - print('COVARIATION') - print('WAS', weighted_covar(original, original_weight)) - print('NOW', weighted_covar(original, new_weights)) - print('IDEAL', weighted_covar(target, target_weight)) - assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'deviation is wrong' - for dim in range(n_dimensions): - diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight) - diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight) - print('KS', diff1, diff2) - assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2) def test_reweighter_1d(): @@ -74,3 +84,13 @@ def test_gb_reweighter_2d(): def test_gb_reweighter_2d_new(): reweighter = GBReweighter(max_depth=3, n_estimators=30, learning_rate=0.3, gb_args=dict(subsample=0.3)) check_reweighter(n_dimensions=2, n_samples=200000, reweighter=reweighter) + + +def test_folding_gb_reweighter(): + reweighter = FoldingReweighter(GBReweighter(n_estimators=100, max_depth=2), n_folds=3) + check_reweighter(n_dimensions=2, n_samples=200000, reweighter=reweighter, folding=True) + + +def test_folding_bins_reweighter(): + reweighter = FoldingReweighter(BinsReweighter(n_bins=20, n_neighs=2), n_folds=3) + check_reweighter(n_dimensions=2, n_samples=1000000, reweighter=reweighter, folding=True)