Skip to content

Commit

Permalink
add folding reweighter (code and test) for issue arogozhnikov#28
Browse files Browse the repository at this point in the history
  • Loading branch information
tata-antares committed Nov 9, 2016
1 parent 2ccfbb2 commit 3b9ae63
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 20 deletions.
103 changes: 102 additions & 1 deletion hep_ml/reweight.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,17 @@
from __future__ import division, print_function, absolute_import

from sklearn.base import BaseEstimator
from sklearn.cross_validation import KFold, check_random_state
from sklearn import clone

from scipy.ndimage import gaussian_filter
import numpy

from .commonutils import check_sample_weight, weighted_quantile
from . import gradientboosting as gb
from . import losses

__author__ = 'Alex Rogozhnikov'
__author__ = 'Alex Rogozhnikov, Tatiana Likhomanenko'
__all__ = ['BinsReweighter', 'GBReweighter']


Expand Down Expand Up @@ -251,3 +254,101 @@ def predict_weights(self, original, original_weight=None):
original, original_weight = self._normalize_input(original, original_weight)
multipliers = numpy.exp(self.gb.decision_function(original))
return multipliers * original_weight


class FoldingReweighter(BaseEstimator, ReweighterMixin):
def __init__(self, base_reweighter, n_folds=2, random_state=None):
"""
This meta-regressor implements folding algorithm over reweighter:
* training data is splitted into n equal parts;
* we train n reweighters, each one is trained using n-1 folds
To build unbiased predictions for data, pass the **same** dataset (with same order of events)
as in training to `predict_weights`, in which case
a reweighter will be used to predict each event that the reweighter didn't use it during training.
To use information from not one, but several reweighters during predictions,
provide appropriate voting function. Examples of voting function:
>>> voting = lambda x: numpy.mean(x, axis=0)
>>> voting = lambda x: numpy.median(x, axis=0)
:param base_reweighter: base reweighter object
:type base_reweighter: ReweighterMixin
:param n_folds: number of folds
:param random_state: random state for reproducibility
:type random_state: None or int or RandomState
"""
self.n_folds = n_folds
self.random_state = random_state
self.base_reweighter = base_reweighter
self.reweighters = []
self._random_number = None
self.train_length = None

def _get_folds_column(self, length):
"""
Return special column with indices of folds for all events.
"""
if self._random_number is None:
self._random_number = check_random_state(self.random_state).randint(0, 100000)
folds_column = numpy.zeros(length)
for fold_number, (_, folds_indices) in enumerate(
KFold(length, self.n_folds, shuffle=True, random_state=self._random_number)):
folds_column[folds_indices] = fold_number
return folds_column

def fit(self, original, target, original_weight, target_weight):
"""
Prepare reweighting formula by training sequence of trees.
:param original: values from original distribution, array-like of shape [n_samples, n_features]
:param target: values from target distribution, array-like of shape [n_samples, n_features]
:param original_weight: weights for samples of original distributions
:param target_weight: weights for samples of original distributions
:return: self
"""
folds_original = self._get_folds_column(len(original))
folds_target = self._get_folds_column(len(target))
for _ in range(self.n_folds):
self.reweighters.append(clone(self.base_reweighter))

original = numpy.array(original)
target = numpy.array(target)
for i in range(self.n_folds):
self.reweighters[i].fit(original[folds_original == i, :], target[folds_target == i, :],
original_weight[folds_original == i], target_weight[folds_target == i])
self.train_length = len(original)
return self

def predict_weights(self, original, original_weight=None, vote_function=None):
"""
Returns corrected weights. Result is computed as original_weight * reweighter_multipliers.
:param original: values from original distribution of shape [n_samples, n_features]
:param original_weight: weights of samples before reweighting.
:return: numpy.array of shape [n_samples] with new weights.
:param vote_function: if using averaging over predictions of folds, this function shall be passed.
For instance: lambda x: numpy.mean(x, axis=0), which means averaging result over all folds.
Another useful option is lambda x: numpy.median(x, axis=0)
"""
if vote_function is not None:
print('KFold prediction with voting function')
results = []
for reweighter in self.reweighters:
results.append(reweighter.predict_weights(original, original_weight=original_weight))
# results: [n_classifiers, n_samples], reduction over 0th axis
results = numpy.array(results)
return vote_function(results)
else:
if len(original) != self.train_length:
print('KFold prediction using random reweighter (length of data passed not equal to length of train)')
else:
print('KFold prediction using folds column')
folds_original = self._get_folds_column(len(original))
new_original_weight = numpy.zeros(len(original))
original = numpy.array(original)
for i in range(self.n_folds):
new_original_weight[folds_original == i] = self.reweighters[i].predict_weights(
original[folds_original == i, :], original_weight=original_weight[folds_original == i])
return new_original_weight
58 changes: 39 additions & 19 deletions tests/test_reweight.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy

from hep_ml.reweight import BinsReweighter, GBReweighter
from hep_ml.reweight import BinsReweighter, GBReweighter, FoldingReweighter
from hep_ml.metrics_utils import ks_2samp_weighted

__author__ = 'Alex Rogozhnikov'
Expand All @@ -16,7 +16,7 @@ def weighted_covar(data, weights):
return numpy.einsum('ij, ik, i -> jk', data, data, weights)


def check_reweighter(n_dimensions, n_samples, reweighter):
def check_reweighter(n_dimensions, n_samples, reweighter, folding=False):
mean_original = numpy.random.normal(size=n_dimensions)
cov_original = numpy.diag([1.] * n_dimensions)

Expand All @@ -30,26 +30,36 @@ def check_reweighter(n_dimensions, n_samples, reweighter):
target_weight = numpy.ones(n_samples)

reweighter.fit(original, target, original_weight=original_weight, target_weight=target_weight)
new_weights = reweighter.predict_weights(original, original_weight=original_weight)
new_weights_array = []
new_weights_array.append(reweighter.predict_weights(original, original_weight=original_weight))
if folding:
def mean_vote(x):
return numpy.mean(x, axis=0)

av_orig = numpy.average(original, weights=original_weight, axis=0)
print('WAS', av_orig)
av_now = numpy.average(original, weights=new_weights, axis=0)
print('NOW:', av_now)
av_ideal = numpy.average(target, weights=target_weight, axis=0)
print('IDEAL:', av_ideal)
new_weights_array.append(reweighter.predict_weights(original, original_weight=original_weight,
vote_function=mean_vote))

for new_weights in new_weights_array:
av_orig = numpy.average(original, weights=original_weight, axis=0)
print('WAS', av_orig)
av_now = numpy.average(original, weights=new_weights, axis=0)
print('NOW:', av_now)
av_ideal = numpy.average(target, weights=target_weight, axis=0)
print('IDEAL:', av_ideal)

print('COVARIATION')
print('WAS', weighted_covar(original, original_weight))
print('NOW', weighted_covar(original, new_weights))
print('IDEAL', weighted_covar(target, target_weight))

assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'deviation is wrong'
for dim in range(n_dimensions):
diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight)
diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight)
print('KS', diff1, diff2)
assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2)

print('COVARIATION')
print('WAS', weighted_covar(original, original_weight))
print('NOW', weighted_covar(original, new_weights))
print('IDEAL', weighted_covar(target, target_weight))

assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'deviation is wrong'
for dim in range(n_dimensions):
diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight)
diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight)
print('KS', diff1, diff2)
assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2)


def test_reweighter_1d():
Expand All @@ -74,3 +84,13 @@ def test_gb_reweighter_2d():
def test_gb_reweighter_2d_new():
reweighter = GBReweighter(max_depth=3, n_estimators=30, learning_rate=0.3, gb_args=dict(subsample=0.3))
check_reweighter(n_dimensions=2, n_samples=200000, reweighter=reweighter)


def test_folding_gb_reweighter():
reweighter = FoldingReweighter(GBReweighter(n_estimators=100, max_depth=2), n_folds=3)
check_reweighter(n_dimensions=2, n_samples=200000, reweighter=reweighter, folding=True)


def test_folding_bins_reweighter():
reweighter = FoldingReweighter(BinsReweighter(n_bins=20, n_neighs=2), n_folds=3)
check_reweighter(n_dimensions=2, n_samples=1000000, reweighter=reweighter, folding=True)

0 comments on commit 3b9ae63

Please sign in to comment.