diff --git a/docs/source/autodocs/sklearn.rst b/docs/source/autodocs/sklearn.rst
index 57b49af1..71ed24bc 100644
--- a/docs/source/autodocs/sklearn.rst
+++ b/docs/source/autodocs/sklearn.rst
@@ -20,3 +20,10 @@ eli5.sklearn.unhashing
 
 .. automodule:: eli5.sklearn.unhashing
     :members:
+
+
+eli5.sklearn.transform
+----------------------
+
+.. automodule:: eli5.sklearn.transform
+    :members:
diff --git a/eli5/sklearn/transform.py b/eli5/sklearn/transform.py
index 8afd7c72..e82fbabe 100644
--- a/eli5/sklearn/transform.py
+++ b/eli5/sklearn/transform.py
@@ -1,18 +1,191 @@
 """transform_feature_names implementations for scikit-learn transformers
+
+These are automatically registered for many scikit-learn transformers, but can
+be overridden by, for example, registering ``make_tfn_weighted`` with
+non-default options for a decomposition transformer (such as PCA).
 """
+import itertools
+import operator
+
 import numpy as np  # type: ignore
+from scipy import sparse  # type: ignore
+import six  # type: ignore
 from sklearn.pipeline import Pipeline, FeatureUnion  # type: ignore
 from sklearn.feature_selection.base import SelectorMixin  # type: ignore
 from sklearn.linear_model import (  # type: ignore
     RandomizedLogisticRegression,
     RandomizedLasso,
 )
+from sklearn.feature_extraction.text import TfidfTransformer  # type: ignore
+from sklearn.decomposition import (  # type: ignore
+    PCA,
+    IncrementalPCA,
+    FactorAnalysis,
+    FastICA,
+    TruncatedSVD,
+    NMF,
+    SparsePCA,
+    MiniBatchSparsePCA,
+    SparseCoder,
+    DictionaryLearning,
+    MiniBatchDictionaryLearning,
+    LatentDirichletAllocation)
 
 from eli5.transform import transform_feature_names
 from eli5.sklearn.utils import get_feature_names as _get_feature_names
 
 
+def _attrgetter_or_identity(func):
+    if isinstance(func, six.string_types):
+        return operator.attrgetter(func)
+    return func
+
+
+class make_tfn_weighted(object):
+    """Makes feature names representing a weighted sum of input features
+
+    An instance can be registered as the handler of
+    ``transform_feature_names`` for linear transformers.
+
+    Parameters
+    ----------
+    get_weights : callable or str
+        A function or attribute name which, applied to the registered
+        transformer, returns an array of shape (n_outputs, n_inputs), or
+        (n_inputs,) for a diagonal matrix, describing the linear combination
+        of inputs that produces each output.
+    top : int, default=3
+        Maximum number of input features to show as contributing to output
+        features.
+    threshold : float, default=0
+        A contributing feature is only shown if the absolute value of its
+        weight exceeds this value.
+    show_weight : bool or str, default=True
+        Whether or not to show the weights of contributing features. When a
+        str, it is used as the format spec for those weights; True is
+        equivalent to '0.3g'.
+    show_idx : bool, default=True
+        Whether or not to number the output features so that it is easy to
+        identify each component.
+    func_name : str, optional
+        Constant text to prepend to the output feature number.
+    """
+    # TODO: Perhaps have some kind of relative thresholding: either a
+    #       threshold as a function of the weight matrix, or a threshold
+    #       relative to the highest/immediately bigger contribution to an
+    #       output feature.
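+    # For example (weights illustrative, not computed): an instance
+    # make_tfn_weighted('components_', func_name='PCA') applied to a fitted
+    # PCA with input names ['a', 'b', 'c'] would, with the defaults top=3,
+    # show_weight=True and show_idx=True, produce names like
+    # 'PCA0:=(b*0.71+a*0.7+c*0.012)' -- one per component, listing the top
+    # contributors with '0.3g'-formatted weights.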
+    def __init__(self, get_weights, top=3, threshold=0,
+                 show_weight=True, show_idx=True, func_name=None):
+        if not any([top, show_idx, func_name]):
+            raise ValueError('At least one of top, show_idx and '
+                             'func_name must be set')
+        if threshold < 0:
+            raise ValueError('Threshold must be >= 0')
+        self.main_fmt, self.contrib_fmt, self.contrib_sep = self.build_formats(
+            top, show_weight, show_idx, func_name)
+        self.get_weights = _attrgetter_or_identity(get_weights)
+        self.top = top
+        self.threshold = threshold
+
+    @staticmethod
+    def build_formats(top, show_weight, show_idx, func_name,
+                      contrib_fmt_fmt='{{name}}*{{weight:{}}}',
+                      contrib_sep='+'):
+        if show_idx and top:
+            main_fmt = '{idx:d}:=({feats})'
+        elif show_idx:
+            main_fmt = '{idx:d}'
+        elif top:
+            main_fmt = '({feats})'
+        else:
+            main_fmt = ''
+        if func_name:
+            escaped_func_name = func_name.replace('{', '{{').replace('}', '}}')
+            main_fmt = escaped_func_name + main_fmt
+
+        if not top:
+            contrib_fmt = None
+            contrib_sep = None
+        elif show_weight:
+            if show_weight is True:
+                show_weight = '0.3g'
+            contrib_fmt = contrib_fmt_fmt.format(show_weight)
+        else:
+            contrib_fmt = '{name}'
+
+        return main_fmt, contrib_fmt, contrib_sep
+
+    @staticmethod
+    def find_contributors(W, top, threshold):
+        if sparse.issparse(W):
+            W = W.tocsr(copy=True)
+            W.sum_duplicates()
+            W_abs = abs(W)
+
+            def _find_contribs(idx, w, w_abs):
+                order = np.argsort(w_abs)[-top:][::-1]
+                order = order[w_abs[order] > threshold]
+                return idx[order], w[order]
+
+            return (_find_contribs(W.indices[start:stop],
+                                   W.data[start:stop],
+                                   W_abs.data[start:stop])
+                    for start, stop in zip(W.indptr, W.indptr[1:]))
+
+        else:
+            W_abs = abs(W)
+            # TODO: use argpartition?
+            top_idx = np.argsort(W_abs, axis=1)[:, -top:][:, ::-1]
+            ix0 = np.arange(W_abs.shape[0]).repeat(top_idx.shape[1])
+            ix0 = ix0.reshape(-1, top_idx.shape[1])
+            n_contribs = (W_abs[ix0, top_idx] > threshold).sum(axis=1)
+            return ((idx[:n], w[:n]) for idx, w, n in
+                    zip(top_idx, W[ix0, top_idx], n_contribs))
+
+    def __call__(self, est, in_names=None):
+        W = self.get_weights(est)
+        if W.ndim == 1:
+            # XXX: This implementation is inefficient and could be rewritten
+            W = sparse.csr_matrix((W.copy(), np.arange(len(W)),
+                                   np.arange(len(W) + 1)))
+
+        in_names = _get_feature_names(est, feature_names=in_names,
+                                      num_features=W.shape[1])
+
+        if self.top:
+            fmt = self.contrib_fmt.format
+            contribs = self.find_contributors(W, self.top, self.threshold)
+            contrib_sep = self.contrib_sep
+            contribs = (contrib_sep.join(fmt(name=in_names[i], weight=w)
+                                         for i, w in zip(idx, weights))
+                        for idx, weights in contribs)
+        else:
+            contribs = itertools.repeat('', W.shape[0])
+
+        return [self.main_fmt.format(idx=i, feats=feats)
+                for i, feats in enumerate(contribs)]
+
+
+# Non-trivial scaling:
+
+transform_feature_names.register(TfidfTransformer)(  # type: ignore
+    make_tfn_weighted('idf_', func_name='TFIDF', show_idx=False))
+
+# Decomposition (linear weights):
+
+for cls, prefix in [(PCA, 'PCA'), (IncrementalPCA, 'PCA'),
+                    (FactorAnalysis, 'FA'), (FastICA, 'ICA'),
+                    (TruncatedSVD, 'SVD'), (NMF, 'NMF'),
+                    (SparsePCA, 'SPCA'), (MiniBatchSparsePCA, 'SPCA'),
+                    (SparseCoder, 'SC'), (DictionaryLearning, 'DL'),
+                    (MiniBatchDictionaryLearning, 'DL'),
+                    (LatentDirichletAllocation, 'LDA')]:
+    transform_feature_names.register(cls)(  # type: ignore
+        make_tfn_weighted('components_', func_name=prefix))
+
+
 # Feature selection:
 
 @transform_feature_names.register(SelectorMixin)
diff --git a/tests/test_sklearn_transform.py b/tests/test_sklearn_transform.py
index 3decd235..b0e4d3e7 100644
--- a/tests/test_sklearn_transform.py
+++ b/tests/test_sklearn_transform.py
@@ -1,7 +1,12 @@
 import re
+import pickle
 
 import pytest
 import numpy as np
+from scipy import sparse
+from hypothesis import given, example, assume, settings as hyp_settings
+from hypothesis import strategies as st
+from hypothesis.extra import numpy as np_st
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.feature_selection import (
     SelectPercentile,
@@ -20,8 +25,25 @@
     RandomizedLogisticRegression,
     RandomizedLasso,  # TODO: add tests and document
 )
+from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.pipeline import FeatureUnion
+from sklearn.decomposition import (  # type: ignore
+    PCA,
+    IncrementalPCA,
+    FactorAnalysis,
+    FastICA,
+    TruncatedSVD,
+    NMF,
+    SparsePCA,
+    MiniBatchSparsePCA,
+    SparseCoder,
+    DictionaryLearning,
+    MiniBatchDictionaryLearning,
+    LatentDirichletAllocation)
+
 
 from eli5 import transform_feature_names
+from eli5.sklearn.transform import make_tfn_weighted
 
 
 class MyFeatureExtractor(BaseEstimator, TransformerMixin):
@@ -36,40 +58,272 @@ def get_feature_names(self):
         return ['f1', 'f2', 'f3']
 
 
 def selection_score_func(X, y):
-    return np.array([1, 2, 3, 4])
+    return np.arange(X.shape[1])
 
 
 @pytest.mark.parametrize('transformer,expected', [
     (MyFeatureExtractor(), ['f1', 'f2', 'f3']),
-    (SelectKBest(selection_score_func, k=1),
-     ['<NAME3>']),
-    (SelectKBest(selection_score_func, k=2),
-     ['<NAME2>', '<NAME3>']),
-    (FeatureUnion([('k', SelectKBest(selection_score_func, k=2)),
-                   ('p', SelectPercentile(selection_score_func, 30))]),
-     ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']),
-    (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
-    (VarianceThreshold(1.0), ['<NAME2>']),
-    (GenericUnivariateSelect(), ['<NAME2>']),
-    (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']),
+    (SelectKBest(selection_score_func, k=1), ['x3']),
+    (SelectKBest(selection_score_func, k=2), ['x2', 'x3']),
+    (VarianceThreshold(0.0), ['x0', 'x1', 'x2', 'x3']),
+    (VarianceThreshold(1.0), ['x2']),
+    (GenericUnivariateSelect(), ['x2']),
+    (GenericUnivariateSelect(mode='k_best', param=2), ['x2', 'x3']),
     (SelectFromModel(LogisticRegression('l1', C=0.01, random_state=42)),
-     ['<NAME0>', '<NAME2>']),
-    (RFE(LogisticRegression(random_state=42), 2),
-     ['<NAME1>', '<NAME3>']),
-    (RFECV(LogisticRegression(random_state=42)),
-     ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
-    (RandomizedLogisticRegression(random_state=42),
-     ['<NAME1>', '<NAME2>', '<NAME3>']),
+     ['x0', 'x2']),
+    (RFE(LogisticRegression(random_state=42), 2), ['x1', 'x3']),
+    (RFECV(LogisticRegression(random_state=42)), ['x0', 'x1', 'x2', 'x3']),
+    (RandomizedLogisticRegression(random_state=42), ['x1', 'x2', 'x3']),
+    (FeatureUnion([('k', SelectKBest(selection_score_func, k=2)),
+                   ('p', SelectPercentile(selection_score_func, 25))]),
+     ['k:x2', 'k:x3', 'p:x3']),
+    # Decompositions with 2 components, each weighting 3 features
+    (PCA(n_components=2),
+     [r'PCA0:=\((x[0-9]\*.+){3}\)', r'PCA1:=\((x[0-9]\*.+){3}\)']),
+    (IncrementalPCA(n_components=2),
+     [r'PCA0:=\((x[0-9]\*.+){3}\)', r'PCA1:=\((x[0-9]\*.+){3}\)']),
+    (FactorAnalysis(n_components=2),
+     [r'FA0:=\((x[0-9]\*.+){3}\)', r'FA1:=\((x[0-9]\*.+){3}\)']),
+    (FastICA(n_components=2),
+     [r'ICA0:=\((x[0-9]\*.+){3}\)', r'ICA1:=\((x[0-9]\*.+){3}\)']),
+    (TruncatedSVD(n_components=2),
+     [r'SVD0:=\((x[0-9]\*.+){3}\)', r'SVD1:=\((x[0-9]\*.+){3}\)']),
+    (NMF(n_components=2),
+     [r'NMF0:=\((x[0-9]\*.+){3}\)', r'NMF1:=\((x[0-9]\*.+){3}\)']),
+    (SparsePCA(n_components=2),
+     [r'SPCA0:=\((x[0-9]\*.+){3}\)', r'SPCA1:=\((x[0-9]\*.+){3}\)']),
+    (MiniBatchSparsePCA(n_components=2),
+     [r'SPCA0:=\((x[0-9]\*.+){3}\)', r'SPCA1:=\((x[0-9]\*.+){3}\)']),
+    (SparseCoder(dictionary=np.array([[1, 2, 3, 4], [5, 6, 7, 8]])),
+     [r'SC0:=\((x[0-9]\*.+){3}\)', r'SC1:=\((x[0-9]\*.+){3}\)']),
+    (DictionaryLearning(n_components=2),
+     [r'DL0:=\((x[0-9]\*.+){3}\)', r'DL1:=\((x[0-9]\*.+){3}\)']),
+    (MiniBatchDictionaryLearning(n_components=2),
+     [r'DL0:=\((x[0-9]\*.+){3}\)', r'DL1:=\((x[0-9]\*.+){3}\)']),
+    (LatentDirichletAllocation(n_topics=2),
+     [r'LDA0:=\((x[0-9]\*.+){3}\)', r'LDA1:=\((x[0-9]\*.+){3}\)']),
+    (TfidfTransformer(),
+     [r'TFIDF\(x0\*.+\)', r'TFIDF\(x1\*.+\)', r'TFIDF\(x2\*.+\)',
+      r'TFIDF\(x3\*.+\)']),
+])
+def test_transform_feature_names_match(transformer, expected, iris_train):
+    X, y, _, _ = iris_train
+    transformer.fit(X, y)
+    actual = transform_feature_names(transformer)
+    assert len(actual) == len(expected)
+    for expected_name, actual_name in zip(expected, actual):
+        assert re.match(expected_name, actual_name)
+
+
+@pytest.mark.parametrize('transformer', [
+    SelectKBest(k=2),
+    FeatureUnion([('k', SelectKBest(k=2)),
+                  ('p', SelectPercentile(percentile=40))]),
+    LatentDirichletAllocation(),
+    PCA(),
+    TruncatedSVD(),
+    TfidfTransformer(),
 ])
-def test_transform_feature_names_iris(transformer, expected, iris_train):
+def test_transform_feature_names_in_names(transformer, iris_train):
     X, y, _, _ = iris_train
     transformer.fit(X, y)
-    # Test in_names being provided
-    res = transform_feature_names(
-        transformer, ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']
-    )
-    assert res == expected
-    # Test in_names being None
+    specified = transform_feature_names(
+        transformer,
+        ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>'])
+    # ensure that the substitution below does something
+    assert any('<NAME' in name for name in specified)
     expected_default_names = [re.sub('<NAME([0-9]+)>', r'x\1', name)
-                              for name in expected]
+                              for name in specified]
     assert transform_feature_names(transformer, None) == expected_default_names
+
+    for n_in_names in [3, 5]:
+        with pytest.raises(ValueError) as exc_info:
+            transform_feature_names(transformer, in_names=['x'] * n_in_names)
+        assert 'feature_names has a wrong length' in str(exc_info.value)
+
+
+class Namespace:
+    """Provides attributes otherwise supplied by a fitted transformer"""
+
+    def __init__(self, **d):
+        self.__dict__.update(d)
+
+
+@st.composite
+def float_formats(draw):
+    alignment = draw(st.sampled_from(['', '<', '>', '^']))
+    padding = draw(st.sampled_from(['', '-', '0']))
+    width = draw(st.one_of(st.just(''), st.integers(0, 100)))
+    decimals = str(draw(st.one_of(st.just(''), st.integers(0, 100))))
+    if decimals:
+        decimals = '.' + decimals
+    float_code = draw(st.sampled_from('fFgGeE'))
+    return '%s%s%s%s%s' % (alignment, padding, width, decimals, float_code)
+
+
+@given(W=np_st.arrays(np_st.floating_dtypes(),
+                      np_st.array_shapes(min_dims=2, max_dims=2)),
+       top=st.integers(min_value=0, max_value=15),
+       threshold=st.floats(min_value=0),
+       func_name=st.text(),
+       show_idx=st.booleans(),
+       show_weight=st.one_of(st.booleans(), float_formats()),
+       get_weights=st.sampled_from(['weights_', lambda t: t.weights_]),
+       )
+def test_make_tfn_weighted(get_weights, W, top, threshold, func_name, show_idx,
+                           show_weight):
+    assume(np.all(np.isfinite(W)))
+    assume(any([top, func_name, show_idx]))
+    # Could mock find_contributors() and use fake formats
+    tfn = make_tfn_weighted(get_weights=get_weights, top=top,
+                            threshold=threshold,
+                            func_name=func_name, show_idx=show_idx,
+                            show_weight=show_weight)
+
+    fmts = tfn.build_formats(top=top, show_weight=show_weight,
+                             show_idx=show_idx, func_name=func_name)
+    main_fmt, contrib_fmt, contrib_sep = fmts
+    contrib_sep = contrib_sep or ''
+    contrib_fmt = contrib_fmt or ''
+
+    est = Namespace(weights_=W)
+    names = tfn(est)
+    assert len(names) == W.shape[0]
+
+    for i, contribs in enumerate(tfn.find_contributors(W, top, threshold)):
+        feats = contrib_sep.join(contrib_fmt.format(name='x' + str(j),
+                                                    weight=w)
+                                 for j, w in zip(*contribs))
+        assert main_fmt.format(idx=i, feats=feats) == names[i]
+
+
+@given(w=np_st.arrays(np_st.floating_dtypes(),
+                      np_st.array_shapes(min_dims=1, max_dims=1)),
+       top=st.integers(min_value=0, max_value=15),
+       threshold=st.floats(min_value=0),
+       func_name=st.text(),
+       show_idx=st.booleans(),
+       show_weight=st.one_of(st.booleans(), float_formats()),
+       get_weights=st.sampled_from(['weights_', lambda t: t.weights_]),
+       )
+@hyp_settings(max_examples=30)
+def test_make_tfn_weighted_1d(get_weights, w, top, threshold, func_name,
+                              show_idx, show_weight):
+    assume(np.all(np.isfinite(w)))
+    assume(any([top, func_name, show_idx]))
+    # Could mock find_contributors() and use fake formats
+    tfn = make_tfn_weighted(get_weights=get_weights, top=top,
+                            threshold=threshold,
+                            func_name=func_name, show_idx=show_idx,
+                            show_weight=show_weight)
+    assert (tfn(Namespace(weights_=np.diagflat(w))) ==
+            tfn(Namespace(weights_=w)))
+
+
+def test_make_tfn_weighted_invalid_threshold():
+    with pytest.raises(ValueError) as exc_info:
+        make_tfn_weighted('weights_', threshold=-1)
+    assert 'Threshold must be >= 0' in str(exc_info.value)
+
+
+def test_make_tfn_weighted_nothing_to_show():
+    with pytest.raises(ValueError) as exc_info:
+        make_tfn_weighted('weights_', top=0, show_idx=False)
+    assert 'At least one' in str(exc_info.value)
+
+
+@pytest.mark.parametrize(
+    'top,show_weight,show_idx,main_fmt,contrib_fmt,contrib_sep', [
+        (0, False, False, 'X', None, None),
+        (0, False, True, 'X{idx:d}', None, None),
+        (0, True, False, 'X', None, None),
+        (0, True, True, 'X{idx:d}', None, None),
+        (1, False, False, 'X({feats})', '{name}', '+'),
+        (1, False, True, 'X{idx:d}:=({feats})', '{name}', '+'),
+        (1, True, False, 'X({feats})', '{name}*{weight:0.3g}', '+'),
+        (1, True, True, 'X{idx:d}:=({feats})', '{name}*{weight:0.3g}', '+'),
+        (1, '.5f', False, 'X({feats})', '{name}*{weight:.5f}', '+'),
+        (1, '.5f', True, 'X{idx:d}:=({feats})', '{name}*{weight:.5f}', '+'),
+        (5, True, False, 'X({feats})', '{name}*{weight:0.3g}', '+'),
+        (5, True, True, 'X{idx:d}:=({feats})', '{name}*{weight:0.3g}', '+'),
+        (5, '.5E', False, 'X({feats})', '{name}*{weight:.5E}', '+'),
+        (5, '.5E', True, 'X{idx:d}:=({feats})',
+         '{name}*{weight:.5E}', '+'),
+    ])
+@pytest.mark.parametrize('func_name,exp_func_name', [
+    ('', ''),
+    (None, ''),
+    ('blah', 'blah'),
+    ('bl{}ah', 'bl{{}}ah'),
+    ('bl{ah', 'bl{{ah'),
+])
+def test_make_tfn_weighted_build_formats(top, show_weight, show_idx, func_name,
+                                         main_fmt, contrib_fmt, contrib_sep,
+                                         exp_func_name):
+    if not (top or show_idx or func_name):
+        # this case is covered by test_make_tfn_weighted_nothing_to_show
+        return
+    fmts = make_tfn_weighted.build_formats(top=top, show_weight=show_weight,
+                                           show_idx=show_idx,
+                                           func_name=func_name)
+    assert fmts == (main_fmt.replace('X', exp_func_name),
+                    contrib_fmt, contrib_sep)
+
+
+W_ALL_BINARY = [
+    [1, 1, 1],
+    [1, 1, 2],
+    [1, 2, 1],
+    [1, 2, 2],
+    [2, 1, 1],
+    [2, 1, 2],
+    [2, 2, 1],
+    [2, 2, 2],
+]
+
+
+@pytest.mark.parametrize('convert', [
+    np.array, sparse.csr_matrix, sparse.dia_matrix,
+])
+@given(W=np_st.arrays(st.one_of(np_st.floating_dtypes(),
+                                np_st.integer_dtypes()),
+                      np_st.array_shapes(min_dims=2, max_dims=2)),
+       top=st.integers(min_value=1, max_value=15),
+       threshold=st.floats(min_value=0))
+@example(W=W_ALL_BINARY, top=1, threshold=0)  # tests tie-breaking
+@example(W=W_ALL_BINARY, top=3, threshold=1)  # tests the threshold boundary
+def test_make_tfn_weighted_find_contributors(W, convert, top, threshold):
+    assume(np.all(np.isfinite(W)))
+    W = np.array(W)
+    W_conv = convert(W)
+    W_conv_pkl = pickle.dumps(W_conv)
+    contribs = list(make_tfn_weighted.find_contributors(W_conv, top,
+                                                        threshold))
+    assert W_conv_pkl == pickle.dumps(W_conv)  # ensure W_conv is not modified
+
+    all_idx, all_weights = zip(*contribs)
+    assert len(all_idx) == W.shape[0]
+    for i, (idx, weights) in enumerate(zip(all_idx, all_weights)):
+        abs_weights = np.abs(weights)
+        # check weights match idx
+        assert np.all(W[i, idx] == weights)
+        # check top is satisfied
+        assert len(idx) <= min(top, W.shape[1])
+        # check idx are all distinct
+        assert len(np.unique(idx)) == len(idx)
+        rem = np.delete(np.arange(W.shape[1]), idx)
+        if len(weights) > 0:
+            # check threshold is satisfied
+            assert min(abs_weights) > threshold
+            # check nothing is bigger in the remainder
+            if len(rem) > 0:
+                assert min(abs_weights) >= max(abs(W[i, rem]))
+                # if we could not fill all top slots, the remainder must be
+                # strictly smaller
+                if len(idx) < min(top, W.shape[1]):
+                    assert min(abs_weights) > max(abs(W[i, rem]))
+            # check weights are in descending order of absolute value
+            assert np.all(np.diff(abs_weights) <= 1e-15)
+        elif len(rem) > 0:
+            # nothing was selected, so everything must be at or below the
+            # threshold
+            assert max(abs(W[i, rem])) <= threshold
+    # TODO: could check that ties are broken deterministically
+
+
+def test_nested_pipelines():
+    # TODO
+    pass
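+
+
+# One possible shape for test_nested_pipelines (a sketch only: it assumes the
+# Pipeline handler registered elsewhere in eli5.sklearn.transform, reuses
+# selection_score_func from above, and takes X, y from the iris_train
+# fixture):
+#
+#     from sklearn.pipeline import make_pipeline
+#     pipe = make_pipeline(PCA(n_components=2),
+#                          SelectKBest(selection_score_func, k=1))
+#     pipe.fit(X, y)
+#     names = transform_feature_names(pipe, ['a', 'b', 'c', 'd'])
+#     # PCA names the two components, then SelectKBest keeps one of them
+#     assert len(names) == 1
+#     assert re.match(r'PCA[01]:=\(', names[0])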