Skip to content

Commit 506a4d5

Browse files
it176131 and rasbt authored
Integrate scikit-learn's set_output method into TransactionEncoder (#1087)
* modified: test_transactionencoder.py - Added two new tests, `test_get_feature_names_out` and `test_set_output`. Passing these tests is a step towards the output of `TransactionEncoder` being formatted as a pandas.DataFrame by default. * modified: transactionencoder.py - Added `get_feature_names_out` method to `TransactionEncoder` to expose the `set_output` method. * modified: tests/test_transactionencoder.py - Updated test to include more checks. It is now back in a failing state. * modified: tests/test_transactionencoder.py - Updated test_set_output docstring to be more explicit. - Added numpy assertion to check that the transformed output columns match the original columns_ attribute for test_set_output. - Added numpy assertion to check that the get_feature_names_out output matches the original columns_ attribute for test_get_feature_names_out. * modified: transactionencoder.py - Added logic similar to that in `sklearn.base.ClassNamePrefixFeaturesOutMixin` and `sklearn.base.OneToOneFeatureMixin` for the get_feature_names_out method. * modified: docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb - Updated the user guide to show both the get_feature_names_out method and the set_output method. * modified: docs/sources/CHANGELOG.md - Updated changelog to reflect new features. * modified: docs/sources/CHANGELOG.md - Updated issue number. * modified: docs/sources/CHANGELOG.md - Updated issue number (again) to reflect the PR link instead of the issue link. * modified: mlxtend/preprocessing/transactionencoder.py - Ran isort over imports to fix failing check in PR. * modified: requirements.txt - Increased scikit-learn version to minimum required for set_output to work. * modified: environment.yml - Bumped scikit-learn version up to 1.2.2 to match requirements.txt. * modified: .github/workflows/python-package-conda.yml - Bumped scikit-learn version up to 1.2.2 to match environment.yml and requirements.txt.
* modified: mlxtend/preprocessing/tests/test_transactionencoder.py - Updated `test_inverse_transform` to passing state by removing conversion to numpy array. * modified: .github/workflows/python-package-conda.yml - Updated scikit-learn version to 1.3.1 to integrate fix from scikit-learn/scikit-learn#27044 * modified: environment.yml - Updated scikit-learn version to 1.3.1 to integrate fix from scikit-learn/scikit-learn#27044 * modified: requirements.txt - Updated scikit-learn version to 1.3.1 to integrate fix from scikit-learn/scikit-learn#27044 * Update mlxtend/preprocessing/transactionencoder.py * Update mlxtend/preprocessing/transactionencoder.py * Update mlxtend/preprocessing/transactionencoder.py --------- Co-authored-by: Sebastian Raschka <[email protected]>
1 parent e82c9c5 commit 506a4d5

File tree

4 files changed

+118
-49
lines changed

4 files changed

+118
-49
lines changed

docs/sources/CHANGELOG.md

+14
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,20 @@ The CHANGELOG for the current development version is available at
66
[https://github.com/rasbt/mlxtend/blob/master/docs/sources/CHANGELOG.md](https://github.com/rasbt/mlxtend/blob/master/docs/sources/CHANGELOG.md).
77

88
---
9+
### Version 0.23.2 (TBD)
10+
11+
##### Downloads
12+
13+
- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.23.2.zip)
14+
15+
- [Source code (tar.gz)](https://github.com/rasbt/mlxtend/archive/v0.23.2.tar.gz)
16+
17+
##### New Features and Enhancements
18+
19+
- Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via [it176131](https://github.com/it176131))
20+
21+
22+
923

1024
### Version 0.23.2 (TBD)
1125

docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb

+65-49
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@
8989
" [False, False, True, True, True, True],\n",
9090
" [False, False, True, False, True, True],\n",
9191
" [False, False, True, False, True, False],\n",
92-
" [ True, True, False, False, False, False]], dtype=bool)"
92+
" [ True, True, False, False, False, False]])"
9393
]
9494
},
9595
"execution_count": 2,
@@ -141,7 +141,7 @@
141141
"cell_type": "markdown",
142142
"metadata": {},
143143
"source": [
144-
"After fitting, the unique column names that correspond to the data array shown above can be accessed via the `columns_` attribute:"
144+
"After fitting, the unique column names that correspond to the data array shown above can be accessed via the `columns_` attribute, or the `get_feature_names_out` method:"
145145
]
146146
},
147147
{
@@ -161,19 +161,71 @@
161161
}
162162
],
163163
"source": [
164-
"te.columns_"
164+
"te.columns_ # list of strings"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 5,
170+
"metadata": {},
171+
"outputs": [
172+
{
173+
"data": {
174+
"text/plain": [
175+
"array(['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice'],\n",
176+
" dtype=object)"
177+
]
178+
},
179+
"execution_count": 5,
180+
"metadata": {},
181+
"output_type": "execute_result"
182+
}
183+
],
184+
"source": [
185+
"te.get_feature_names_out() # numpy.array of strings (objects)."
165186
]
166187
},
167188
{
168189
"cell_type": "markdown",
169190
"metadata": {},
170191
"source": [
171-
"For our convenience, we can turn theencoded array into a pandas `DataFrame`:"
192+
"If we desire, we can turn the one-hot encoded array back into a transaction list of lists via the `inverse_transform` function:"
172193
]
173194
},
174195
{
175196
"cell_type": "code",
176-
"execution_count": 5,
197+
"execution_count": 6,
198+
"metadata": {},
199+
"outputs": [
200+
{
201+
"data": {
202+
"text/plain": [
203+
"[['Apple', 'Beer', 'Chicken', 'Rice'],\n",
204+
" ['Apple', 'Beer', 'Rice'],\n",
205+
" ['Apple', 'Beer'],\n",
206+
" ['Apple', 'Bananas']]"
207+
]
208+
},
209+
"execution_count": 6,
210+
"metadata": {},
211+
"output_type": "execute_result"
212+
}
213+
],
214+
"source": [
215+
"first4 = te_ary[:4]\n",
216+
"te.inverse_transform(first4)"
217+
]
218+
},
219+
{
220+
"cell_type": "markdown",
221+
"metadata": {},
222+
"source": [
223+
"For our convenience, we can set the default output to a pandas `DataFrame` with the `set_output` method:"
224+
]
225+
},
226+
{
227+
"cell_type": "code",
228+
"execution_count": 7,
177229
"metadata": {},
178230
"outputs": [
179231
{
@@ -294,46 +346,15 @@
294346
"7 True True False False False False"
295347
]
296348
},
297-
"execution_count": 5,
349+
"execution_count": 7,
298350
"metadata": {},
299351
"output_type": "execute_result"
300352
}
301353
],
302354
"source": [
303-
"import pandas as pd\n",
304-
"\n",
305-
"pd.DataFrame(te_ary, columns=te.columns_)"
306-
]
307-
},
308-
{
309-
"cell_type": "markdown",
310-
"metadata": {},
311-
"source": [
312-
"If we desire, we can turn the one-hot encoded array back into a transaction list of lists via the `inverse_transform` function:"
313-
]
314-
},
315-
{
316-
"cell_type": "code",
317-
"execution_count": 6,
318-
"metadata": {},
319-
"outputs": [
320-
{
321-
"data": {
322-
"text/plain": [
323-
"[['Apple', 'Beer', 'Chicken', 'Rice'],\n",
324-
" ['Apple', 'Beer', 'Rice'],\n",
325-
" ['Apple', 'Beer'],\n",
326-
" ['Apple', 'Bananas']]"
327-
]
328-
},
329-
"execution_count": 6,
330-
"metadata": {},
331-
"output_type": "execute_result"
332-
}
333-
],
334-
"source": [
335-
"first4 = te_ary[:4]\n",
336-
"te.inverse_transform(first4)"
355+
"te = TransactionEncoder().set_output(transform=\"pandas\")\n",
356+
"te_df = te.fit(dataset).transform(dataset)\n",
357+
"te_df"
337358
]
338359
},
339360
{
@@ -346,7 +367,9 @@
346367
{
347368
"cell_type": "code",
348369
"execution_count": 3,
349-
"metadata": {},
370+
"metadata": {
371+
"scrolled": true
372+
},
350373
"outputs": [
351374
{
352375
"name": "stdout",
@@ -545,13 +568,6 @@
545568
"with open('../../api_modules/mlxtend.preprocessing/TransactionEncoder.md', 'r') as f:\n",
546569
" print(f.read())"
547570
]
548-
},
549-
{
550-
"cell_type": "code",
551-
"execution_count": null,
552-
"metadata": {},
553-
"outputs": [],
554-
"source": []
555571
}
556572
],
557573
"metadata": {
@@ -571,7 +587,7 @@
571587
"name": "python",
572588
"nbconvert_exporter": "python",
573589
"pygments_lexer": "ipython3",
574-
"version": "3.9.7"
590+
"version": "3.11.7"
575591
},
576592
"toc": {
577593
"nav_menu": {},

mlxtend/preprocessing/tests/test_transactionencoder.py

+25
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
# License: BSD 3 clause
66

77
import numpy as np
8+
import pandas as pd
89
from scipy.sparse import csr_matrix
910
from sklearn.base import clone
1011

@@ -91,3 +92,27 @@ def test_cloning():
9192

9293
trans = oht2.fit_transform(dataset)
9394
np.testing.assert_array_equal(expect, trans)
95+
96+
97+
def test_get_feature_names_out():
98+
"""Assert TransactionEncoder has attribute get_feature_names_out."""
99+
oht = TransactionEncoder()
100+
assert hasattr(oht, "get_feature_names_out")
101+
oht.fit(dataset)
102+
np.testing.assert_array_equal(oht.get_feature_names_out(), oht.columns_)
103+
104+
105+
def test_set_output():
106+
"""Assert TransactionEncoder has attribute set_output.
107+
108+
When transform="pandas", the transformed output of
109+
TransactionEncoder should be a pandas.DataFrame with the correct
110+
column names and the values should match those of the original
111+
numpy.array.
112+
"""
113+
oht = TransactionEncoder()
114+
assert hasattr(oht, "set_output")
115+
oht = oht.set_output(transform="pandas")
116+
out = oht.fit_transform(dataset)
117+
assert isinstance(out, pd.DataFrame)
118+
np.testing.assert_array_equal(out.columns, oht.columns_)

mlxtend/preprocessing/transactionencoder.py

+14
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
from scipy.sparse import csr_matrix
99
from sklearn.base import BaseEstimator, TransformerMixin
10+
from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
1011

1112

1213
class TransactionEncoder(BaseEstimator, TransformerMixin):
@@ -181,3 +182,16 @@ def inverse_transform(self, array):
181182
def fit_transform(self, X, sparse=False):
182183
"""Fit a TransactionEncoder encoder and transform a dataset."""
183184
return self.fit(X).transform(X, sparse=sparse)
185+
186+
def get_feature_names_out(self):
187+
"""Used to get the column names of pandas output.
188+
189+
This method combined with the `TransformerMixin` exposes the
190+
set_output API to the `TransactionEncoder`. This allows the user
191+
to set the transformed output to a `pandas.DataFrame` by default.
192+
193+
See https://scikit-learn.org/stable/developers/develop.html#developer-api-set-output
194+
for more details.
195+
"""
196+
check_is_fitted(self, attributes="columns_")
197+
return _check_feature_names_in(estimator=self, input_features=self.columns_)

0 commit comments

Comments
 (0)