Integrate scikit-learn's set_output method into TransactionEncoder (#1087)

it176131 · rasbt · web-flow · commit 506a4d570a82 · 2024-03-30T14:05:36.000-05:00
* modified: test_transactionencoder.py - Added two new tests, `test_get_feature_names_out` and `test_set_output`. Passing these tests is a step towards the output of `TransactionEncoder` being formatted as a pandas.DataFrame by default. * modified: transactionencoder.py - Added `get_feature_names_out` method to `TransactionEncoder` to expose the `set_output` method. * modified: tests/test_transactionencoder.py - Updated test to include more checks. It is now back in a failing state. * modified: tests/test_transactionencoder.py - Updated test_set_output docstring to be more explicit. - Added numpy assertion to check that the transformed output columns match the original columns_ attribute for test_set_output. - Added numpy assertion to check that the get_feature_names_out output matches the original columns_ attribute for test_get_feature_names_out. * modified: transactionencoder.py - Added logic similar to that in `sklearn.base.ClassNamePrefixFeaturesOutMixin` and `sklearn.base.OneToOneFeatureMixin` for the get_feature_names_out method. * modified: docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb - Updated the user guide to show both the get_feature_names_out method and the set_output method. * modified: docs/sources/CHANGELOG.md - Updated changelog to reflect new features. * modified: docs/sources/CHANGELOG.md - Updated issue number. * modified: docs/sources/CHANGELOG.md - Updated issue number (again) to reflect the PR link instead of the issue link. * modified: mlxtend/preprocessing/transactionencoder.py - Ran isort over imports to fix failing check in PR. * modified: requirements.txt - Increased scikit-learn version to minimum required for set_output to work. * modified: environment.yml - Bumped scikit-learn version up to 1.2.2 to match requirements.txt. * modified: .github/workflows/python-package-conda.yml - Bumped scikit-learn version up to 1.2.2 to match environment.yml and requirements.txt. 
* modified: mlxtend/preprocessing/tests/test_transactionencoder.py - Updated `test_inverse_transform` to passing state by removing conversion to numpy array. * modified: .github/workflows/python-package-conda.yml - Updated scikit-learn version to 1.3.1 to integrate fix from scikit-learn/scikit-learn#27044 modified: environment.yml - Updated scikit-learn version to 1.3.1 to integrate fix from scikit-learn/scikit-learn#27044 modified: requirements.txt - Updated scikit-learn version to 1.3.1 to integrate fix from scikit-learn/scikit-learn#27044 * Update mlxtend/preprocessing/transactionencoder.py * Update mlxtend/preprocessing/transactionencoder.py * Update mlxtend/preprocessing/transactionencoder.py --------- Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
@@ -6,6 +6,20 @@ The CHANGELOG for the current development version is available at
 [https://github.com/rasbt/mlxtend/blob/master/docs/sources/CHANGELOG.md](https://github.com/rasbt/mlxtend/blob/master/docs/sources/CHANGELOG.md).
 
 ---
+### Version 0.23.2 (TBD)
+
+##### Downloads
+
+- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.23.2.zip)
+
+- [Source code (tar.gz)](https://github.com/rasbt/mlxtend/archive/v0.23.2.tar.gz)
+
+##### New Features and Enhancements
+
+- Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via [it176131](https://github.com/it176131))
+
+
+
 
 ### Version 0.23.2 (TBD)
 
diff --git a/docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb b/docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb
@@ -89,7 +89,7 @@
        "       [False, False,  True,  True,  True,  True],\n",
        "       [False, False,  True, False,  True,  True],\n",
        "       [False, False,  True, False,  True, False],\n",
-       "       [ True,  True, False, False, False, False]], dtype=bool)"
+       "       [ True,  True, False, False, False, False]])"
       ]
      },
      "execution_count": 2,
@@ -141,7 +141,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "After fitting, the unique column names that correspond to the data array shown above can be accessed via the `columns_` attribute:"
+    "After fitting, the unique column names that correspond to the data array shown above can be accessed via the `columns_` attribute, or the `get_feature_names_out` method:"
    ]
   },
   {
@@ -161,19 +161,71 @@
     }
    ],
    "source": [
-    "te.columns_"
+    "te.columns_  # list of strings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice'],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "te.get_feature_names_out()  # numpy.array of strings (objects)."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For our convenience, we can turn theencoded array into a pandas `DataFrame`:"
+    "If we desire, we can turn the one-hot encoded array back into a transaction list of lists via the `inverse_transform` function:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['Apple', 'Beer', 'Chicken', 'Rice'],\n",
+       " ['Apple', 'Beer', 'Rice'],\n",
+       " ['Apple', 'Beer'],\n",
+       " ['Apple', 'Bananas']]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "first4 = te_ary[:4]\n",
+    "te.inverse_transform(first4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For our convenience, we can set the default output to a pandas `DataFrame` with the `set_output` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -294,46 +346,15 @@
        "7   True     True  False    False  False  False"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "import pandas as pd\n",
-    "\n",
-    "pd.DataFrame(te_ary, columns=te.columns_)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "If we desire, we can turn the one-hot encoded array back into a transaction list of lists via the `inverse_transform` function:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[['Apple', 'Beer', 'Chicken', 'Rice'],\n",
-       " ['Apple', 'Beer', 'Rice'],\n",
-       " ['Apple', 'Beer'],\n",
-       " ['Apple', 'Bananas']]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "first4 = te_ary[:4]\n",
-    "te.inverse_transform(first4)"
+    "te = TransactionEncoder().set_output(transform=\"pandas\")\n",
+    "te_df = te.fit(dataset).transform(dataset)\n",
+    "te_df"
    ]
   },
   {
@@ -346,7 +367,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -545,13 +568,6 @@
     "with open('../../api_modules/mlxtend.preprocessing/TransactionEncoder.md', 'r') as f:\n",
     "    print(f.read())"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -571,7 +587,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.11.7"
   },
   "toc": {
    "nav_menu": {},
diff --git a/mlxtend/preprocessing/tests/test_transactionencoder.py b/mlxtend/preprocessing/tests/test_transactionencoder.py
@@ -5,6 +5,7 @@
 # License: BSD 3 clause
 
 import numpy as np
+import pandas as pd
 from scipy.sparse import csr_matrix
 from sklearn.base import clone
 
@@ -91,3 +92,27 @@ def test_cloning():
 
     trans = oht2.fit_transform(dataset)
     np.testing.assert_array_equal(expect, trans)
+
+
+def test_get_feature_names_out():
+    """Assert TransactionEncoder has attribute get_feature_names_out."""
+    oht = TransactionEncoder()
+    assert hasattr(oht, "get_feature_names_out")
+    oht.fit(dataset)
+    np.testing.assert_array_equal(oht.get_feature_names_out(), oht.columns_)
+
+
+def test_set_output():
+    """Assert TransactionEncoder has attribute set_output.
+
+    When transform="pandas", the transformed output of
+    TransactionEncoder should be a pandas.DataFrame with the correct
+    column names and the values should match those of the original
+    numpy.array.
+    """
+    oht = TransactionEncoder()
+    assert hasattr(oht, "set_output")
+    oht = oht.set_output(transform="pandas")
+    out = oht.fit_transform(dataset)
+    assert isinstance(out, pd.DataFrame)
+    np.testing.assert_array_equal(out.columns, oht.columns_)
diff --git a/mlxtend/preprocessing/transactionencoder.py b/mlxtend/preprocessing/transactionencoder.py
@@ -7,6 +7,7 @@
 import numpy as np
 from scipy.sparse import csr_matrix
 from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
 
 class TransactionEncoder(BaseEstimator, TransformerMixin):
@@ -181,3 +182,16 @@ def inverse_transform(self, array):
     def fit_transform(self, X, sparse=False):
         """Fit a TransactionEncoder encoder and transform a dataset."""
         return self.fit(X).transform(X, sparse=sparse)
+
+    def get_feature_names_out(self):
+        """Used to get the column names of pandas output.
+
+        This method combined with the `TransformerMixin` exposes the
+        set_output API to the `TransactionEncoder`. This allows the user
+        to set the transformed output to a `pandas.DataFrame` by default.
+
+        See  https://scikit-learn.org/stable/developers/develop.html#developer-api-set-output
+        for more details.
+        """
+        check_is_fitted(self, attributes="columns_")
+        return _check_feature_names_in(estimator=self, input_features=self.columns_)