From 11a295e350c9bd57172228ad61bd5bc35d86691a Mon Sep 17 00:00:00 2001 From: Konstantinos Azas <64093265+zazass8@users.noreply.github.com> Date: Wed, 23 Oct 2024 21:26:01 +0300 Subject: [PATCH 1/7] FPGrowth/FPMax and Association Rules with the existence of missing values (#1004) (#1106) * Updated FPGrowth/FPMax and Association Rules with the existence of missing values * Re-structure and document code * Update unit tests * Update CHANGELOG.md * Modify the corresponding documentation in Jupyter notebooks * Final modifications --- docs/sources/CHANGELOG.md | 11 +- .../frequent_patterns/association_rules.ipynb | 360 +++++++++++++++++- .../frequent_patterns/fpgrowth.ipynb | 265 ++++++++++++- .../user_guide/frequent_patterns/fpmax.ipynb | 246 +++++++++++- mlxtend/feature_selection/column_selector.py | 2 +- .../frequent_patterns/association_rules.py | 197 ++++++++-- mlxtend/frequent_patterns/fpcommon.py | 94 ++++- mlxtend/frequent_patterns/fpgrowth.py | 15 +- mlxtend/frequent_patterns/fpmax.py | 15 +- .../tests/test_association_rules.py | 306 +++++++++++++-- 10 files changed, 1405 insertions(+), 106 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 914cce35c..efbf4fe61 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -18,11 +18,18 @@ The CHANGELOG for the current development version is available at ##### New Features and Enhancements -- [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) Implemented three new metrics: Jaccard, Certainty, and Kulczynski. ([#1096](https://github.com/rasbt/mlxtend/issues/1096)) -- Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via [it176131](https://github.com/it176131)) +- Implement the FP-Growth and FP-Max algorithms with the possibility of missing values in the input dataset. 
Added a new metric Representativity for the association rules generated ([#1004](https://github.com/rasbt/mlxtend/issues/1004) via [zazass8](https://github.com/zazass8)). +Files updated: + - [`mlxtend.frequent_patterns.fpcommon`] + - [`mlxtend.frequent_patterns.fpgrowth`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/) + - [`mlxtend.frequent_patterns.fpmax`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/) + - [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) +- [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) Implemented three new metrics: Jaccard, Certainty, and Kulczynski. ([#1096](https://github.com/rasbt/mlxtend/issues/1096)) +- Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via [it176131](https://github.com/it176131)) ##### Changes +- [`mlxtend.frequent_patterns.fpcommon`] Added the null_values parameter in valid_input_check signature to check in case the input also includes null values. Changed the return statements and function signatures for setup_fptree and generated_itemsets respectively to return the disabled array created and to include it as a parameter. Added code in [`mlxtend.frequent_patterns.fpcommon`] and [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) to implement the algorithms in case null values exist when null_values is True. - [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) Added optional parameter 'return_metrics' to only return a given list of metrics, rather than every possible metric. 
- Add `n_classes_` attribute to stacking classifiers for compatibility with scikit-learn 1.3 ([#1091](https://github.com/rasbt/mlxtend/issues/1091)) diff --git a/docs/sources/user_guide/frequent_patterns/association_rules.ipynb b/docs/sources/user_guide/frequent_patterns/association_rules.ipynb index 3f963b78f..856d44649 100644 --- a/docs/sources/user_guide/frequent_patterns/association_rules.ipynb +++ b/docs/sources/user_guide/frequent_patterns/association_rules.ipynb @@ -122,6 +122,43 @@ " " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generating association rules with the existence of missing values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As it is already implemented in the FP-Growth/FP-Max algorithms, now the corresponding association rules are generated while addressing the issue of missing information in the input. As before, the supports are used from the algorithm and using these the rest of the metrics are re-formulated in a different way. We still use the so-called \"disabled\" array from the original dataframe, where it swaps the null values with ones and the rest with NaNs. For these association rules to make sense, a count corresponding to each sub-case is kept. The cases are when a null value is present in the antecedent, consequent and the combination of both respectively and when there's a NaN in the consequent and all the rest are present in the antecedent and vice versa. 
According to [11], the metrics are re-defined below:\n", + "\n", + "#### 'support':\n", + " $$\\text{Support}(A\\rightarrow C) = \\frac{|B_{AC}|}{|B| - |\\text{Dis}(AC)|}, \\;\\;\\; \\text{range: } [0, 1]$$\n", + "\n", + "where $|B_{AC}|$ is the count of both A and C occurring/existing, $|B|$ is the number of transactions and $|\\text{Dis}(AC)|$ is the count kept if there is a NaN either in A or C, since \n", + "\n", + " $$\\text{Dis}(AC) = \\text{Dis}(A)\\cup\\text{Dis}(C)$$\n", + "\n", + "#### 'confidence':\n", + " $$\\text{Confidence}(A\\rightarrow C) = \\frac{|B_{AC}|}{|B_{A}| - |\\text{Dis}(C)\\cap B_{A}|}, \\;\\;\\; \\text{range: } [0, 1]$$\n", + "\n", + "where $|\\text{Dis}(C)\\cap B_{A}|$ is the count kept if there is a NaN in C AND an occurrence (existence) in A. In the code, this formula has been re-arranged using the supports obtained from the algorithm and is formulated as \n", + " `sAC*(num_itemsets - disAC) / (sA*(num_itemsets - disA) - dis_int)`\n", + "where `sAC*(num_itemsets - disAC)` is the count kept both in A and C, `sA*(num_itemsets - disA)` is the count kept in A and `dis_int` is the term mentioned above.\n", + "\n", + "#### 'representativity':\n", + " $$\\text{Representativity}(A) = \\frac{|B| - |\\text{Dis}(A)|}{|B|}, \\;\\;\\; \\text{range: } [0, 1]$$\n", + "\n", + "- introduced in [11]\n", + "\n", + "A new metric introduced in [11], that essentially represents how much information is present in itemset A across all the transactions in the database.\n", + "\n", + "The rest of the metrics are derived according to re-formulated support and confidence metrics, while their formulas are kept identical as before but given the \"new\" support and confidence." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -151,7 +188,9 @@ "\n", "[9] Berzal Fernando, Blanco Ignacio, Sánchez Daniel, Vila, María-Amparo. Measuring the accuracy and interest of association rules: A new framework. Intelligent Data Analysis, Volume 6, no. 
3, 2002, Pages 221-235.\n", "\n", - "[10] Wu, T., Chen, Y., Han, J. Re-examination of interestingness measures in pattern mining: a unified framework. Data Min Knowl Disc 21, 371–397 (2010). https://doi.org/10.1007/s10618-009-0161-2." + "[10] Wu, T., Chen, Y., Han, J. Re-examination of interestingness measures in pattern mining: a unified framework. Data Min Knowl Disc 21, 371–397 (2010). https://doi.org/10.1007/s10618-009-0161-2.\n", + "\n", + "[11] Ragel, A. and Crémilleux, B., 1998. \"[Treatment of missing values for association rules](https://link.springer.com/chapter/10.1007/3-540-64383-4_22)\". In Research and Development in Knowledge Discovery and Data Mining: Second Pacific-Asia Conference, PAKDD-98 Melbourne, Australia, April 15–17, 1998 Proceedings 2 (pp. 258-270). Springer Berlin Heidelberg." ] }, { @@ -459,6 +498,316 @@ "rules.loc[ ~final_sele ]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 5 -- Generating Association Rules from data with missing information" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_10132\\2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_10132\\2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. 
Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_10132\\2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_10132\\2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_10132\\2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_10132\\2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AppleCornDillEggsIce creamKidney BeansMilkNutmegOnionUnicornYogurt
0FalseFalseFalseTrueFalseTrueTrueTrueNaNFalseTrue
1FalseNaNTrueTrueFalseTrueFalseTrueTrueFalseNaN
2TrueFalseNaNNaNFalseTrueTrueFalseFalseFalseNaN
3FalseTrueFalseFalseFalseTrueNaNFalseFalseTrueTrue
4FalseNaNFalseNaNTrueTrueFalseFalseTrueFalseFalse
\n", + "
" + ], + "text/plain": [ + " Apple Corn Dill Eggs Ice cream Kidney Beans Milk Nutmeg Onion \\\n", + "0 False False False True False True True True NaN \n", + "1 False NaN True True False True False True True \n", + "2 True False NaN NaN False True True False False \n", + "3 False True False False False True NaN False False \n", + "4 False NaN False NaN True True False False True \n", + "\n", + " Unicorn Yogurt \n", + "0 False True \n", + "1 False NaN \n", + "2 False NaN \n", + "3 True True \n", + "4 False False " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from mlxtend.preprocessing import TransactionEncoder\n", + "from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth\n", + "from mlxtend.frequent_patterns import association_rules\n", + "\n", + "\n", + "dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],\n", + " ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],\n", + " ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],\n", + " ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],\n", + " ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]\n", + "\n", + "te = TransactionEncoder()\n", + "te_ary = te.fit(dataset).transform(dataset)\n", + "df = pd.DataFrame(te_ary, columns=te.columns_)\n", + "\n", + "rows, columns = df.shape\n", + "idx = np.random.randint(0, rows, 10)\n", + "col = np.random.randint(0, columns, 10)\n", + "\n", + "for i in range(10):\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below, shows the same implementations as above but with the case when a dataset has missing values. The function still allows you to (1) specify your metric of interest and (2) the according threshold. 
Now we have to set `null_values=True` to both `fpgrowth`/`fpmax` and also include the original `df` and its size as parameters to the function. We try the example below using `metric=\"confidence\"` and `min_threshold=0.8`." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\User\\OneDrive\\Υπολογιστής\\KONSTANTINOS-LAPTOP-C3JUH2VS\\KONSTANTINOS\\PROJECTS\\mlxtend\\mlxtend\\frequent_patterns\\association_rules.py:172: RuntimeWarning: invalid value encountered in divide\n", + " cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftrepresentativityleverageconvictionzhangs_metricjaccardcertaintykulczynski
0(Yogurt)(Kidney Beans)0.6666671.00.6666671.01.00.60.0inf0.00.6666670.00.833333
1(Eggs)(Kidney Beans)0.6666671.00.6666671.01.00.60.0inf0.00.6666670.00.833333
\n", + "
" + ], + "text/plain": [ + " antecedents consequents antecedent support consequent support \\\n", + "0 (Yogurt) (Kidney Beans) 0.666667 1.0 \n", + "1 (Eggs) (Kidney Beans) 0.666667 1.0 \n", + "\n", + " support confidence lift representativity leverage conviction \\\n", + "0 0.666667 1.0 1.0 0.6 0.0 inf \n", + "1 0.666667 1.0 1.0 0.6 0.0 inf \n", + "\n", + " zhangs_metric jaccard certainty kulczynski \n", + "0 0.0 0.666667 0.0 0.833333 \n", + "1 0.0 0.666667 0.0 0.833333 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frequent_itemsets = fpgrowth(df, min_support=0.6, null_values = True, use_colnames=True)\n", + "# frequent_itemsets = fpmax(df, min_support=0.6, null_values = True, use_colnames=True)\n", + "rules = association_rules(frequent_itemsets, df, len(df), null_values = True, metric=\"confidence\", min_threshold=0.8)\n", + "rules" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -480,7 +829,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3.10.6 64-bit", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -494,7 +843,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.7" }, "toc": { "nav_menu": {}, @@ -507,11 +856,6 @@ "toc_position": {}, "toc_section_display": true, "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "bf60121a5b01270aa3349ef43fb139e7fd99866fa1ce176a6b5a191f4d6f9b53" - } } }, "nbformat": 4, diff --git a/docs/sources/user_guide/frequent_patterns/fpgrowth.ipynb b/docs/sources/user_guide/frequent_patterns/fpgrowth.ipynb index e63e88586..279c0e106 100644 --- a/docs/sources/user_guide/frequent_patterns/fpgrowth.ipynb +++ b/docs/sources/user_guide/frequent_patterns/fpgrowth.ipynb @@ -36,7 +36,9 @@ "\n", "In general, the algorithm has been designed to operate on databases containing transactions, such as purchases by customers 
of a store. An itemset is considered as \"frequent\" if it meets a user-specified support threshold. For instance, if the support threshold is set to 0.5 (50%), a frequent itemset is defined as a set of items that occur together in at least 50% of all transactions in the database.\n", "\n", - "In particular, and what makes it different from the Apriori frequent pattern mining algorithm, FP-Growth is an frequent pattern mining algorithm that does not require candidate generation. Internally, it uses a so-called FP-tree (frequent pattern tree) datastrucure without generating the candidate sets explicitly, which makes it particularly attractive for large datasets." + "In particular, and what makes it different from the Apriori frequent pattern mining algorithm, FP-Growth is a frequent pattern mining algorithm that does not require candidate generation. Internally, it uses a so-called FP-tree (frequent pattern tree) data structure without generating the candidate sets explicitly, which makes it particularly attractive for large datasets.\n", + "\n", + "A new feature is implemented in this algorithm, which is the sub-case when the input contains missing information [3]. The same structure and logic of the algorithm is kept, while \"ignoring\" the missing values in the data. That gives a more realistic indication of the frequency of occurrence of the items/itemsets that are generated from the algorithm. The support is computed differently where for a single item, the cardinality of null values is deducted from the cardinality of all transactions in the database. For the case of an itemset of more than one element, the cardinality of null values in at least one item in the itemset is deducted from the cardinality of all transactions in the database. " ] }, { @@ -49,6 +51,8 @@ "\n", "[2] Agrawal, Rakesh, and Ramakrishnan Srikant. \"[Fast algorithms for mining association rules](https://www.it.uu.se/edu/course/homepage/infoutv/ht08/vldb94_rj.pdf).\" Proc. 20th int. conf. 
very large data bases, VLDB. Vol. 1215. 1994.\n", "\n", + "[3] Ragel, A. and Crémilleux, B., 1998. \"[Treatment of missing values for association rules](https://link.springer.com/chapter/10.1007/3-540-64383-4_22)\". In Research and Development in Knowledge Discovery and Data Mining: Second Pacific-Asia Conference, PAKDD-98 Melbourne, Australia, April 15–17, 1998 Proceedings 2 (pp. 258-270). Springer Berlin Heidelberg.\n", + "\n", "## Related\n", "\n", "- [FP-Max](./fpmax.md)\n", @@ -479,6 +483,261 @@ "fpgrowth(df, min_support=0.6, use_colnames=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below implements the algorithm when there is missing information from the data, by arbitrarily removing datapoints from the original dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. 
Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_1940\\3278686283.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AppleCornDillEggsIce creamKidney BeansMilkNutmegOnionUnicornYogurt
0FalseFalseFalseTrueFalseTrueTrueTrueTrueNaNNaN
1FalseNaNTrueTrueFalseTrueFalseTrueTrueFalseTrue
2TrueFalseFalseTrueFalseTrueTrueFalseFalseFalseFalse
3FalseTrueFalseFalseNaNNaNTrueNaNFalseNaNTrue
4FalseTrueFalseTrueNaNTrueFalseFalseNaNFalseFalse
\n", + "
" + ], + "text/plain": [ + " Apple Corn Dill Eggs Ice cream Kidney Beans Milk Nutmeg Onion \\\n", + "0 False False False True False True True True True \n", + "1 False NaN True True False True False True True \n", + "2 True False False True False True True False False \n", + "3 False True False False NaN NaN True NaN False \n", + "4 False True False True NaN True False False NaN \n", + "\n", + " Unicorn Yogurt \n", + "0 NaN NaN \n", + "1 False True \n", + "2 False False \n", + "3 NaN True \n", + "4 False False " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "from mlxtend.frequent_patterns import fpgrowth\n", + "\n", + "rows, columns = df.shape\n", + "idx = np.random.randint(0, rows, 10)\n", + "col = np.random.randint(0, columns, 10)\n", + "\n", + "for i in range(10):\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The same function as above is applied by setting `null_values=True` with at least 60% support:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
supportitemsets
01.0(Kidney Beans)
10.8(Eggs)
20.6(Milk)
31.0(Eggs, Kidney Beans)
\n", + "
" + ], + "text/plain": [ + " support itemsets\n", + "0 1.0 (Kidney Beans)\n", + "1 0.8 (Eggs)\n", + "2 0.6 (Milk)\n", + "3 1.0 (Eggs, Kidney Beans)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fpgrowth(df, min_support=0.6, null_values = True, use_colnames=True)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -677,7 +936,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -691,7 +950,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.12.7" }, "toc": { "nav_menu": {}, diff --git a/docs/sources/user_guide/frequent_patterns/fpmax.ipynb b/docs/sources/user_guide/frequent_patterns/fpmax.ipynb index 4e3d5f538..f4ef6865b 100644 --- a/docs/sources/user_guide/frequent_patterns/fpmax.ipynb +++ b/docs/sources/user_guide/frequent_patterns/fpmax.ipynb @@ -37,7 +37,9 @@ "In contrast to Apriori, [FP-Growth](./fpgrowth.md) is a frequent pattern generation algorithm that inserts items into a pattern search tree, which allows it to have a linear increase in runtime with respect to the number of unique items or entries.\n", "\n", "FP-Max is a variant of FP-Growth, which focuses on obtaining maximal itemsets.\n", - "**An itemset X is said to maximal if X is frequent and there exists no frequent super-pattern containing X.** In other words, a frequent pattern X cannot be sub-pattern of larger frequent pattern to qualify for the definition *maximal itemset*." 
+ "**An itemset X is said to be maximal if X is frequent and there exists no frequent super-pattern containing X.** In other words, a frequent pattern X cannot be a sub-pattern of a larger frequent pattern to qualify for the definition *maximal itemset*.\n", + "\n", + "Same as in [FP-Growth](./fpgrowth.md), a new feature is implemented in this algorithm, which is the sub-case when the input contains missing information [2]. The same structure and logic of the algorithm is kept, while \"ignoring\" the missing values in the data. That gives a more realistic indication of the frequency of occurrence of the items/itemsets that are generated from the algorithm. The support is computed differently where for a single item, the cardinality of null values is deducted from the cardinality of all transactions in the database. For the case of an itemset of more than one element, the cardinality of null values in at least one item in the itemset is deducted from the cardinality of all transactions in the database." ] }, { @@ -47,6 +49,7 @@ "## References\n", "\n", "- [1] Grahne, G., & Zhu, J. (2003, November). Efficiently using prefix-trees in mining frequent itemsets. In FIMI (Vol. 90).\n", + "- [2] Ragel, A. and Crémilleux, B., 1998. \"[Treatment of missing values for association rules](https://link.springer.com/chapter/10.1007/3-540-64383-4_22)\". In Research and Development in Knowledge Discovery and Data Mining: Second Pacific-Asia Conference, PAKDD-98 Melbourne, Australia, April 15–17, 1998 Proceedings 2 (pp. 258-270). Springer Berlin Heidelberg.\n", "\n", "## Related\n", "\n", @@ -382,6 +385,243 @@ "fpmax(df, min_support=0.6, use_colnames=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below implements the algorithm when there is missing information from the data, by arbitrarily removing datapoints from the original dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. 
Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_2788\\2505244757.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n", + " df.iloc[idx[i], col[i]] = np.nan\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AppleCornDillEggsIce creamKidney BeansMilkNutmegOnionUnicornYogurt
0FalseFalseFalseTrueFalseTrueNaNTrueTrueFalseTrue
1FalseNaNTrueTrueNaNTrueFalseTrueTrueFalseNaN
2NaNFalseFalseTrueFalseTrueNaNFalseNaNNaNFalse
3FalseTrueFalseFalseFalseTrueTrueFalseFalseTrueTrue
4FalseTrueFalseTrueTrueTrueFalseFalseNaNFalseFalse
\n", + "
" + ], + "text/plain": [ + " Apple Corn Dill Eggs Ice cream Kidney Beans Milk Nutmeg Onion \\\n", + "0 False False False True False True NaN True True \n", + "1 False NaN True True NaN True False True True \n", + "2 NaN False False True False True NaN False NaN \n", + "3 False True False False False True True False False \n", + "4 False True False True True True False False NaN \n", + "\n", + " Unicorn Yogurt \n", + "0 False True \n", + "1 False NaN \n", + "2 NaN False \n", + "3 True True \n", + "4 False False " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "from mlxtend.frequent_patterns import fpmax\n", + "\n", + "rows, columns = df.shape\n", + "idx = np.random.randint(0, rows, 10)\n", + "col = np.random.randint(0, columns, 10)\n", + "\n", + "for i in range(10):\n", + " df.iloc[idx[i], col[i]] = np.nan\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The same function as above is applied by setting `null_values=True` with at least 60% support:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
supportitemsets
00.666667(Eggs, Kidney Beans, Onion)
\n", + "
" + ], + "text/plain": [ + " support itemsets\n", + "0 0.666667 (Eggs, Kidney Beans, Onion)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fpmax(df, min_support=0.6, null_values = True, use_colnames=True)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -499,7 +739,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -513,7 +753,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.12.7" }, "toc": { "nav_menu": {}, diff --git a/mlxtend/feature_selection/column_selector.py b/mlxtend/feature_selection/column_selector.py index e09c35fdc..39fcc2816 100644 --- a/mlxtend/feature_selection/column_selector.py +++ b/mlxtend/feature_selection/column_selector.py @@ -78,7 +78,7 @@ def transform(self, X, y=None): # We use the loc or iloc accessor if the input is a pandas dataframe if hasattr(X, "loc") or hasattr(X, "iloc"): - if type(self.cols) == tuple: + if isinstance(self.cols, tuple): self.cols = list(self.cols) types = {type(i) for i in self.cols} if len(types) > 1: diff --git a/mlxtend/frequent_patterns/association_rules.py b/mlxtend/frequent_patterns/association_rules.py index 16cba9b3d..c3ca9c249 100644 --- a/mlxtend/frequent_patterns/association_rules.py +++ b/mlxtend/frequent_patterns/association_rules.py @@ -9,16 +9,20 @@ # License: BSD 3 clause from itertools import combinations +from typing import Optional import numpy as np import pandas as pd +from ..frequent_patterns import fpcommon as fpc + _metrics = [ "antecedent support", "consequent support", "support", "confidence", "lift", + "representativity", "leverage", "conviction", "zhangs_metric", @@ -30,6 +34,9 @@ def association_rules( df: pd.DataFrame, + num_itemsets: int, + df_orig: Optional[pd.DataFrame] = None, + null_values=False, metric="confidence", 
min_threshold=0.8, support_only=False, @@ -44,6 +51,15 @@ def association_rules( pandas DataFrame of frequent itemsets with columns ['support', 'itemsets'] + df_orig : pandas DataFrame (default: None) + DataFrame with original input data. Required when `null_values=True`. + + num_itemsets : int + Number of transactions in original input data + + null_values : bool (default: False) + In case there are null values as NaNs in the original input data + metric : string (default: 'confidence') Metric to evaluate if a rule is of interest. **Automatically set to 'support' if `support_only=True`.** @@ -99,6 +115,13 @@ def association_rules( https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/ """ + # if null values exist, df_orig must be provided + if null_values and df_orig is None: + raise TypeError("If null values exist, df_orig must be provided.") + + # check for valid input + fpc.valid_input_check(df_orig, null_values) + if not df.shape[0]: raise ValueError( "The input DataFrame `df` containing " "the frequent itemsets is empty." 
@@ -111,31 +134,28 @@ def association_rules( columns 'support' and 'itemsets'" ) - def kulczynski_helper(sAC, sA, sC): - conf_AC = sAC / sA - conf_CA = sAC / sC + def kulczynski_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_): + conf_AC = sAC * (num_itemsets - disAC) / (sA * (num_itemsets - disA) - dis_int) + conf_CA = sAC * (num_itemsets - disAC) / (sC * (num_itemsets - disC) - dis_int_) kulczynski = (conf_AC + conf_CA) / 2 return kulczynski - def conviction_helper(sAC, sA, sC): - confidence = sAC / sA - conviction = np.empty(confidence.shape, dtype=float) + def conviction_helper(conf, sC): + conviction = np.empty(conf.shape, dtype=float) if not len(conviction.shape): conviction = conviction[np.newaxis] - confidence = confidence[np.newaxis] - sAC = sAC[np.newaxis] - sA = sA[np.newaxis] + conf = conf[np.newaxis] sC = sC[np.newaxis] conviction[:] = np.inf - conviction[confidence < 1.0] = (1.0 - sC[confidence < 1.0]) / ( - 1.0 - confidence[confidence < 1.0] - ) + conviction[conf < 1.0] = (1.0 - sC[conf < 1.0]) / (1.0 - conf[conf < 1.0]) return conviction - def zhangs_metric_helper(sAC, sA, sC): + def zhangs_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_): denominator = np.maximum(sAC * (1 - sA), sA * (sC - sAC)) - numerator = metric_dict["leverage"](sAC, sA, sC) + numerator = metric_dict["leverage"]( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ) with np.errstate(divide="ignore", invalid="ignore"): # ignoring the divide by 0 warning since it is addressed in the below np.where @@ -143,15 +163,20 @@ def zhangs_metric_helper(sAC, sA, sC): return zhangs_metric - def jaccard_metric_helper(sAC, sA, sC): - numerator = metric_dict["support"](sAC, sA, sC) + def jaccard_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_): + numerator = metric_dict["support"]( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ) denominator = sA + sC - numerator jaccard_metric = numerator / denominator return jaccard_metric - def 
certainty_metric_helper(sAC, sA, sC): - certainty_num = metric_dict["confidence"](sAC, sA, sC) - sC + def certainty_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_): + certainty_num = ( + metric_dict["confidence"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_) + - sC + ) certainty_denom = 1 - sC cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom) @@ -159,17 +184,43 @@ def certainty_metric_helper(sAC, sA, sC): # metrics for association rules metric_dict = { - "antecedent support": lambda _, sA, __: sA, - "consequent support": lambda _, __, sC: sC, - "support": lambda sAC, _, __: sAC, - "confidence": lambda sAC, sA, _: sAC / sA, - "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC) / sC, - "leverage": lambda sAC, sA, sC: metric_dict["support"](sAC, sA, sC) - sA * sC, - "conviction": lambda sAC, sA, sC: conviction_helper(sAC, sA, sC), - "zhangs_metric": lambda sAC, sA, sC: zhangs_metric_helper(sAC, sA, sC), - "jaccard": lambda sAC, sA, sC: jaccard_metric_helper(sAC, sA, sC), - "certainty": lambda sAC, sA, sC: certainty_metric_helper(sAC, sA, sC), - "kulczynski": lambda sAC, sA, sC: kulczynski_helper(sAC, sA, sC), + "antecedent support": lambda _, sA, ___, ____, _____, ______, _______, ________: sA, + "consequent support": lambda _, __, sC, ____, _____, ______, _______, ________: sC, + "support": lambda sAC, _, __, ___, ____, _____, ______, _______: sAC, + "confidence": lambda sAC, sA, _, disAC, disA, __, dis_int, ___: ( + sAC * (num_itemsets - disAC) + ) + / (sA * (num_itemsets - disA) - dis_int), + "lift": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: metric_dict[ + "confidence" + ](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_) + / sC, + "representativity": lambda _, __, ___, disAC, ____, ______, _______, ________: ( + num_itemsets - disAC + ) + / num_itemsets, + "leverage": lambda sAC, sA, sC, _, __, ____, _____, ______: metric_dict[ + "support" + ](sAC, sA, sC, disAC, disA, disC, 
dis_int, dis_int_) + - sA * sC, + "conviction": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: conviction_helper( + metric_dict["confidence"]( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ), + sC, + ), + "zhangs_metric": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: zhangs_metric_helper( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ), + "jaccard": lambda sAC, sA, sC, _, __, ____, _____, ______: jaccard_metric_helper( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ), + "certainty": lambda sAC, sA, sC, _, __, ____, _____, ______: certainty_metric_helper( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ), + "kulczynski": lambda sAC, sA, sC, _, __, ____, _____, ______: kulczynski_helper( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ), } # check for metric compliance @@ -192,6 +243,23 @@ def certainty_metric_helper(sAC, sA, sC): rule_consequents = [] rule_supports = [] + # Define the disabled df, assign columns from original df to be the same on the disabled. 
+ if null_values: + disabled = df_orig.copy() + disabled = np.where(pd.isna(disabled), 1, np.nan) + np.where( + (disabled == 0) | (disabled == 1), np.nan, 0 + ) + disabled = pd.DataFrame(disabled) + if all(isinstance(key, str) for key in list(frequent_items_dict.keys())[0]): + disabled.columns = df_orig.columns + + if all( + isinstance(key, np.int64) for key in list(frequent_items_dict.keys())[0] + ): + cols = np.arange(0, len(df_orig.columns), 1) + disabled.columns = cols + df_orig.columns = cols + # iterate over all frequent itemsets for k in frequent_items_dict.keys(): sAC = frequent_items_dict[k] @@ -207,11 +275,64 @@ def certainty_metric_helper(sAC, sA, sC): # hence, placeholders should suffice sA = None sC = None + disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0 else: try: sA = frequent_items_dict[antecedent] sC = frequent_items_dict[consequent] + + # if the input dataframe is complete + if not null_values: + disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0 + num_itemsets = 1 + + else: + an = list(antecedent) + con = list(consequent) + an.extend(con) + + # select data of antecedent, consequent and combined from disabled + dec = disabled.loc[:, an] + _dec = disabled.loc[:, list(antecedent)] + __dec = disabled.loc[:, list(consequent)] + + # select data of antecedent and consequent from original + dec_ = df_orig.loc[:, list(antecedent)] + dec__ = df_orig.loc[:, list(consequent)] + + # disabled counts + disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0 + for i in range(len(dec.index)): + # select the i-th iset from the disabled dataset + item_comb = list(dec.iloc[i, :]) + item_dis_an = list(_dec.iloc[i, :]) + item_dis_con = list(__dec.iloc[i, :]) + + # select the i-th iset from the original dataset + item_or_an = list(dec_.iloc[i, :]) + item_or_con = list(dec__.iloc[i, :]) + + # check and keep count if there is a null value in combined, antecedent, consequent + if 1 in set(item_comb): + disAC += 1 + if 1 in set(item_dis_an): + disA += 1 + if 1 in 
item_dis_con: + disC += 1 + + # check and keep count if there is a null value in consequent AND all items are present in antecedent + if (1 in item_dis_con) and all( + j == 1 for j in item_or_an + ): + dis_int += 1 + + # check and keep count if there is a null value in antecedent AND all items are present in consequent + if (1 in item_dis_an) and all( + j == 1 for j in item_or_con + ): + dis_int_ += 1 + except KeyError as e: s = ( str(e) + "You are likely getting this error" @@ -224,11 +345,15 @@ def certainty_metric_helper(sAC, sA, sC): raise KeyError(s) # check for the threshold - score = metric_dict[metric](sAC, sA, sC) + score = metric_dict[metric]( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ) if score >= min_threshold: rule_antecedents.append(antecedent) rule_consequents.append(consequent) - rule_supports.append([sAC, sA, sC]) + rule_supports.append( + [sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_] + ) # check if frequent rule was generated if not rule_supports: @@ -252,7 +377,15 @@ def certainty_metric_helper(sAC, sA, sC): sAC = rule_supports[0] sA = rule_supports[1] sC = rule_supports[2] + disAC = rule_supports[3] + disA = rule_supports[4] + disC = rule_supports[5] + dis_int = rule_supports[6] + dis_int_ = rule_supports[7] + for m in return_metrics: - df_res[m] = metric_dict[m](sAC, sA, sC) + df_res[m] = metric_dict[m]( + sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_ + ) return df_res diff --git a/mlxtend/frequent_patterns/fpcommon.py b/mlxtend/frequent_patterns/fpcommon.py index b1a514cd1..fb0ed2ac3 100644 --- a/mlxtend/frequent_patterns/fpcommon.py +++ b/mlxtend/frequent_patterns/fpcommon.py @@ -25,9 +25,16 @@ def setup_fptree(df, min_support): # support of each individual item # if itemsets is sparse, np.sum returns an np.matrix of shape (1, N) - item_support = np.array(np.sum(itemsets, axis=0) / float(num_itemsets)) + disabled = df.copy() + disabled = np.where(pd.isna(disabled), 1, np.nan) + np.where( + (disabled == 0) | (disabled == 
1), np.nan, 0 + ) + + item_support = np.array( + np.sum(np.logical_or(df.values == 1, df.values is True), axis=0) + / (float(num_itemsets) - np.nansum(disabled, axis=0)) + ) item_support = item_support.reshape(-1) - items = np.nonzero(item_support >= min_support)[0] # Define ordering on items for inserting into FPTree @@ -58,17 +65,50 @@ def setup_fptree(df, min_support): itemset.sort(key=rank.get, reverse=True) tree.insert_itemset(itemset) - return tree, rank + return tree, disabled, rank -def generate_itemsets(generator, num_itemsets, colname_map): +def generate_itemsets(generator, df, disabled, min_support, num_itemsets, colname_map): itemsets = [] supports = [] for sup, iset in generator: itemsets.append(frozenset(iset)) - supports.append(sup / num_itemsets) + # select data of iset from disabled dataset + dec = disabled[:, iset] + # select data of iset from original dataset + _dec = df.values[:, iset] + + # case if iset only has one element + if len(iset) == 1: + supports.append((sup - np.nansum(dec)) / (num_itemsets - np.nansum(dec))) + + # case if iset has multiple elements + elif len(iset) > 1: + denom = 0 + num = 0 + for i in range(dec.shape[0]): + # select the i-th iset from disabled dataset + item_dsbl = list(dec[i, :]) + # select the i-th iset from original dataset + item_orig = list(_dec[i, :]) + + # check and keep count if there is a null value in iset of disabled + if 1 in set(item_dsbl): + denom += 1 + + # check and keep count if item doesn't exist OR all values are null in iset of original + if (0 not in set(item_orig)) or ( + all(np.isnan(x) for x in item_orig) + ): + num -= 1 + + if num_itemsets - denom == 0: + supports.append(0) + else: + supports.append((sup + num) / (num_itemsets - denom)) res_df = pd.DataFrame({"support": supports, "itemsets": itemsets}) + res_df = res_df[res_df["support"] >= min_support] if colname_map is not None: res_df["itemsets"] = res_df["itemsets"].apply( @@ -78,7 +118,11 @@ def generate_itemsets(generator, 
num_itemsets, colname_map): return res_df -def valid_input_check(df): +def valid_input_check(df, null_values=False): + # Return early if df is None + if df is None: + return + if f"{type(df)}" == "": msg = ( "SparseDataFrame support has been deprecated in pandas 1.0," @@ -104,7 +148,15 @@ def valid_input_check(df): ) # Fast path: if all columns are boolean, there is nothing to checks - all_bools = df.dtypes.apply(pd.api.types.is_bool_dtype).all() + if null_values: + all_bools = ( + df.apply(lambda col: col.apply(lambda x: pd.isna(x) or isinstance(x, bool))) + .all() + .all() + ) + else: + all_bools = df.dtypes.apply(pd.api.types.is_bool_dtype).all() + if not all_bools: warnings.warn( "DataFrames with non-bool types result in worse computational" @@ -112,6 +164,20 @@ def valid_input_check(df): "Please use a DataFrame with bool type", DeprecationWarning, ) + + # If null_values is True but no NaNs are found, issue a warning + has_nans = pd.isna(df).any().any() + if null_values and not has_nans: + warnings.warn( + "null_values=True is inefficient when there are no NaN values in the DataFrame." + "Set null_values=False for faster output." + ) + # If null_values is False but NaNs are found, raise an error + if not null_values and has_nans: + raise ValueError( + "NaN values are not permitted in the DataFrame when null_values=False." 
+ ) + # Pandas is much slower than numpy, so use np.where on Numpy arrays if hasattr(df, "sparse"): if df.size == 0: @@ -120,7 +186,13 @@ def valid_input_check(df): values = df.sparse.to_coo().tocoo().data else: values = df.values - idxs = np.where((values != 1) & (values != 0)) + + # Ignore NaNs if null_values is True + if null_values: + idxs = np.where((values != 1) & (values != 0) & (~np.isnan(values))) + else: + idxs = np.where((values != 1) & (values != 0)) + if len(idxs[0]) > 0: # idxs has 1 dimension with sparse data and 2 with dense data val = values[tuple(loc[0] for loc in idxs)] @@ -128,6 +200,12 @@ def valid_input_check(df): "The allowed values for a DataFrame" " are True, False, 0, 1. Found value %s" % (val) ) + + if null_values: + s = ( + "The allowed values for a DataFrame" + " are True, False, 0, 1, NaN. Found value %s" % (val) + ) raise ValueError(s) diff --git a/mlxtend/frequent_patterns/fpgrowth.py b/mlxtend/frequent_patterns/fpgrowth.py index f2f4914dd..669672278 100644 --- a/mlxtend/frequent_patterns/fpgrowth.py +++ b/mlxtend/frequent_patterns/fpgrowth.py @@ -9,7 +9,9 @@ from ..frequent_patterns import fpcommon as fpc -def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): +def fpgrowth( + df, min_support=0.5, null_values=False, use_colnames=False, max_len=None, verbose=0 +): """Get frequent itemsets from a one-hot DataFrame Parameters @@ -42,6 +44,9 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): The support is computed as the fraction transactions_where_item(s)_occur / total_transactions. + null_values : bool (default: False) + In case there are null values as NaNs in the original input data + use_colnames : bool (default: False) If true, uses the DataFrames' column names in the returned DataFrame instead of column indices. 
@@ -70,7 +75,7 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/ """ - fpc.valid_input_check(df) + fpc.valid_input_check(df, null_values) if min_support <= 0.0: raise ValueError( @@ -83,11 +88,13 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): if use_colnames: colname_map = {idx: item for idx, item in enumerate(df.columns)} - tree, _ = fpc.setup_fptree(df, min_support) + tree, disabled, _ = fpc.setup_fptree(df, min_support) minsup = math.ceil(min_support * len(df.index)) # min support as count generator = fpg_step(tree, minsup, colname_map, max_len, verbose) - return fpc.generate_itemsets(generator, len(df.index), colname_map) + return fpc.generate_itemsets( + generator, df, disabled, min_support, len(df.index), colname_map + ) def fpg_step(tree, minsup, colnames, max_len, verbose): diff --git a/mlxtend/frequent_patterns/fpmax.py b/mlxtend/frequent_patterns/fpmax.py index c22db3f9a..db1b3ac45 100644 --- a/mlxtend/frequent_patterns/fpmax.py +++ b/mlxtend/frequent_patterns/fpmax.py @@ -9,7 +9,9 @@ from ..frequent_patterns import fpcommon as fpc -def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): +def fpmax( + df, min_support=0.5, null_values=False, use_colnames=False, max_len=None, verbose=0 +): """Get maximal frequent itemsets from a one-hot DataFrame Parameters @@ -43,6 +45,9 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): The support is computed as the fraction transactions_where_item(s)_occur / total_transactions. + null_values : bool (default: False) + In case there are null values as NaNs in the original input data + use_colnames : bool (default: False) If true, uses the DataFrames' column names in the returned DataFrame instead of column indices. 
@@ -72,7 +77,7 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/ """ - fpc.valid_input_check(df) + fpc.valid_input_check(df, null_values) if min_support <= 0.0: raise ValueError( @@ -85,12 +90,14 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): if use_colnames: colname_map = {idx: item for idx, item in enumerate(df.columns)} - tree, rank = fpc.setup_fptree(df, min_support) + tree, disabled, rank = fpc.setup_fptree(df, min_support) minsup = math.ceil(min_support * len(df)) # min support as count generator = fpmax_step(tree, minsup, MFITree(rank), colname_map, max_len, verbose) - return fpc.generate_itemsets(generator, len(df), colname_map) + return fpc.generate_itemsets( + generator, df, disabled, min_support, len(df), colname_map + ) def fpmax_step(tree, minsup, mfit, colnames, max_len, verbose): diff --git a/mlxtend/frequent_patterns/tests/test_association_rules.py b/mlxtend/frequent_patterns/tests/test_association_rules.py index 1035183c9..77c6e19e3 100644 --- a/mlxtend/frequent_patterns/tests/test_association_rules.py +++ b/mlxtend/frequent_patterns/tests/test_association_rules.py @@ -3,7 +3,7 @@ import pytest from numpy.testing import assert_raises as numpy_assert_raises -from mlxtend.frequent_patterns import apriori, association_rules +from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth one_ary = np.array( [ @@ -43,6 +43,7 @@ "support", "confidence", "lift", + "representativity", "leverage", "conviction", "zhangs_metric", @@ -54,7 +55,7 @@ # fmt: off def test_default(): - res_df = association_rules(df_freq_items) + res_df = association_rules(df_freq_items, len(df)) res_df["antecedents"] = res_df["antecedents"].apply(lambda x: str(frozenset(x))) res_df["consequents"] = res_df["consequents"].apply(lambda x: str(frozenset(x))) res_df.sort_values(columns_ordered, inplace=True) @@ -62,15 +63,15 @@ def 
test_default(): expect = pd.DataFrame( [ - [(8,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], - [(6,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], - [(8, 3), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], - [(8, 5), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf, 0.5, 0.75, 1.0, 0.875], - [(8,), (3, 5), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf, 0.5, 0.75, 1.0, 0.875], - [(3,), (5,), 0.8, 1.0, 0.8, 1.0, 1.0, 0.0, np.inf, 0, 0.8, 0.0, 0.9], - [(5,), (3,), 1.0, 0.8, 0.8, 0.8, 1.0, 0.0, 1.0, 0.0, 0.8, 0.0, 0.9], - [(10,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], - [(8,), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf, 0.5, 0.75, 1.0, 0.875], + [(8,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], + [(6,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], + [(8, 3), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], + [(8, 5), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 1.0, 0.12, np.inf, 0.5, 0.75, 1.0, 0.875], + [(8,), (3, 5), 0.6, 0.8, 0.6, 1.0, 1.25, 1.0, 0.12, np.inf, 0.5, 0.75, 1.0, 0.875], + [(3,), (5,), 0.8, 1.0, 0.8, 1.0, 1.0, 1.0, 0.0, np.inf, 0, 0.8, 0.0, 0.9], + [(5,), (3,), 1.0, 0.8, 0.8, 0.8, 1.0, 1.0, 0.0, 1.0, 0.0, 0.8, 0.0, 0.9], + [(10,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 1.0, 0.0, np.inf, 0, 0.6, 0.0, 0.8], + [(8,), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 1.0, 0.12, np.inf, 0.5, 0.75, 1.0, 0.875], ], columns=columns_ordered, @@ -84,8 +85,199 @@ def test_default(): # fmt: on +def test_nullability(): + rows, columns = df.shape + nan_idxs = list(range(rows)) + list(range(3, 0, -1)) + list(range(3)) + for i, j in zip(nan_idxs, range(columns)): + df.iloc[i, j] = np.nan + + df_fp_items = fpgrowth(df, min_support=0.6, null_values=True) + res_df = association_rules( + df_fp_items, len(df), df, null_values=True, min_threshold=0.6 + ) + res_df["antecedents"] = res_df["antecedents"].apply(lambda x: str(frozenset(x))) + res_df["consequents"] = 
res_df["consequents"].apply(lambda x: str(frozenset(x))) + res_df.sort_values(columns_ordered, inplace=True) + res_df.reset_index(inplace=True, drop=True) + res_df = round(res_df, 3) + + expect = pd.DataFrame( + [ + [ + (10, 3), + (5,), + 0.667, + 1.0, + 0.667, + 1.0, + 1.0, + 0.6, + 0.0, + np.inf, + 0, + 0.667, + 0, + 0.833, + ], + [ + (10, 5), + (3,), + 0.667, + 1.0, + 0.667, + 1.0, + 1.0, + 0.6, + 0.0, + np.inf, + 0, + 0.667, + 0.0, + 0.833, + ], + [ + (10,), + (3, 5), + 0.75, + 1.0, + 0.667, + 1.0, + 1.0, + 0.6, + -0.083, + np.inf, + -0.333, + 0.615, + 0.0, + 0.833, + ], + [ + (10,), + (3,), + 0.75, + 1.0, + 0.667, + 1.0, + 1.0, + 0.6, + -0.083, + np.inf, + -0.333, + 0.615, + 0.0, + 0.833, + ], + [ + (10,), + (5,), + 0.75, + 1.0, + 0.667, + 1.0, + 1.0, + 0.6, + -0.083, + np.inf, + -0.333, + 0.615, + 0, + 0.833, + ], + [ + (3, 5), + (10,), + 1.0, + 0.75, + 0.667, + 0.667, + 0.889, + 0.6, + -0.083, + 0.75, + -1.0, + 0.615, + -0.333, + 0.833, + ], + [ + (3,), + (10, 5), + 1.0, + 0.667, + 0.667, + 0.667, + 1.0, + 0.6, + 0.0, + 1.0, + 0, + 0.667, + 0.0, + 0.833, + ], + [ + (3,), + (10,), + 1.0, + 0.75, + 0.667, + 0.667, + 0.889, + 0.6, + -0.083, + 0.75, + -1.0, + 0.615, + -0.333, + 0.833, + ], + [(3,), (5,), 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.0, np.inf, 0, 1.0, 0, 1.0], + [ + (5,), + (10, 3), + 1.0, + 0.667, + 0.667, + 0.667, + 1.0, + 0.6, + 0.0, + 1.0, + 0, + 0.667, + 0, + 0.833, + ], + [ + (5,), + (10,), + 1.0, + 0.75, + 0.667, + 0.667, + 0.889, + 0.6, + -0.083, + 0.75, + -1.0, + 0.615, + -0.333, + 0.833, + ], + [(5,), (3,), 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.0, np.inf, 0, 1.0, 0.0, 1.0], + ], + columns=columns_ordered, + ) + + expect["antecedents"] = expect["antecedents"].apply(lambda x: str(frozenset(x))) + expect["consequents"] = expect["consequents"].apply(lambda x: str(frozenset(x))) + expect.sort_values(columns_ordered, inplace=True) + expect.reset_index(inplace=True, drop=True) + assert res_df.equals(expect), res_df + + def test_datatypes(): - res_df = 
association_rules(df_freq_items) + res_df = association_rules(df_freq_items, len(df)) for i in res_df["antecedents"]: assert isinstance(i, frozenset) is True @@ -100,7 +292,7 @@ def test_datatypes(): lambda x: set(x) ) - res_df = association_rules(df_freq_items) + res_df = association_rules(df_freq_items, len(df)) for i in res_df["antecedents"]: assert isinstance(i, frozenset) is True @@ -110,16 +302,18 @@ def test_datatypes(): def test_no_support_col(): df_no_support_col = df_freq_items.loc[:, ["itemsets"]] - numpy_assert_raises(ValueError, association_rules, df_no_support_col) + numpy_assert_raises(ValueError, association_rules, df_no_support_col, len(df)) def test_no_itemsets_col(): df_no_itemsets_col = df_freq_items.loc[:, ["support"]] - numpy_assert_raises(ValueError, association_rules, df_no_itemsets_col) + numpy_assert_raises(ValueError, association_rules, df_no_itemsets_col, len(df)) def test_wrong_metric(): - numpy_assert_raises(ValueError, association_rules, df_freq_items, "unicorn") + numpy_assert_raises( + ValueError, association_rules, df_freq_items, len(df), None, False, "unicorn" + ) def test_empty_result(): @@ -132,6 +326,7 @@ def test_empty_result(): "support", "confidence", "lift", + "representativity", "leverage", "conviction", "zhangs_metric", @@ -140,82 +335,109 @@ def test_empty_result(): "kulczynski", ] ) - res_df = association_rules(df_freq_items, min_threshold=2) + res_df = association_rules(df_freq_items, len(df), min_threshold=2) assert res_df.equals(expect) def test_leverage(): - res_df = association_rules(df_freq_items, min_threshold=0.1, metric="leverage") + res_df = association_rules( + df_freq_items, len(df), min_threshold=0.1, metric="leverage" + ) assert res_df.values.shape[0] == 6 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=0.1, metric="leverage" + df_freq_items_with_colnames, len(df), min_threshold=0.1, metric="leverage" ) assert res_df.values.shape[0] == 6 def test_conviction(): - res_df = 
association_rules(df_freq_items, min_threshold=1.5, metric="conviction") + res_df = association_rules( + df_freq_items, len(df), min_threshold=1.5, metric="conviction" + ) assert res_df.values.shape[0] == 11 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=1.5, metric="conviction" + df_freq_items_with_colnames, len(df), min_threshold=1.5, metric="conviction" ) assert res_df.values.shape[0] == 11 def test_lift(): - res_df = association_rules(df_freq_items, min_threshold=1.1, metric="lift") + res_df = association_rules(df_freq_items, len(df), min_threshold=1.1, metric="lift") assert res_df.values.shape[0] == 6 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=1.1, metric="lift" + df_freq_items_with_colnames, len(df), min_threshold=1.1, metric="lift" ) assert res_df.values.shape[0] == 6 def test_confidence(): - res_df = association_rules(df_freq_items, min_threshold=0.8, metric="confidence") + res_df = association_rules( + df_freq_items, len(df), min_threshold=0.8, metric="confidence" + ) assert res_df.values.shape[0] == 9 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=0.8, metric="confidence" + df_freq_items_with_colnames, len(df), min_threshold=0.8, metric="confidence" ) assert res_df.values.shape[0] == 9 +def test_representativity(): + res_df = association_rules( + df_freq_items, len(df), min_threshold=1.0, metric="representativity" + ) + assert res_df.values.shape[0] == 16 + + res_df = association_rules( + df_freq_items_with_colnames, + len(df), + min_threshold=1.0, + metric="representativity", + ) + assert res_df.values.shape[0] == 16 + + def test_jaccard(): - res_df = association_rules(df_freq_items, min_threshold=0.7, metric="jaccard") + res_df = association_rules( + df_freq_items, len(df), min_threshold=0.7, metric="jaccard" + ) assert res_df.values.shape[0] == 8 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=0.7, metric="jaccard" + df_freq_items_with_colnames, 
len(df), min_threshold=0.7, metric="jaccard" ) assert res_df.values.shape[0] == 8 def test_certainty(): - res_df = association_rules(df_freq_items, metric="certainty", min_threshold=0.6) + res_df = association_rules( + df_freq_items, len(df), metric="certainty", min_threshold=0.6 + ) assert res_df.values.shape[0] == 3 res_df = association_rules( - df_freq_items_with_colnames, metric="certainty", min_threshold=0.6 + df_freq_items_with_colnames, len(df), metric="certainty", min_threshold=0.6 ) assert res_df.values.shape[0] == 3 def test_kulczynski(): - res_df = association_rules(df_freq_items, metric="kulczynski", min_threshold=0.9) + res_df = association_rules( + df_freq_items, len(df), metric="kulczynski", min_threshold=0.9 + ) assert res_df.values.shape[0] == 2 res_df = association_rules( - df_freq_items_with_colnames, metric="kulczynski", min_threshold=0.6 + df_freq_items_with_colnames, len(df), metric="kulczynski", min_threshold=0.6 ) assert res_df.values.shape[0] == 16 def test_frozenset_selection(): - res_df = association_rules(df_freq_items) + res_df = association_rules(df_freq_items, len(df)) sel = res_df[res_df["consequents"] == frozenset((3, 5))] assert sel.values.shape[0] == 1 @@ -231,17 +453,17 @@ def test_frozenset_selection(): def test_override_metric_with_support(): - res_df = association_rules(df_freq_items_with_colnames, min_threshold=0.8) + res_df = association_rules(df_freq_items_with_colnames, len(df), min_threshold=0.8) # default metric is confidence assert res_df.values.shape[0] == 9 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=0.8, metric="support" + df_freq_items_with_colnames, len(df), min_threshold=0.8, metric="support" ) assert res_df.values.shape[0] == 2 res_df = association_rules( - df_freq_items_with_colnames, min_threshold=0.8, support_only=True + df_freq_items_with_colnames, len(df), min_threshold=0.8, support_only=True ) assert res_df.values.shape[0] == 2 @@ -272,9 +494,9 @@ def 
test_on_df_with_missing_entries(): ], } - df = pd.DataFrame(dict) + df_missing = pd.DataFrame(dict) - numpy_assert_raises(KeyError, association_rules, df) + numpy_assert_raises(KeyError, association_rules, df_missing, len(df)) def test_on_df_with_missing_entries_support_only(): @@ -303,14 +525,16 @@ def test_on_df_with_missing_entries_support_only(): ], } - df = pd.DataFrame(dict) - df_result = association_rules(df, support_only=True, min_threshold=0.1) + df_missing = pd.DataFrame(dict) + df_result = association_rules( + df_missing, len(df), support_only=True, min_threshold=0.1 + ) assert df_result["support"].shape == (18,) assert int(np.isnan(df_result["support"].values).any()) != 1 def test_with_empty_dataframe(): - df = df_freq_items_with_colnames.iloc[:0] + df_freq = df_freq_items_with_colnames.iloc[:0] with pytest.raises(ValueError): - association_rules(df) + association_rules(df_freq, len(df)) From a78bd0bfe4f1f6fab0723a7da817bba107ff6650 Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Sun, 3 Nov 2024 22:22:20 +0100 Subject: [PATCH 2/7] SFS finalize_fit() support for numpy >= 2.0 (#1107) * finalize_fit() support for numpy >= 2.0 * updated changelog * flake8: removed blank line * flake8: removed trailing whitespaces * fixed changelog indents * fix for isort / black formatter * added change/feature description --- docs/sources/CHANGELOG.md | 6 ++++-- mlxtend/feature_selection/sequential_feature_selector.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index efbf4fe61..3a0cca67c 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -23,9 +23,11 @@ Files updated: - ['mlxtend.frequent_patterns.fpcommon'] - ['mlxtend.frequent_patterns.fpgrowth'](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/) - 
['mlxtend.frequent_patterns.fpmax'](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/) + - [`mlxtend.feature_selection.SequentialFeatureSelector`](https://github.com/rasbt/mlxtend/blob/master/mlxtend/feature_selection/sequential_feature_selector.py) + - Updated negative infinity constant to be compatible with old and new (>=2.0) `numpy` versions - [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) -- [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/)Implemented three new metrics: Jaccard, Certainty, and Kulczynski. ([#1096](https://github.com/rasbt/mlxtend/issues/1096)) -- Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via[it176131](https://github.com/it176131)) + - Implemented three new metrics: Jaccard, Certainty, and Kulczynski. ([#1096](https://github.com/rasbt/mlxtend/issues/1096)) + - Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via [it176131](https://github.com/it176131)) ##### Changes diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index 0a766ed16..590af7b81 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -651,7 +651,11 @@ def fit(self, X, y, groups=None, **fit_params): return self def finalize_fit(self): - max_score = np.NINF + if np.__version__ < "2.0": + ninf = np.NINF + else: + ninf = -np.inf + max_score = ninf for k in self.subsets_: if ( k >= self.min_k @@ -662,7 +666,7 @@ def finalize_fit(self): best_subset = k k_score = max_score - if k_score == np.NINF: + if k_score == ninf: # i.e. 
all keys of self.subsets_ are not in interval `[self.min_k, self.max_k]` # this happens if KeyboardInterrupt happens keys = list(self.subsets_.keys()) From 8e807789ce817086c64b843d502f575a4d706db3 Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:56:30 +0100 Subject: [PATCH 3/7] Fixed `_calc_score` for *scikit-learn* version compatibility (#1109) * added sklearn version compability for fit_params * added change to changelog * fixed format * fixed order of imports * formatted with isort and black --- docs/sources/CHANGELOG.md | 2 ++ mlxtend/feature_selection/utilities.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 3a0cca67c..daeedb1e9 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -23,6 +23,8 @@ Files updated: - ['mlxtend.frequent_patterns.fpcommon'] - ['mlxtend.frequent_patterns.fpgrowth'](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/) - ['mlxtend.frequent_patterns.fpmax'](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/) + - ['mlxtend/feature_selection/utilities.py'](https://github.com/rasbt/mlxtend/blob/master/mlxtend/feature_selection/utilities.py) + - Modified `_calc_score` function to ensure compatibility with *scikit-learn* versions 1.4 and above by dynamically selecting between `fit_params` and `params` in `cross_val_score`. 
- [`mlxtend.feature_selection.SequentialFeatureSelector`](https://github.com/rasbt/mlxtend/blob/master/mlxtend/feature_selection/sequential_feature_selector.py) - Updated negative infinity constant to be compatible with old and new (>=2.0) `numpy` versions - [`mlxtend.frequent_patterns.association_rules`](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/) diff --git a/mlxtend/feature_selection/utilities.py b/mlxtend/feature_selection/utilities.py index 4290401e3..4d9c8bd6a 100644 --- a/mlxtend/feature_selection/utilities.py +++ b/mlxtend/feature_selection/utilities.py @@ -1,6 +1,5 @@ -from copy import deepcopy - import numpy as np +from sklearn import __version__ as sklearn_version from sklearn.model_selection import cross_val_score @@ -94,6 +93,9 @@ def _calc_score( feature_groups = [[i] for i in range(X.shape[1])] IDX = _merge_lists(feature_groups, indices) + + param_name = "fit_params" if sklearn_version < "1.4" else "params" + if selector.cv: scores = cross_val_score( selector.est_, @@ -104,7 +106,7 @@ def _calc_score( scoring=selector.scorer, n_jobs=1, pre_dispatch=selector.pre_dispatch, - fit_params=fit_params, + **{param_name: fit_params}, ) else: selector.est_.fit(X[:, IDX], y, **fit_params) From a5a117a32ef458f0f129a9e9c2d14d468147f480 Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Tue, 5 Nov 2024 15:06:45 +0100 Subject: [PATCH 4/7] updated CI/CD workflows (#1108) * updated CI/CD workflows * Update .github/workflows/python-package-conda.yml --------- Co-authored-by: Sebastian Raschka --- .github/workflows/linter.yml | 4 ++-- .github/workflows/python-package-conda.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 2c33822b8..a52ffb60d 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -6,9 +6,9 @@ jobs: formatting: runs-on: ubuntu-latest steps: - - uses: 
actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install isort, black, and flake8 diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index fd29dc19c..5298e70f3 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -14,9 +14,9 @@ jobs: env: GITHUB_ACTIONS_CI: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Miniconda - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true python-version: 3.9 @@ -41,4 +41,4 @@ jobs: coverage xml - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v2 \ No newline at end of file + uses: codecov/codecov-action@v4 From 0cfcfab87176f5b73f44a8c1f278eabb23889dce Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Tue, 5 Nov 2024 08:10:39 -0600 Subject: [PATCH 5/7] Add PyPI deploy workflow (#1110) --- .github/workflows/publish.yaml | 40 ++++++++++++++++++++++++++++++++++ mlxtend/__init__.py | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/publish.yaml diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml new file mode 100644 index 000000000..0b825fe4e --- /dev/null +++ b/.github/workflows/publish.yaml @@ -0,0 +1,40 @@ +# To create a release, create a tag and push it to GitHub: +#git tag -a "v0.0.1-beta" -m "beta version testing" +#git push --tags +# https://dev.to/iamtekson/publish-package-to-pypi-and-release-new-version-using-github-actions-108k +name: Publish MLxtend to PyPI + +on: + push: + tags: + - "v*" +jobs: + build-n-publish: + name: Build and publish to PyPI + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/mlxtend + permissions: + id-token: write + + steps: + - name: Checkout source + uses: 
actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + + - name: Build source and wheel distributions + run: | + python -m pip install --upgrade build twine + pip install importlib_metadata==7.2.1 + python -m build + twine check --strict dist/* + - name: Publish distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/mlxtend/__init__.py b/mlxtend/__init__.py index 19faabe38..8be818a90 100644 --- a/mlxtend/__init__.py +++ b/mlxtend/__init__.py @@ -4,4 +4,4 @@ # # License: BSD 3 clause -__version__ = "0.23.2dev" +__version__ = "0.23.2" From c229178f9201f579e3b82402426a945c113122ef Mon Sep 17 00:00:00 2001 From: rasbt Date: Tue, 5 Nov 2024 08:13:31 -0600 Subject: [PATCH 6/7] update changelog for new version --- docs/sources/CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index daeedb1e9..705a9844d 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -8,7 +8,7 @@ The CHANGELOG for the current development version is available at --- -### Version 0.23.2 (TBD) +### Version 0.23.2 (5 Nov 2024) ##### Downloads @@ -19,7 +19,7 @@ The CHANGELOG for the current development version is available at ##### New Features and Enhancements - Implement the FP-Growth and FP-Max algorithms with the possibility of missing values in the input dataset. Added a new metric Representativity for the association rules generated ([#1004](https://github.com/rasbt/mlxtend/issues/1004) via [zazass8](https://github.com/zazass8)). 
-Files updated: + Files updated: - ['mlxtend.frequent_patterns.fpcommon'] - ['mlxtend.frequent_patterns.fpgrowth'](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/) - ['mlxtend.frequent_patterns.fpmax'](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/) From ec40b758254c7531a85c8a36a91ea8bdb1298a1f Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:55:42 +0100 Subject: [PATCH 7/7] Improved `plot_splits` for time series splits (#1113) * fixed test_end_idx calculation * added legend title * fixed intersection of xticklabels for Sample index * added changes to changelog * Update CHANGELOG.md * removed redundancy * start x-axis at the origin of coordinates * fixed CV iteration labels * revert idx change * Update CHANGELOG.md * add groups legend * fixed cmap in groups legend --- docs/sources/CHANGELOG.md | 13 +++++++++++++ mlxtend/evaluate/time_series.py | 27 +++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 705a9844d..6b2bce57c 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -7,6 +7,19 @@ The CHANGELOG for the current development version is available at --- +### Version 0.23.3 (tbd) + +##### Downloads +... + +##### New Features and Enhancements + +Files updated: + - ['mlxtend.evaluate.time_series.plot_splits'](https://github.com/rasbt/mlxtend/blob/master/mlxtend/evaluate/time_series.py) + - Improved `plot_splits` for better visualization of time series splits + +##### Changes +... 
### Version 0.23.2 (5 Nov 2024) diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index fb2d97169..58b5149ed 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -290,7 +290,7 @@ def plot_split_indices(cv, cv_args, X, y, groups, n_splits, image_file_path=None s=marker_size, ) - yticklabels = list(range(n_splits)) + ["group"] + yticklabels = list(range(1, n_splits + 1)) + ["group"] ax.set( yticks=np.arange(n_splits + 1) + 0.5, yticklabels=yticklabels, @@ -299,15 +299,34 @@ def plot_split_indices(cv, cv_args, X, y, groups, n_splits, image_file_path=None xlim=[-0.5, len(indices) - 0.5], ) - ax.legend( + legend_splits = ax.legend( [Patch(color=cmap_cv(0.2)), Patch(color=cmap_cv(0.8))], ["Training set", "Testing set"], - loc=(1.02, 0.8), + title="Data Splits", + loc="upper right", + fontsize=13, + ) + + ax.add_artist(legend_splits) + + group_labels = [f"{group}" for group in np.unique(groups)] + cmap = plt.cm.get_cmap("tab20", len(group_labels)) + + unique_patches = {} + for i, group in enumerate(np.unique(groups)): + unique_patches[group] = Patch(color=cmap(i), label=f"{group}") + + ax.legend( + handles=list(unique_patches.values()), + title="Groups", + loc="center left", + bbox_to_anchor=(1.02, 0.5), fontsize=13, ) ax.set_title("{}\n{}".format(type(cv).__name__, cv_args), fontsize=15) - ax.xaxis.set_major_locator(MaxNLocator(min_n_ticks=len(X), integer=True)) + ax.set_xlim(0, len(X)) + ax.xaxis.set_major_locator(MaxNLocator(integer=True)) ax.set_xlabel(xlabel="Sample index", fontsize=13) ax.set_ylabel(ylabel="CV iteration", fontsize=13) ax.tick_params(axis="both", which="major", labelsize=13)