From 9332173683d717ac1fea3175c42798c97d173031 Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Tue, 21 Mar 2023 13:24:46 +0300 Subject: [PATCH 01/10] Hmine implementation and example --- .../user_guide/frequent_patterns/hmine.ipynb | 955 ++++++++++++++++++ mlxtend/frequent_patterns/__init__.py | 3 +- mlxtend/frequent_patterns/hmine.py | 253 +++++ mlxtend/frequent_patterns/tests/test_hmine.py | 91 ++ 4 files changed, 1301 insertions(+), 1 deletion(-) create mode 100644 docs/sources/user_guide/frequent_patterns/hmine.ipynb create mode 100644 mlxtend/frequent_patterns/hmine.py create mode 100644 mlxtend/frequent_patterns/tests/test_hmine.py diff --git a/docs/sources/user_guide/frequent_patterns/hmine.ipynb b/docs/sources/user_guide/frequent_patterns/hmine.ipynb new file mode 100644 index 000000000..8218352ab --- /dev/null +++ b/docs/sources/user_guide/frequent_patterns/hmine.ipynb @@ -0,0 +1,955 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## hmine: Frequent itemsets via the Hmine algorithm" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hmine function to extract frequent itemsets for association rule mining" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> from mlxtend.frequent_patterns import hmine" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H-mine [1] (memory-based hyperstructure mining of frequent patterns) is developed the method is extended to handle large and or dense databases." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H-struct, and a new mining algorithm, **H-mine** , which takes advantage of this data structure and dynamically adjusts links in the mining process.\n", + "A distinct feature of this method is that it has very limited and precisely predictable space overhead and runs really fast in memory-based setting. Moreover, it can be scaled up\n", + "to very large databases by database partitioning, and when the data set becomes dense, (conditional) FP-trees can be constructed dynamically as part of the mining process." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[1] Pei J, Han J, Lu H, Nishio S, Tang S and Yang D, \"[H-Mine: Fast and space-preserving frequent pattern mining in large databases.](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=bcde042283427e23094f9d4d2b765771db5aa57f)\" IIE Transactions, Vol. 39, pp. 593–605, 2007." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Related\n", + "- [FP-Growth](./fpgrowth.md)\n", + "- [FP-Max](./fpmax.md)\n", + "- [Apriori](./apriori.md)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1 -- Generating Frequent Itemsets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `hmine` function expects data in a one-hot encoded pandas DataFrame.\n", + "Suppose we have the following transaction data:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],\n", + " ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],\n", + " ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],\n", + " ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],\n", + " ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can transform it into the right format via the `TransactionEncoder` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AppleCornDillEggsIce creamKidney BeansMilkNutmegOnionUnicornYogurt
0FalseFalseFalseTrueFalseTrueTrueTrueTrueFalseTrue
1FalseFalseTrueTrueFalseTrueFalseTrueTrueFalseTrue
2TrueFalseFalseTrueFalseTrueTrueFalseFalseFalseFalse
3FalseTrueFalseFalseFalseTrueTrueFalseFalseTrueTrue
4FalseTrueFalseTrueTrueTrueFalseFalseTrueFalseFalse
\n", + "
" + ], + "text/plain": [ + " Apple Corn Dill Eggs Ice cream Kidney Beans Milk Nutmeg Onion \\\n", + "0 False False False True False True True True True \n", + "1 False False True True False True False True True \n", + "2 True False False True False True True False False \n", + "3 False True False False False True True False False \n", + "4 False True False True True True False False True \n", + "\n", + " Unicorn Yogurt \n", + "0 False True \n", + "1 False True \n", + "2 False False \n", + "3 True True \n", + "4 False False " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from mlxtend.preprocessing import TransactionEncoder\n", + "\n", + "te = TransactionEncoder()\n", + "te_ary = te.fit(dataset).transform(dataset)\n", + "df = pd.DataFrame(te_ary, columns=te.columns_)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let us return the items and itemsets with at least 60% support:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
supportitemsets
00.8(3)
10.8(3, 5)
20.6(8, 3, 5)
30.6(8, 3)
41.0(5)
50.6(5, 6)
60.6(8, 5)
70.6(10, 5)
80.6(6)
90.6(8)
100.6(10)
\n", + "
" + ], + "text/plain": [ + " support itemsets\n", + "0 0.8 (3)\n", + "1 0.8 (3, 5)\n", + "2 0.6 (8, 3, 5)\n", + "3 0.6 (8, 3)\n", + "4 1.0 (5)\n", + "5 0.6 (5, 6)\n", + "6 0.6 (8, 5)\n", + "7 0.6 (10, 5)\n", + "8 0.6 (6)\n", + "9 0.6 (8)\n", + "10 0.6 (10)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from mlxtend.frequent_patterns import hmine\n", + "\n", + "hmine(df, min_support=0.6)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, `hmine` returns the column indices of the items, which may be useful in downstream operations such as association rule mining. For better readability, we can set `use_colnames=True` to convert these integer values into the respective item names: " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
supportitemsets
00.8(Eggs)
10.8(Kidney Beans, Eggs)
20.6(Kidney Beans, Onion, Eggs)
30.6(Onion, Eggs)
41.0(Kidney Beans)
50.6(Kidney Beans, Milk)
60.6(Kidney Beans, Onion)
70.6(Kidney Beans, Yogurt)
80.6(Milk)
90.6(Onion)
100.6(Yogurt)
\n", + "
" + ], + "text/plain": [ + " support itemsets\n", + "0 0.8 (Eggs)\n", + "1 0.8 (Kidney Beans, Eggs)\n", + "2 0.6 (Kidney Beans, Onion, Eggs)\n", + "3 0.6 (Onion, Eggs)\n", + "4 1.0 (Kidney Beans)\n", + "5 0.6 (Kidney Beans, Milk)\n", + "6 0.6 (Kidney Beans, Onion)\n", + "7 0.6 (Kidney Beans, Yogurt)\n", + "8 0.6 (Milk)\n", + "9 0.6 (Onion)\n", + "10 0.6 (Yogurt)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hmine(df, min_support=0.6, use_colnames=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 2 -- Hmine versus Apriori and FPGrowth" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the `hmine` algorithm is a memory-based algorithm, it can be magnitudes faster than the alternative Apriori algorithm for large datasets. However, it can be much slower than the FpGrowth algorithm. In the following example, we compare the performance of `hmine` with the `apriori` and `fpgrowth` algorithms on a large dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from mlxtend.preprocessing import TransactionEncoder\n", + "\n", + "te = TransactionEncoder()\n", + "te_ary = te.fit(dataset).transform(dataset)\n", + "df = pd.DataFrame(te_ary, columns=te.columns_)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.32 ms ± 184 µs per loop (mean ± std. dev. 
of 10 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "from mlxtend.frequent_patterns import apriori\n", + "\n", + "%timeit -n 100 -r 10 apriori(df, min_support=0.6, use_colnames=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.37 ms ± 37.6 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%timeit -n 100 -r 10 apriori(df, min_support=0.6, use_colnames=True, low_memory=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "891 µs ± 20.6 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "from mlxtend.frequent_patterns import fpgrowth\n", + "\n", + "%timeit -n 100 -r 10 fpgrowth(df, min_support=0.6, use_colnames=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.26 ms ± 94.9 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "from mlxtend.frequent_patterns import hmine\n", + "\n", + "%timeit -n 100 -r 10 hmine(df, min_support=0.6, use_colnames=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 3 -- Working with Sparse Representations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save memory, you may want to represent your transaction data in the sparse format.\n", + "This is especially useful if you have lots of products and small transactions." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AppleCornDillEggsIce creamKidney BeansMilkNutmegOnionUnicornYogurt
000010True11101
100110True01101
210010True10000
301000True10011
401011True00100
\n", + "
" + ], + "text/plain": [ + " Apple Corn Dill Eggs Ice cream Kidney Beans Milk Nutmeg Onion \\\n", + "0 0 0 0 1 0 True 1 1 1 \n", + "1 0 0 1 1 0 True 0 1 1 \n", + "2 1 0 0 1 0 True 1 0 0 \n", + "3 0 1 0 0 0 True 1 0 0 \n", + "4 0 1 0 1 1 True 0 0 1 \n", + "\n", + " Unicorn Yogurt \n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 0 \n", + "3 1 1 \n", + "4 0 0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oht_ary = te.fit(dataset).transform(dataset, sparse=True)\n", + "sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)\n", + "sparse_df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 itemset(s) from the suffixes on item(s) (Eggs)\n", + "1 itemset(s) from the suffixes on item(s) (Eggs, Kidney Beans)\n", + "0 itemset(s) from the suffixes on item(s) (Eggs, Kidney Beans, Onion)\n", + "0 itemset(s) from the suffixes on item(s) (Eggs, Onion)\n", + "3 itemset(s) from the suffixes on item(s) (Kidney Beans)\n", + "0 itemset(s) from the suffixes on item(s) (Kidney Beans, Milk)\n", + "0 itemset(s) from the suffixes on item(s) (Kidney Beans, Onion)\n", + "0 itemset(s) from the suffixes on item(s) (Kidney Beans, Yogurt)\n", + "0 itemset(s) from the suffixes on item(s) (Milk)\n", + "0 itemset(s) from the suffixes on item(s) (Onion)\n", + "0 itemset(s) from the suffixes on item(s) (Yogurt)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
supportitemsets
00.8(Eggs)
10.8(Kidney Beans, Eggs)
20.6(Kidney Beans, Onion, Eggs)
30.6(Onion, Eggs)
41.0(Kidney Beans)
50.6(Kidney Beans, Milk)
60.6(Kidney Beans, Onion)
70.6(Kidney Beans, Yogurt)
80.6(Milk)
90.6(Onion)
100.6(Yogurt)
\n", + "
" + ], + "text/plain": [ + " support itemsets\n", + "0 0.8 (Eggs)\n", + "1 0.8 (Kidney Beans, Eggs)\n", + "2 0.6 (Kidney Beans, Onion, Eggs)\n", + "3 0.6 (Onion, Eggs)\n", + "4 1.0 (Kidney Beans)\n", + "5 0.6 (Kidney Beans, Milk)\n", + "6 0.6 (Kidney Beans, Onion)\n", + "7 0.6 (Kidney Beans, Yogurt)\n", + "8 0.6 (Milk)\n", + "9 0.6 (Onion)\n", + "10 0.6 (Yogurt)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hmine(sparse_df, min_support=0.6, use_colnames=True, verbose=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More Examples" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please note that since the `hmine` function is a drop-in replacement for `apriori`, it comes with the same set of function arguments and return arguments. Thus, for more examples, please see the [`apriori`](./apriori.md) documentation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../../api_modules/mlxtend.frequent_patterns/hmine.md', 'r') as f:\n", + " print(f.read())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mlxtend/frequent_patterns/__init__.py b/mlxtend/frequent_patterns/__init__.py index 834de8994..08b200196 100644 --- a/mlxtend/frequent_patterns/__init__.py +++ b/mlxtend/frequent_patterns/__init__.py @@ -8,5 +8,6 @@ from .association_rules import association_rules 
from .fpgrowth import fpgrowth from .fpmax import fpmax +from .hmine import hmine -__all__ = ["apriori", "association_rules", "fpgrowth", "fpmax"] +__all__ = ["apriori", "association_rules", "fpgrowth", "fpmax", "hmine"] diff --git a/mlxtend/frequent_patterns/hmine.py b/mlxtend/frequent_patterns/hmine.py new file mode 100644 index 000000000..a7b3ba009 --- /dev/null +++ b/mlxtend/frequent_patterns/hmine.py @@ -0,0 +1,253 @@ +# mlxtend Machine Learning Library Extensions +# Author: Fatih Sen +# +# License: BSD 3 clause + +import numpy as np +import pandas as pd +import math + +from ..frequent_patterns import fpcommon as fpc + +def hmine( + df, + min_support=0.5, + use_colnames=False, + max_len=None, + verbose=0 +) -> pd.DataFrame: + + """ + Get frequent itemsets from a one-hot DataFrame + + Parameters + ----------- + df : pandas DataFrame + pandas DataFrame the encoded format. Also supports + DataFrames with sparse data; for more info, please + see https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#sparse-data-structures. + + Please note that the old pandas SparseDataFrame format + is no longer supported in mlxtend >= 0.17.2. + + The allowed values are either 0/1 or True/False. + For example, + + ``` + Apple Bananas Beer Chicken Milk Rice + 0 True False True True False True + 1 True False True False False True + 2 True False True False False False + 3 True True False False False False + 4 False False True True True True + 5 False False True False True True + 6 False False True False True False + 7 True True False False False False + ``` + + min_support : float (default: 0.5) + A float between 0 and 1 for minimum support of the itemsets returned. + The support is computed as the fraction + transactions_where_item(s)_occur / total_transactions. + + use_colnames : bool (default: False) + If true, uses the DataFrames' column names in the returned DataFrame + instead of column indices. 
+ + max_len : int (default: None) + Maximum length of the itemsets generated. If `None` (default) all + possible itemsets lengths are evaluated. + + verbose : int (default: 0) + Shows the stages of conditional tree generation. + + Returns + ----------- + pandas DataFrame with columns ['support', 'itemsets'] of all itemsets + that are >= `min_support` and < than `max_len` + (if `max_len` is not None). + Each itemset in the 'itemsets' column is of type `frozenset`, + which is a Python built-in type that behaves similarly to + sets except that it is immutable + (For more info, see + https://docs.python.org/3.6/library/stdtypes.html#frozenset). + + Examples + ---------- + For usage examples, please see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/ + + """ + + fpc.valid_input_check(df) + if min_support <= 0.0: + raise ValueError( + "`min_support` must be a positive " + "number within the interval `(0, 1]`. " + "Got %s." % min_support + ) + # Calculate the minimum support based on the number of transactions(Abs. 
support) + minsup = math.ceil(min_support * len(df)) + + is_sparse = False + if hasattr(df, "sparse"): + # DataFrame with SparseArray (pandas >= 0.24) + if df.size == 0: + itemsets = df.values + else: + itemsets = df.sparse.to_coo().tocsr() + is_sparse = True + else: + # dense DataFrame + itemsets = df.values + + single_items = np.array(df.columns) + itemsets_shape = itemsets.shape[0] + itemsets, single_items, single_items_support = itemset_optimisation( + itemsets, + single_items, + minsup) + numeric_single_items = np.arange(len(single_items)) + frequent_itemsets = {} + for item in numeric_single_items: + if single_items_support[item] >= minsup: + supp = single_items_support[item] / itemsets_shape + frequent_itemsets[frozenset([single_items[item]])] = supp + + if max_len == 1: + continue + # Recursive call to find frequent itemsets + frequent_itemsets = hmine_driver( + [item], + itemsets, + minsup, + itemsets_shape, + max_len, + verbose, + single_items, + frequent_itemsets) + + res_df = pd.DataFrame([frequent_itemsets.values(), frequent_itemsets.keys()]).T + res_df.columns = ["support", "itemsets"] + + if not use_colnames: + mapping = {item: idx for idx, item in enumerate(df.columns)} + res_df["itemsets"] = res_df["itemsets"].apply(lambda x: frozenset([mapping[i] for i in x])) + + return res_df + +def itemset_optimisation( + itemsets:np.array, + single_items: np.array, + minsup:int +) -> tuple: + + """ + Downward-closure property of H-Mine algorithm. + Optimises the itemsets matrix by removing items that do not + meet the minimum support(For more info, see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/). 
+ + Args: + itemsets (np.array): matrix of bools or binary + single_items (np.array): array of single items + minsup (int): minimum absolute support + + Returns: + itemsets (np.array): reduced itemsets matrix of bools or binary + single_items (np.array): reduced array of single items + single_items_support (np.array): reduced single items support + """ + + single_items_support = np.array(np.sum(itemsets, axis=0)).reshape(-1) + items = np.nonzero(single_items_support >= minsup)[0] + itemsets = itemsets[:, items] + single_items = single_items[items] + single_items_support = single_items_support[items] + + return itemsets, single_items, single_items_support + +def hmine_driver( + item:list, + itemsets:np.array, + minsup:int, + itemsets_shape:int, + max_len:int, + verbose:int, + single_items:np.array, + frequent_itemsets:dict +) -> dict: + + """ + Driver function for the hmine algorithm. + Recursively generates frequent itemsets. + Also works for sparse matrix. + egg: item = [1] -> [1,2] -> [1,2,3] -> [1,2,4] -> [1,2,5] + For more info, see:( + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/) + + Args: + item (list): list of items + itemsets (np.array): matrix of bools or binary + minsup (int): minimum absolute support + itemsets_shape (int): number of transactions + single_items (np.array): array of single items + max_len (int): maximum length of frequent itemsets + verbose (int): verbose mode + frequent_itemsets (dict): dictionary of frequent itemsets + + Returns: + frequent_itemsets(dict): dictionary of frequent itemsets + """ + # Early stopping if the length of the item is greater than max_len + if max_len and len(item) >= max_len: + return frequent_itemsets + + projected_itemsets = create_projected_itemsets( + item, + itemsets) + initial_supports = np.array(np.sum(projected_itemsets,axis = 0)).reshape(-1) + suffixes = np.nonzero(initial_supports >= minsup)[0] + suffixes = suffixes[np.nonzero(suffixes > item[-1])[0]] + + if verbose: + 
print(f"{len(suffixes)} itemset(s) from the suffixes on item(s) (%s)" % (", ".join(single_items[item]))) + + for suffix in suffixes: + new_item = item.copy() + new_item.append(suffix) + supp = initial_supports[suffix] / itemsets_shape + frequent_itemsets[frozenset(single_items[new_item])] = supp + # Recursive call to find frequent itemsets + frequent_itemsets = hmine_driver( + new_item, + projected_itemsets, + minsup, + itemsets_shape, + max_len, + verbose, + single_items, + frequent_itemsets) + + return frequent_itemsets + +def create_projected_itemsets( + item:list, + itemsets:np.array +) -> np.array: + + """ + Creates the projected itemsets for the given item.(For more info, see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/) + + Args: + item (list): list of items + itemsets (np.array): matrix of bools or binary + + Returns: + projected_itemsets(np.array): projected itemsets for the given item + """ + + indices = np.nonzero(np.sum(itemsets[:,item],axis = 1) == len(item))[0] + projected_itemsets = itemsets[indices,:] + return projected_itemsets \ No newline at end of file diff --git a/mlxtend/frequent_patterns/tests/test_hmine.py b/mlxtend/frequent_patterns/tests/test_hmine.py new file mode 100644 index 000000000..cde7d13bc --- /dev/null +++ b/mlxtend/frequent_patterns/tests/test_hmine.py @@ -0,0 +1,91 @@ +# Sebastian Raschka 2014-2022 +# mlxtend Machine Learning Library Extensions +# Author: Sebastian Raschka +# +# License: BSD 3 clause + +import unittest + +import numpy as np +from test_fpbase import ( + FPTestEdgeCases, + FPTestErrors, + FPTestEx1All, + FPTestEx2All, + FPTestEx3All, +) + +from mlxtend.frequent_patterns import hmine + +class TestEdgeCases(unittest.TestCase, FPTestEdgeCases): + def setUp(self): + FPTestEdgeCases.setUp(self, hmine) + + +class TestErrors(unittest.TestCase, FPTestErrors): + def setUp(self): + FPTestErrors.setUp(self, hmine) + + +class TestHmine(unittest.TestCase, FPTestEx1All): + def setUp(self): + 
FPTestEx1All.setUp(self, hmine) + +class TestHmineBoolInput(unittest.TestCase, FPTestEx1All): + def setUp(self): + one_ary = np.array( + [ + [False, False, False, True, False, True, True, True, True, False, True], + [False, False, True, True, False, True, False, True, True, False, True], + [ + True, + False, + False, + True, + False, + True, + True, + False, + False, + False, + False, + ], + [ + False, + True, + False, + False, + False, + True, + True, + False, + False, + True, + True, + ], + [ + False, + True, + False, + True, + True, + True, + False, + False, + True, + False, + False, + ], + ] + ) + FPTestEx1All.setUp(self, hmine, one_ary=one_ary) + + +class TestEx2(unittest.TestCase, FPTestEx2All): + def setUp(self): + FPTestEx2All.setUp(self, hmine) + + +class TestEx3(unittest.TestCase, FPTestEx3All): + def setUp(self): + FPTestEx3All.setUp(self, hmine) From 7897c259f922e4ae4747d7b648b0cee5ae0bfc65 Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Tue, 21 Mar 2023 20:10:59 +0300 Subject: [PATCH 02/10] Added new features. --- docs/sources/CHANGELOG.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index a9bcce623..b03ee4527 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -931,4 +931,11 @@ imput arrays via `transform` and `fit_transform` ### Version 0.1.1 (2014-08-13) -- Simplified code for ColumnSelector. \ No newline at end of file +- Simplified code for ColumnSelector. + +##### New Features + +- Added `mlxtend.frequent_patterns.hmine` for mining frequent itemsets using the H-Mine algorithm. +- Added `mlxtend.frequent_patterns.test.hmine` for testing the H-Mine algorithm. +- Imported hmine for `mlexxtend.frequent_patterns.__init__` and added the `__all__` list. +- Added `docs.sources.user_guide.frequent_patterns.hmine` for the H-Mine user guide. 
From 0989684bc11b8fdaad4fb9a12bb47e2055c5fdf3 Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Tue, 21 Mar 2023 20:58:45 +0300 Subject: [PATCH 03/10] flake8 control. --- .../user_guide/frequent_patterns/hmine.ipynb | 48 ++++----- mlxtend/frequent_patterns/hmine.py | 100 +++++++----------- mlxtend/frequent_patterns/tests/test_hmine.py | 2 + 3 files changed, 67 insertions(+), 83 deletions(-) diff --git a/docs/sources/user_guide/frequent_patterns/hmine.ipynb b/docs/sources/user_guide/frequent_patterns/hmine.ipynb index 8218352ab..f6300bc2e 100644 --- a/docs/sources/user_guide/frequent_patterns/hmine.ipynb +++ b/docs/sources/user_guide/frequent_patterns/hmine.ipynb @@ -429,17 +429,17 @@ " \n", " 1\n", " 0.8\n", - " (Kidney Beans, Eggs)\n", + " (Eggs, Kidney Beans)\n", " \n", " \n", " 2\n", " 0.6\n", - " (Kidney Beans, Onion, Eggs)\n", + " (Eggs, Kidney Beans, Onion)\n", " \n", " \n", " 3\n", " 0.6\n", - " (Onion, Eggs)\n", + " (Eggs, Onion)\n", " \n", " \n", " 4\n", @@ -449,7 +449,7 @@ " \n", " 5\n", " 0.6\n", - " (Kidney Beans, Milk)\n", + " (Milk, Kidney Beans)\n", " \n", " \n", " 6\n", @@ -459,7 +459,7 @@ " \n", " 7\n", " 0.6\n", - " (Kidney Beans, Yogurt)\n", + " (Yogurt, Kidney Beans)\n", " \n", " \n", " 8\n", @@ -483,13 +483,13 @@ "text/plain": [ " support itemsets\n", "0 0.8 (Eggs)\n", - "1 0.8 (Kidney Beans, Eggs)\n", - "2 0.6 (Kidney Beans, Onion, Eggs)\n", - "3 0.6 (Onion, Eggs)\n", + "1 0.8 (Eggs, Kidney Beans)\n", + "2 0.6 (Eggs, Kidney Beans, Onion)\n", + "3 0.6 (Eggs, Onion)\n", "4 1.0 (Kidney Beans)\n", - "5 0.6 (Kidney Beans, Milk)\n", + "5 0.6 (Milk, Kidney Beans)\n", "6 0.6 (Kidney Beans, Onion)\n", - "7 0.6 (Kidney Beans, Yogurt)\n", + "7 0.6 (Yogurt, Kidney Beans)\n", "8 0.6 (Milk)\n", "9 0.6 (Onion)\n", "10 0.6 (Yogurt)" @@ -543,7 +543,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.32 ms ± 184 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + "3.41 ms ± 584 µs per loop (mean ± std. dev. 
of 10 runs, 100 loops each)\n" ] } ], @@ -562,7 +562,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.37 ms ± 37.6 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + "3.36 ms ± 404 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" ] } ], @@ -579,7 +579,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "891 µs ± 20.6 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + "1.18 ms ± 76.7 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" ] } ], @@ -598,7 +598,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.26 ms ± 94.9 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" + "1.44 ms ± 94.8 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)\n" ] } ], @@ -822,17 +822,17 @@ " \n", " 1\n", " 0.8\n", - " (Kidney Beans, Eggs)\n", + " (Eggs, Kidney Beans)\n", " \n", " \n", " 2\n", " 0.6\n", - " (Kidney Beans, Onion, Eggs)\n", + " (Eggs, Kidney Beans, Onion)\n", " \n", " \n", " 3\n", " 0.6\n", - " (Onion, Eggs)\n", + " (Eggs, Onion)\n", " \n", " \n", " 4\n", @@ -842,7 +842,7 @@ " \n", " 5\n", " 0.6\n", - " (Kidney Beans, Milk)\n", + " (Milk, Kidney Beans)\n", " \n", " \n", " 6\n", @@ -852,7 +852,7 @@ " \n", " 7\n", " 0.6\n", - " (Kidney Beans, Yogurt)\n", + " (Yogurt, Kidney Beans)\n", " \n", " \n", " 8\n", @@ -876,13 +876,13 @@ "text/plain": [ " support itemsets\n", "0 0.8 (Eggs)\n", - "1 0.8 (Kidney Beans, Eggs)\n", - "2 0.6 (Kidney Beans, Onion, Eggs)\n", - "3 0.6 (Onion, Eggs)\n", + "1 0.8 (Eggs, Kidney Beans)\n", + "2 0.6 (Eggs, Kidney Beans, Onion)\n", + "3 0.6 (Eggs, Onion)\n", "4 1.0 (Kidney Beans)\n", - "5 0.6 (Kidney Beans, Milk)\n", + "5 0.6 (Milk, Kidney Beans)\n", "6 0.6 (Kidney Beans, Onion)\n", - "7 0.6 (Kidney Beans, Yogurt)\n", + "7 0.6 (Yogurt, Kidney Beans)\n", "8 0.6 (Milk)\n", "9 0.6 (Onion)\n", "10 0.6 (Yogurt)" diff --git a/mlxtend/frequent_patterns/hmine.py b/mlxtend/frequent_patterns/hmine.py index a7b3ba009..4e072dd2a 100644 --- 
a/mlxtend/frequent_patterns/hmine.py +++ b/mlxtend/frequent_patterns/hmine.py @@ -9,14 +9,14 @@ from ..frequent_patterns import fpcommon as fpc + def hmine( - df, - min_support=0.5, - use_colnames=False, - max_len=None, - verbose=0 + df, + min_support=0.5, + use_colnames=False, + max_len=None, + verbose=0 ) -> pd.DataFrame: - """ Get frequent itemsets from a one-hot DataFrame @@ -100,33 +100,24 @@ def hmine( else: # dense DataFrame itemsets = df.values - + if is_sparse: + is_sparse single_items = np.array(df.columns) itemsets_shape = itemsets.shape[0] - itemsets, single_items, single_items_support = itemset_optimisation( - itemsets, - single_items, - minsup) + itemsets, single_items, single_items_support = itemset_optimisation(itemsets, single_items, minsup) numeric_single_items = np.arange(len(single_items)) frequent_itemsets = {} for item in numeric_single_items: if single_items_support[item] >= minsup: supp = single_items_support[item] / itemsets_shape frequent_itemsets[frozenset([single_items[item]])] = supp - if max_len == 1: continue - # Recursive call to find frequent itemsets - frequent_itemsets = hmine_driver( - [item], - itemsets, - minsup, - itemsets_shape, - max_len, - verbose, - single_items, - frequent_itemsets) - + # Recursive call to find frequent itemsets + frequent_itemsets = hmine_driver([item], itemsets, minsup, + itemsets_shape, max_len, verbose, + single_items, frequent_itemsets) + # Convert the dictionary to a DataFrame res_df = pd.DataFrame([frequent_itemsets.values(), frequent_itemsets.keys()]).T res_df.columns = ["support", "itemsets"] @@ -136,17 +127,18 @@ def hmine( return res_df + def itemset_optimisation( - itemsets:np.array, - single_items: np.array, - minsup:int + itemsets: np.array, + single_items: np.array, + minsup: int, ) -> tuple: """ Downward-closure property of H-Mine algorithm. 
- Optimises the itemsets matrix by removing items that do not - meet the minimum support(For more info, see - http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/). + Optimises the itemsets matrix by removing items that do not + meet the minimum support(For more info, see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/). Args: itemsets (np.array): matrix of bools or binary @@ -167,15 +159,16 @@ def itemset_optimisation( return itemsets, single_items, single_items_support + def hmine_driver( - item:list, - itemsets:np.array, - minsup:int, - itemsets_shape:int, - max_len:int, - verbose:int, - single_items:np.array, - frequent_itemsets:dict + item: list, + itemsets: np.array, + minsup: int, + itemsets_shape: int, + max_len: int, + verbose: int, + single_items: np.array, + frequent_itemsets: dict ) -> dict: """ @@ -202,40 +195,29 @@ def hmine_driver( # Early stopping if the length of the item is greater than max_len if max_len and len(item) >= max_len: return frequent_itemsets - - projected_itemsets = create_projected_itemsets( - item, - itemsets) - initial_supports = np.array(np.sum(projected_itemsets,axis = 0)).reshape(-1) + projected_itemsets = create_projected_itemsets(item, itemsets) + initial_supports = np.array(np.sum(projected_itemsets, axis=0)).reshape(-1) suffixes = np.nonzero(initial_supports >= minsup)[0] suffixes = suffixes[np.nonzero(suffixes > item[-1])[0]] if verbose: print(f"{len(suffixes)} itemset(s) from the suffixes on item(s) (%s)" % (", ".join(single_items[item]))) - for suffix in suffixes: new_item = item.copy() new_item.append(suffix) supp = initial_supports[suffix] / itemsets_shape frequent_itemsets[frozenset(single_items[new_item])] = supp # Recursive call to find frequent itemsets - frequent_itemsets = hmine_driver( - new_item, - projected_itemsets, - minsup, - itemsets_shape, - max_len, - verbose, - single_items, - frequent_itemsets) - + frequent_itemsets = hmine_driver(new_item, projected_itemsets, minsup, 
+ itemsets_shape, max_len, verbose, + single_items, frequent_itemsets) return frequent_itemsets + def create_projected_itemsets( - item:list, - itemsets:np.array + item: list, + itemsets: np.array ) -> np.array: - """ Creates the projected itemsets for the given item.(For more info, see http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/) @@ -248,6 +230,6 @@ def create_projected_itemsets( projected_itemsets(np.array): projected itemsets for the given item """ - indices = np.nonzero(np.sum(itemsets[:,item],axis = 1) == len(item))[0] - projected_itemsets = itemsets[indices,:] - return projected_itemsets \ No newline at end of file + indices = np.nonzero(np.sum(itemsets[:, item], axis=1) == len(item))[0] + projected_itemsets = itemsets[indices, :] + return projected_itemsets diff --git a/mlxtend/frequent_patterns/tests/test_hmine.py b/mlxtend/frequent_patterns/tests/test_hmine.py index cde7d13bc..319d9c9d2 100644 --- a/mlxtend/frequent_patterns/tests/test_hmine.py +++ b/mlxtend/frequent_patterns/tests/test_hmine.py @@ -17,6 +17,7 @@ from mlxtend.frequent_patterns import hmine + class TestEdgeCases(unittest.TestCase, FPTestEdgeCases): def setUp(self): FPTestEdgeCases.setUp(self, hmine) @@ -31,6 +32,7 @@ class TestHmine(unittest.TestCase, FPTestEx1All): def setUp(self): FPTestEx1All.setUp(self, hmine) + class TestHmineBoolInput(unittest.TestCase, FPTestEx1All): def setUp(self): one_ary = np.array( From eff85c0ba9fba5a8d87b0520046754ddc48e217c Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Fri, 24 Mar 2023 02:11:29 +0300 Subject: [PATCH 04/10] Added test case showing that algorithms give the same result. 
--- mlxtend/frequent_patterns/tests/test_hmine.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/mlxtend/frequent_patterns/tests/test_hmine.py b/mlxtend/frequent_patterns/tests/test_hmine.py index 319d9c9d2..90d288256 100644 --- a/mlxtend/frequent_patterns/tests/test_hmine.py +++ b/mlxtend/frequent_patterns/tests/test_hmine.py @@ -6,6 +6,7 @@ import unittest +import pandas as pd import numpy as np from test_fpbase import ( FPTestEdgeCases, @@ -16,6 +17,8 @@ ) from mlxtend.frequent_patterns import hmine +from mlxtend.frequent_patterns import fpgrowth +from mlxtend.frequent_patterns import apriori class TestEdgeCases(unittest.TestCase, FPTestEdgeCases): @@ -91,3 +94,71 @@ def setUp(self): class TestEx3(unittest.TestCase, FPTestEx3All): def setUp(self): FPTestEx3All.setUp(self, hmine) + + +class TestCorrect(unittest.TestCase): + def setUp(self): + self.one_ary = np.array([ + [False, False, False, True, False, True, True, True, True, False, True], + [False, False, True, True, False, True, False, True, True, False, True], + [True, False, False, True, False, True, True, False, False, False, False], + [False, True, False, False, False, True, True, False, False, True, True], + [False, True, False, True, True, True, False, False, True, False, False], + ]) + self.cols = [ + "Apple", + "Corn", + "Dill", + "Eggs", + "Ice cream", + "Kidney Beans", + "Milk", + "Nutmeg", + "Onion", + "Unicorn", + "Yogurt", + ] + self.df = pd.DataFrame(self.one_ary, columns=self.cols) + + def test_compare_correct(self): + + expect = pd.DataFrame( + [ + [0.8, np.array([3])], + [1.0, np.array([5])], + [0.6, np.array([6])], + [0.6, np.array([8])], + [0.6, np.array([10])], + [0.8, np.array([3, 5])], + [0.6, np.array([3, 8])], + [0.6, np.array([5, 6])], + [0.6, np.array([5, 8])], + [0.6, np.array([5, 10])], + [0.6, np.array([3, 5, 8])], + ], + columns=["support", "itemsets"], + ) + algorithms = {hmine: None, fpgrowth: None, apriori: None} + for algo in algorithms.keys(): + 
self.setUp() + res_df = algo(self.df, min_support=0.6) + compare_df(res_df, expect) + algorithms[algo] = res_df + + compare_df(algorithms[hmine], algorithms[fpgrowth]) + compare_df(algorithms[hmine], algorithms[apriori]) + compare_df(algorithms[fpgrowth], algorithms[apriori]) + + +def compare_df(df1, df2): + itemsets1 = [sorted(list(i)) for i in df1["itemsets"]] + itemsets2 = [sorted(list(i)) for i in df2["itemsets"]] + rows1 = sorted(zip(itemsets1, df1["support"])) + rows2 = sorted(zip(itemsets2, df2["support"])) + for row1, row2 in zip(rows1, rows2): + if row1[0] != row2[0]: + msg = f"Expected different frequent itemsets\nx:{row1[0]}\ny:{row2[0]}" + raise AssertionError(msg) + elif row1[1] != row2[1]: + msg = f"Expected different support\nx:{row1[1]}\ny:{row2[1]}" + raise AssertionError(msg) From a6110144e103cefe7dd77ba6ee50e6ee3714ec67 Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Fri, 24 Mar 2023 02:12:29 +0300 Subject: [PATCH 05/10] I corrected the wrong places. --- docs/sources/CHANGELOG.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index b03ee4527..40a7a48a1 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -22,11 +22,15 @@ Version 0.22.0dev (TBD) - When `ExhaustiveFeatureSelector` is run with `n_jobs == 1`, joblib is now disabled, which enables more immediate (live) feedback when the `verbose` mode is enabled. [#985](https://github.com/rasbt/mlxtend/pull/985) via [Nima Sarajpoor] - Disabled unnecessary warning in EnsembleVoteClassifier [#941](https://github.com/rasbt/mlxtend/issues/941) - Fixed various documentation issues [#849] [#951] +- Imported hmine for `mlexxtend.frequent_patterns.__init__` and added the `__all__` list. ##### New Features and Enhancements - The `mlxtend.frequent_patterns.association_rules` function has a new metric - Zhangs Metric, which measures both association and dissociation. 
([#980](https://github.com/rasbt/mlxtend/pull/980)) - Internal `fpmax` code improvement that avoids casting a sparse DataFrame into a dense NumPy array. ([#1000](https://github.com/rasbt/mlxtend/pull/1000) via [Tim Kellogg](https://github.com/tkellogg)) +- Added `mlxtend.frequent_patterns.hmine` for mining frequent itemsets using the H-Mine algorithm. +- Added `mlxtend.frequent_patterns.test.hmine` for testing the H-Mine algorithm. +- Added `docs.sources.user_guide.frequent_patterns.hmine` for the H-Mine user guide. ### Version 0.21.0 (09/17/2022) @@ -931,11 +935,4 @@ imput arrays via `transform` and `fit_transform` ### Version 0.1.1 (2014-08-13) -- Simplified code for ColumnSelector. - -##### New Features - -- Added `mlxtend.frequent_patterns.hmine` for mining frequent itemsets using the H-Mine algorithm. -- Added `mlxtend.frequent_patterns.test.hmine` for testing the H-Mine algorithm. -- Imported hmine for `mlexxtend.frequent_patterns.__init__` and added the `__all__` list. -- Added `docs.sources.user_guide.frequent_patterns.hmine` for the H-Mine user guide. +- Simplified code for ColumnSelector. \ No newline at end of file From e38682c15875b4379d92477564bfaac9f71bda81 Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Fri, 24 Mar 2023 13:45:53 +0300 Subject: [PATCH 06/10] isort fix. --- mlxtend/frequent_patterns/hmine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlxtend/frequent_patterns/hmine.py b/mlxtend/frequent_patterns/hmine.py index 4e072dd2a..1535e39a7 100644 --- a/mlxtend/frequent_patterns/hmine.py +++ b/mlxtend/frequent_patterns/hmine.py @@ -3,9 +3,10 @@ # # License: BSD 3 clause +import math + import numpy as np import pandas as pd -import math from ..frequent_patterns import fpcommon as fpc From fcf35ed87742ad64a121fae25ed7eb6a692e54c8 Mon Sep 17 00:00:00 2001 From: fatihsen20 Date: Sat, 25 Mar 2023 13:35:34 +0300 Subject: [PATCH 07/10] isort fix. 
--- mlxtend/frequent_patterns/tests/test_hmine.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mlxtend/frequent_patterns/tests/test_hmine.py b/mlxtend/frequent_patterns/tests/test_hmine.py index 90d288256..445ee00f9 100644 --- a/mlxtend/frequent_patterns/tests/test_hmine.py +++ b/mlxtend/frequent_patterns/tests/test_hmine.py @@ -6,8 +6,8 @@ import unittest -import pandas as pd import numpy as np +import pandas as pd from test_fpbase import ( FPTestEdgeCases, FPTestErrors, @@ -16,9 +16,7 @@ FPTestEx3All, ) -from mlxtend.frequent_patterns import hmine -from mlxtend.frequent_patterns import fpgrowth -from mlxtend.frequent_patterns import apriori +from mlxtend.frequent_patterns import apriori, fpgrowth, hmine class TestEdgeCases(unittest.TestCase, FPTestEdgeCases): From 32f4b3c2f6ba12f1e0d6c48dbc9de7120d54c352 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 25 Mar 2023 12:36:51 -0500 Subject: [PATCH 08/10] update changelog and attribution --- docs/sources/CHANGELOG.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 40a7a48a1..dc1688ac3 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -19,18 +19,15 @@ Version 0.22.0dev (TBD) ##### Changes -- When `ExhaustiveFeatureSelector` is run with `n_jobs == 1`, joblib is now disabled, which enables more immediate (live) feedback when the `verbose` mode is enabled. [#985](https://github.com/rasbt/mlxtend/pull/985) via [Nima Sarajpoor] -- Disabled unnecessary warning in EnsembleVoteClassifier [#941](https://github.com/rasbt/mlxtend/issues/941) -- Fixed various documentation issues [#849] [#951] -- Imported hmine for `mlexxtend.frequent_patterns.__init__` and added the `__all__` list. +- When `ExhaustiveFeatureSelector` is run with `n_jobs == 1`, joblib is now disabled, which enables more immediate (live) feedback when the `verbose` mode is enabled. 
([#985](https://github.com/rasbt/mlxtend/pull/985) via [Nima Sarajpoor](https://github.com/NimaSarajpoor)) +- Disabled unnecessary warning in EnsembleVoteClassifier ([#941](https://github.com/rasbt/mlxtend/issues/941)) +- Fixed various documentation issues ([#849](https://github.com/rasbt/mlxtend/issues/849) and [#951](https://github.com/rasbt/mlxtend/issues/951) via [Lekshmanan Natarajan](https://github.com/zuari1993)) ##### New Features and Enhancements - The `mlxtend.frequent_patterns.association_rules` function has a new metric - Zhangs Metric, which measures both association and dissociation. ([#980](https://github.com/rasbt/mlxtend/pull/980)) - Internal `fpmax` code improvement that avoids casting a sparse DataFrame into a dense NumPy array. ([#1000](https://github.com/rasbt/mlxtend/pull/1000) via [Tim Kellogg](https://github.com/tkellogg)) -- Added `mlxtend.frequent_patterns.hmine` for mining frequent itemsets using the H-Mine algorithm. -- Added `mlxtend.frequent_patterns.test.hmine` for testing the H-Mine algorithm. -- Added `docs.sources.user_guide.frequent_patterns.hmine` for the H-Mine user guide. +- Added `mlxtend.frequent_patterns.hmine` algorithm and documentation for mining frequent itemsets using the H-Mine algorithm. 
([#1020](https://github.com/rasbt/mlxtend/pull/1020) via [Fatih Sen](https://github.com/fatihsen20)) ### Version 0.21.0 (09/17/2022) From 95bf8a509593d70979318e959b84c890e193c391 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 25 Mar 2023 12:44:06 -0500 Subject: [PATCH 09/10] minor cosmetics --- mlxtend/frequent_patterns/hmine.py | 73 +++++++++++-------- mlxtend/frequent_patterns/tests/test_hmine.py | 56 +++++++++++--- 2 files changed, 88 insertions(+), 41 deletions(-) diff --git a/mlxtend/frequent_patterns/hmine.py b/mlxtend/frequent_patterns/hmine.py index 1535e39a7..2f02aad41 100644 --- a/mlxtend/frequent_patterns/hmine.py +++ b/mlxtend/frequent_patterns/hmine.py @@ -12,11 +12,7 @@ def hmine( - df, - min_support=0.5, - use_colnames=False, - max_len=None, - verbose=0 + df, min_support=0.5, use_colnames=False, max_len=None, verbose=0 ) -> pd.DataFrame: """ Get frequent itemsets from a one-hot DataFrame @@ -87,7 +83,7 @@ def hmine( "number within the interval `(0, 1]`. " "Got %s." % min_support ) - # Calculate the minimum support based on the number of transactions(Abs. 
support) + # Calculate the minimum support based on the number of transactions (absolute support) minsup = math.ceil(min_support * len(df)) is_sparse = False @@ -105,7 +101,9 @@ def hmine( is_sparse single_items = np.array(df.columns) itemsets_shape = itemsets.shape[0] - itemsets, single_items, single_items_support = itemset_optimisation(itemsets, single_items, minsup) + itemsets, single_items, single_items_support = itemset_optimisation( + itemsets, single_items, minsup + ) numeric_single_items = np.arange(len(single_items)) frequent_itemsets = {} for item in numeric_single_items: @@ -115,31 +113,39 @@ def hmine( if max_len == 1: continue # Recursive call to find frequent itemsets - frequent_itemsets = hmine_driver([item], itemsets, minsup, - itemsets_shape, max_len, verbose, - single_items, frequent_itemsets) + frequent_itemsets = hmine_driver( + [item], + itemsets, + minsup, + itemsets_shape, + max_len, + verbose, + single_items, + frequent_itemsets, + ) # Convert the dictionary to a DataFrame res_df = pd.DataFrame([frequent_itemsets.values(), frequent_itemsets.keys()]).T res_df.columns = ["support", "itemsets"] if not use_colnames: mapping = {item: idx for idx, item in enumerate(df.columns)} - res_df["itemsets"] = res_df["itemsets"].apply(lambda x: frozenset([mapping[i] for i in x])) + res_df["itemsets"] = res_df["itemsets"].apply( + lambda x: frozenset([mapping[i] for i in x]) + ) return res_df def itemset_optimisation( - itemsets: np.array, - single_items: np.array, - minsup: int, + itemsets: np.array, + single_items: np.array, + minsup: int, ) -> tuple: - """ Downward-closure property of H-Mine algorithm. - Optimises the itemsets matrix by removing items that do not - meet the minimum support(For more info, see - http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/). + Optimizes the itemsets matrix by removing items that do not + meet the minimum support. 
(For more info, see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/) Args: itemsets (np.array): matrix of bools or binary @@ -169,16 +175,15 @@ def hmine_driver( max_len: int, verbose: int, single_items: np.array, - frequent_itemsets: dict + frequent_itemsets: dict, ) -> dict: - """ Driver function for the hmine algorithm. Recursively generates frequent itemsets. Also works for sparse matrix. egg: item = [1] -> [1,2] -> [1,2,3] -> [1,2,4] -> [1,2,5] - For more info, see:( - http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/) + For more info, see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/ Args: item (list): list of items @@ -202,25 +207,31 @@ def hmine_driver( suffixes = suffixes[np.nonzero(suffixes > item[-1])[0]] if verbose: - print(f"{len(suffixes)} itemset(s) from the suffixes on item(s) (%s)" % (", ".join(single_items[item]))) + print( + f"{len(suffixes)} itemset(s) from the suffixes on item(s) ({', '.join(single_items[item])})" + ) for suffix in suffixes: new_item = item.copy() new_item.append(suffix) supp = initial_supports[suffix] / itemsets_shape frequent_itemsets[frozenset(single_items[new_item])] = supp # Recursive call to find frequent itemsets - frequent_itemsets = hmine_driver(new_item, projected_itemsets, minsup, - itemsets_shape, max_len, verbose, - single_items, frequent_itemsets) + frequent_itemsets = hmine_driver( + new_item, + projected_itemsets, + minsup, + itemsets_shape, + max_len, + verbose, + single_items, + frequent_itemsets, + ) return frequent_itemsets -def create_projected_itemsets( - item: list, - itemsets: np.array -) -> np.array: +def create_projected_itemsets(item: list, itemsets: np.array) -> np.array: """ - Creates the projected itemsets for the given item.(For more info, see + Creates the projected itemsets for the given item. 
(For more info, see http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/) Args: diff --git a/mlxtend/frequent_patterns/tests/test_hmine.py b/mlxtend/frequent_patterns/tests/test_hmine.py index 445ee00f9..ef6e3b600 100644 --- a/mlxtend/frequent_patterns/tests/test_hmine.py +++ b/mlxtend/frequent_patterns/tests/test_hmine.py @@ -1,6 +1,5 @@ -# Sebastian Raschka 2014-2022 # mlxtend Machine Learning Library Extensions -# Author: Sebastian Raschka +# Author: Fatih Sen # # License: BSD 3 clause @@ -96,13 +95,51 @@ def setUp(self): class TestCorrect(unittest.TestCase): def setUp(self): - self.one_ary = np.array([ - [False, False, False, True, False, True, True, True, True, False, True], - [False, False, True, True, False, True, False, True, True, False, True], - [True, False, False, True, False, True, True, False, False, False, False], - [False, True, False, False, False, True, True, False, False, True, True], - [False, True, False, True, True, True, False, False, True, False, False], - ]) + self.one_ary = np.array( + [ + [False, False, False, True, False, True, True, True, True, False, True], + [False, False, True, True, False, True, False, True, True, False, True], + [ + True, + False, + False, + True, + False, + True, + True, + False, + False, + False, + False, + ], + [ + False, + True, + False, + False, + False, + True, + True, + False, + False, + True, + True, + ], + [ + False, + True, + False, + True, + True, + True, + False, + False, + True, + False, + False, + ], + ] + ) self.cols = [ "Apple", "Corn", @@ -119,7 +156,6 @@ def setUp(self): self.df = pd.DataFrame(self.one_ary, columns=self.cols) def test_compare_correct(self): - expect = pd.DataFrame( [ [0.8, np.array([3])], From 21e038dd9fc4261eceb4e94912a418cfed5ad976 Mon Sep 17 00:00:00 2001 From: rasbt Date: Mon, 27 Mar 2023 11:16:05 -0500 Subject: [PATCH 10/10] little doc update --- .../user_guide/frequent_patterns/hmine.ipynb | 134 ++++++++++++++---- 1 file changed, 103 insertions(+), 31 
deletions(-) diff --git a/docs/sources/user_guide/frequent_patterns/hmine.ipynb b/docs/sources/user_guide/frequent_patterns/hmine.ipynb index f6300bc2e..35de81b08 100644 --- a/docs/sources/user_guide/frequent_patterns/hmine.ipynb +++ b/docs/sources/user_guide/frequent_patterns/hmine.ipynb @@ -1,15 +1,13 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## hmine: Frequent itemsets via the Hmine algorithm" + "## `hmine`: Frequent itemsets via the H-mine algorithm" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -17,7 +15,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -25,7 +22,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -33,25 +29,25 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "H-mine [1] (memory-based hyperstructure mining of frequent patterns) is developed the method is extended to handle large and or dense databases." + "H-mine [1] (memory-based hyperstructure mining of frequent patterns) is a data mining algorithm used for frequent itemset mining -- the process of finding frequently occurring patterns in large transactional datasets. \n", + "\n", + "H-mine is an improvement over the Apriori and FP-Growth algorithms, offering better performance in terms of time and space complexity. It achieves this by using the H-struct data structure and a more efficient search space traversal method." ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "H-struct, and a new mining algorithm, **H-mine** , which takes advantage of this data structure and dynamically adjusts links in the mining process.\n", - "A distinct feature of this method is that it has very limited and precisely predictable space overhead and runs really fast in memory-based setting. 
Moreover, it can be scaled up\n", + "H-mine improves upon the FP-Growth algorithm by introducing a novel data structure called the H-struct. The H-struct is a hybrid data structure that combines the benefits of both horizontal and vertical data layouts, making it more efficient for frequent itemset mining.\n", + "\n", + "A distinct feature of this method is that it has very limited and precisely predictable space overhead and runs really fast in memory-based settings. Moreover, it can be scaled up\n", "to very large databases by database partitioning, and when the data set becomes dense, (conditional) FP-trees can be constructed dynamically as part of the mining process." ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -59,7 +55,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -67,7 +62,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -78,7 +72,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -86,7 +79,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -383,7 +375,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -505,19 +496,17 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 2 -- Hmine versus Apriori and FPGrowth" + "## Example 2 -- H-Mine versus Apriori and FP-Growth" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Since the `hmine` algorithm is a memory-based algorithm, it can be magnitudes faster than the alternative Apriori algorithm for large datasets. However, it can be much slower than the FpGrowth algorithm. In the following example, we compare the performance of `hmine` with the `apriori` and `fpgrowth` algorithms on a large dataset." 
Since the `hmine` algorithm is a memory-based algorithm, it can be orders of magnitude faster
Also supports\n", + " DataFrames with sparse data; for more info, please\n", + " see https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#sparse-data-structures.\n", + "\n", + " Please note that the old pandas SparseDataFrame format\n", + " is no longer supported in mlxtend >= 0.17.2.\n", + "\n", + " The allowed values are either 0/1 or True/False.\n", + " For example,\n", + "\n", + " ```\n", + " Apple Bananas Beer Chicken Milk Rice\n", + " 0 True False True True False True\n", + " 1 True False True False False True\n", + " 2 True False True False False False\n", + " 3 True True False False False False\n", + " 4 False False True True True True\n", + " 5 False False True False True True\n", + " 6 False False True False True False\n", + " 7 True True False False False False\n", + " ```\n", + "\n", + "\n", + "- `min_support` : float (default: 0.5)\n", + "\n", + " A float between 0 and 1 for minimum support of the itemsets returned.\n", + " The support is computed as the fraction\n", + " transactions_where_item(s)_occur / total_transactions.\n", + "\n", + "\n", + "- `use_colnames` : bool (default: False)\n", + "\n", + " If true, uses the DataFrames' column names in the returned DataFrame\n", + " instead of column indices.\n", + "\n", + "\n", + "- `max_len` : int (default: None)\n", + "\n", + " Maximum length of the itemsets generated. 
If `None` (default) all\n", + " possible itemsets lengths are evaluated.\n", + "\n", + "\n", + "- `verbose` : int (default: 0)\n", + "\n", + " Shows the stages of conditional tree generation.\n", + "\n", + "**Returns**\n", + "\n", + "pandas DataFrame with columns ['support', 'itemsets'] of all itemsets\n", + " that are >= `min_support` and < than `max_len`\n", + " (if `max_len` is not None).\n", + " Each itemset in the 'itemsets' column is of type `frozenset`,\n", + " which is a Python built-in type that behaves similarly to\n", + " sets except that it is immutable\n", + " (For more info, see\n", + " https://docs.python.org/3.6/library/stdtypes.html#frozenset).\n", + "\n", + "**Examples**\n", + "\n", + "For usage examples, please see\n", + " http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/hmine/\n", + "\n", + "\n" + ] + } + ], "source": [ "with open('../../api_modules/mlxtend.frequent_patterns/hmine.md', 'r') as f:\n", " print(f.read())" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -946,10 +1019,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "orig_nbformat": 4 + "version": "3.10.6" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }