From 1e975d94bf09c596acf58380d882b46dc839242b Mon Sep 17 00:00:00 2001 From: hiepnguyen034 Date: Tue, 4 May 2021 17:23:46 +0700 Subject: [PATCH 1/4] Add advanced discretisation strategies (#149) --- RELEASE.md | 5 +- causalnex/discretiser/__init__.py | 10 +- causalnex/discretiser/abstract_discretiser.py | 114 +++++ causalnex/discretiser/discretiser_strategy.py | 300 +++++++++++++ causalnex/utils/__init__.py | 0 causalnex/utils/decision_tree_tools.py | 63 +++ setup.py | 1 + test_requirements.txt | 1 + tests/discretiser/conftest.py | 99 +++++ tests/discretiser/test_base.py | 48 ++ tests/discretiser/test_decision_tree.py | 418 ++++++++++++++++++ tests/discretiser/test_mdlp.py | 108 +++++ 12 files changed, 1164 insertions(+), 3 deletions(-) create mode 100644 causalnex/discretiser/abstract_discretiser.py create mode 100644 causalnex/discretiser/discretiser_strategy.py create mode 100644 causalnex/utils/__init__.py create mode 100644 causalnex/utils/decision_tree_tools.py create mode 100644 tests/discretiser/conftest.py create mode 100644 tests/discretiser/test_base.py create mode 100644 tests/discretiser/test_decision_tree.py create mode 100644 tests/discretiser/test_mdlp.py diff --git a/RELEASE.md b/RELEASE.md index b9f0969..9e8e5f3 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,10 +1,11 @@ # Upcoming release # Release 0.10.0 - * Add utility function to extract Markov blanket from a Bayesian Network * Support receiving a list of inputs for `InferenceEngine` with a multiprocessing option -* Fixes cyclical import of `causalnex.plots`, as per #106. +* Fixes cyclical import of `causalnex.plots`, as per #106 +* Add supervised discretisation strategies using Decision Tree and MDLP algorithms + # Release 0.9.2 * Remove Boston housing dataset from "sklearn tutorial", see #91 for more information. 
diff --git a/causalnex/discretiser/__init__.py b/causalnex/discretiser/__init__.py index 2dd643b..c4b29bc 100644 --- a/causalnex/discretiser/__init__.py +++ b/causalnex/discretiser/__init__.py @@ -30,6 +30,14 @@ ``causalnex.discretiser`` provides functionality to discretise data. """ -__all__ = ["Discretiser"] +__all__ = [ + "Discretiser", + "DecisionTreeSupervisedDiscretiserMethod", + "MDLPSupervisedDiscretiserMethod", +] from .discretiser import Discretiser +from .discretiser_strategy import ( + DecisionTreeSupervisedDiscretiserMethod, + MDLPSupervisedDiscretiserMethod, +) diff --git a/causalnex/discretiser/abstract_discretiser.py b/causalnex/discretiser/abstract_discretiser.py new file mode 100644 index 0000000..031000a --- /dev/null +++ b/causalnex/discretiser/abstract_discretiser.py @@ -0,0 +1,114 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
class AbstractSupervisedDiscretiserMethod(BaseEstimator, ABC):
    """
    Base class for supervised ("advanced") discretisation methods.

    Subclasses implement ``fit`` and store, for every discretised feature, an array of
    split points in ``map_thresholds``. ``transform`` then bins each column that has
    learned split points with ``np.digitize`` and passes every other column through.
    """

    def __init__(self):
        # feature name -> array of split points learned by ``fit``
        self.map_thresholds = {}
        # feature names given to the last ``fit`` call; ``None`` until fitted
        self.feat_names = None

    @abstractmethod
    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ):
        """
        Discretise the features in `feat_names` in such a way that maximises the prediction of `target`.

        Args:
            feat_names (List[str]): List of feature names to be discretised.
            target (str): Name of the target variable - the node that adjusts how `feat_names` will be discretised
            dataframe: The full dataset prior to discretisation.
            target_continuous (bool): Whether the target variable is continuous
        Raises:
            NotImplementedError: AbstractSupervisedDiscretiserMethod should not be called directly

        """
        raise NotImplementedError("The method is not implemented")

    def _transform_one_column(self, dataframe_one_column: pd.DataFrame) -> np.ndarray:
        """
        Given one "original" feature (continuous), discretise it.

        Args:
            dataframe_one_column: dataframe with a single continuous feature, to be transformed into discrete
        Returns:
            Discrete feature, as an np.ndarray of shape (len(df),). Columns without
            learned thresholds are returned unchanged.
        """
        column = list(dataframe_one_column.columns)[0]
        values = dataframe_one_column.values.reshape(-1)

        if column in self.map_thresholds:
            return np.digitize(values, self.map_thresholds[column])

        # ``feat_names`` is still None when ``transform`` is called before ``fit``;
        # treat that as "no known features" instead of failing on ``in None``.
        known_features = self.feat_names if self.feat_names is not None else ()
        if column not in known_features:
            logging.warning(
                "%s is not in feat_names. The column is left unchanged", column
            )
        return values

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Given one "original" dataframe, discretise it.

        Args:
            data: dataframe with continuous features, to be transformed into discrete
        Returns:
            discretised version of the input data, as a new dataframe with the same columns
        """
        outputs = {col: self._transform_one_column(data[[col]]) for col in data.columns}
        return pd.DataFrame.from_dict(outputs)

    def fit_transform(self, *args, **kwargs):
        """
        Raises:
            NotImplementedError: fit_transform is not implemented
        """
        raise NotImplementedError(
            "fit_transform is not implemented. Please use .fit() and .transform() separately"
        )
class DecisionTreeSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod):
    """Allows the discretisation of continuous features based on the split thresholds of either
    sklearn's DecisionTreeRegressor or DecisionTreeClassifier.
    DecisionTreeSupervisedDiscretiserMethod is inherited from AbstractSupervisedDiscretiserMethod.
    When instantiated, we have an object with .fit method to learn discretisation thresholds from data
    and .transform method to process the input.


    Example:
    ::
        >>> import pandas as pd
        >>> import numpy as np
        >>> from causalnex.discretiser.discretiser_strategy import DecisionTreeSupervisedDiscretiserMethod
        >>> from sklearn.datasets import load_iris
        >>> iris = load_iris()
        >>> X, y = iris["data"], iris["target"]
        >>> names = iris["feature_names"]
        >>> data = pd.DataFrame(X, columns=names)
        >>> data["target"] = y
        >>> dt_multi = DecisionTreeSupervisedDiscretiserMethod(
        ...     mode="multi", tree_params={"max_depth": 3, "random_state": 2020}
        ... )
        >>> tree_discretiser = dt_multi.fit(
        ...     feat_names=[
        ...         "sepal length (cm)",
        ...         "sepal width (cm)",
        ...         "petal length (cm)",
        ...         "petal width (cm)",
        ...     ],
        ...     dataframe=data,
        ...     target="target",
        ...     target_continuous=False,
        ... )
        >>> discretised_data = tree_discretiser.transform(data[["petal width (cm)"]])
        >>> discretised_data.values.ravel()
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
               2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

    """

    def __init__(
        self,
        mode: str = "single",
        split_unselected_feat: bool = False,
        tree_params: Dict[str, Any] = None,
    ):
        """
        This Discretiser Method uses Decision Trees to predict the target.
        The cutting points of the Decision Tree become the chosen discretisation thresholds.

        If the target is a continuous variable, we fit a `DecisionTreeRegressor` to discretise the data.
        Otherwise, we fit a Classifier.

        Args:
            mode (str): Either 'single' or 'multi'.
                - if single, Train a univariate decision tree for each continuous variable being discretised.
                The splitting points of the decision tree become discretiser fixed points
                - if multi, Train a decision tree over all the variables passed.
                The splitting points of each variable used in the Decision tree become the thresholds for discretisation
            split_unselected_feat (bool): only applicable if self.mode = 'multi'.
                - if True, features not selected by the decision tree will be discretised using 'single' mode
                with the same tree parameters
                - if False, features not selected by the decision tree will be left unchanged
            tree_params: keyword arguments, which are parameters
                used for `sklearn.tree.DecisionTreeClassifier`/`sklearn.tree.DecisionTreeRegressor`.
                Defaults to ``{"max_depth": 2}`` when omitted or empty.
        Raises:
            KeyError: if an incorrect argument is passed
        """
        # The base initialiser already creates ``map_thresholds`` and ``feat_names``.
        super().__init__()
        if mode not in ["single", "multi"]:
            raise KeyError(
                f"mode, `{mode}` is not valid, please choose in ['single', 'multi']"
            )
        self.tree_params = tree_params or {"max_depth": 2}
        self.mode = mode
        self.split_unselected_feat = split_unselected_feat

    @staticmethod
    def _single_feature_thresholds(template, dataframe, feat, target):
        """Fit a fresh copy of the unfitted `template` tree on one feature and return its split points."""
        dtree = deepcopy(template)
        dtree.fit(dataframe[[feat]], dataframe[[target]])
        return extract_thresholds_from_dtree(dtree, 1)[0]

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ) -> "DecisionTreeSupervisedDiscretiserMethod":
        """
        The fit method allows DecisionTrees to learn split thresholds from the input data

        Args:
            feat_names (List[str]): a list of features to be discretised
            target (str): name of the variable that is going to be used as target for the decision tree
            dataframe (pd.DataFrame): pandas dataframe of input data
            target_continuous (bool): a boolean that indicates if the target variable is continuous

        Returns:
            self: DecisionTreeSupervisedDiscretiserMethod object with learned split thresholds from the decision tree
        """
        tree_class = (
            DecisionTreeRegressor if target_continuous else DecisionTreeClassifier
        )
        # Unfitted template: every tree below is trained on its own copy, so no
        # already-fitted tree structure is ever deep-copied.
        template = tree_class(**self.tree_params)
        self.feat_names = feat_names
        self.map_thresholds = {}

        if self.mode == "single":
            for feat in feat_names:
                self.map_thresholds[feat] = self._single_feature_thresholds(
                    template, dataframe, feat, target
                )

        elif self.mode == "multi":
            dtree = deepcopy(template)
            dtree.fit(dataframe[feat_names], dataframe[[target]])
            threshold_list = extract_thresholds_from_dtree(dtree, len(feat_names))
            self.map_thresholds = dict(zip(feat_names, threshold_list))

            # Features the multivariate tree never split on have no thresholds.
            unselected = [
                feat
                for feat, thresholds in self.map_thresholds.items()
                if thresholds.size == 0
            ]

            if self.split_unselected_feat:
                for feat in unselected:
                    self.map_thresholds[feat] = self._single_feature_thresholds(
                        template, dataframe, feat, target
                    )
            else:
                for feat in unselected:
                    del self.map_thresholds[feat]
                if unselected:
                    logging.warning(
                        "%s not selected by the decision tree. No discretisation thresholds were learned. "
                        "Consider setting split_unselected_feat = True or discretise them using single mode",
                        unselected,
                    )

        return self


class MDLPSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod):
    """Allows discretisation of continuous features using the MDLP algorithm

    Example:
    ::
        >>> import pandas as pd
        >>> import numpy as np
        >>> from causalnex.discretiser.discretiser_strategy import MDLPSupervisedDiscretiserMethod
        >>> from sklearn.datasets import load_iris
        >>> iris = load_iris()
        >>> X, y = iris["data"], iris["target"]
        >>> names = iris["feature_names"]
        >>> data = pd.DataFrame(X, columns=names)
        >>> data["target"] = y
        >>> discretiser = MDLPSupervisedDiscretiserMethod(
        ...     {"min_depth": 0, "random_state": 2020, "min_split": 1e-3, "dtype": int}
        ... )
        >>> discretiser.fit(
        ...     feat_names=["sepal length (cm)"],
        ...     dataframe=data,
        ...     target="target",
        ...     target_continuous=False,
        ... )
        >>> discretised_data = discretiser.transform(data[["sepal length (cm)"]])
        >>> discretised_data.values.ravel()
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 2, 2, 1, 2,
               1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 0, 2, 2, 2,
               1, 1, 1, 2, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2,
               2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2])

    """

    def __init__(
        self,
        mdlp_args: Dict[str, Any] = None,
    ):
        """
        This method of discretisation applies MDLP to discretise the data

        Args:
            mdlp_args: keyword arguments forwarded to `mdlp.discretization.MDLP`.
                Defaults to ``{"min_depth": 0, "min_split": 1e-3, "dtype": int}``
                when omitted or empty.
        Raises:
            ImportError: if mdlp-discretization is not installed successfully
        """
        super().__init__()
        if MDLP is None:
            raise ImportError(
                "mdlp-discretisation was not installed and imported successfully"
            )
        mdlp_args = mdlp_args or {"min_depth": 0, "min_split": 1e-3, "dtype": int}
        self.mdlp_args = mdlp_args
        self.map_feat_transformer = {}
        # Unfitted template; ``fit`` trains an independent copy per feature.
        self.mdlp = MDLP(**mdlp_args)

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ) -> "MDLPSupervisedDiscretiserMethod":
        """
        The fit method allows MDLP to learn split thresholds from the input data.
        The target feature cannot be continuous

        Args:
            feat_names (List[str]): a list of features to be discretised
            target (str): name of the variable that is going to be used as target for MDLP
            dataframe (pd.DataFrame): pandas dataframe of input data
            target_continuous (bool): boolean that indicates if target variable is continuous.

        Returns:
            self: MDLPSupervisedDiscretiserMethod object with learned split thresholds from mdlp algorithm

        Raises:
            ValueError: if the target is continuous
        """
        # Validate before touching any state so a rejected call leaves a
        # previously fitted discretiser intact.
        if target_continuous:
            raise ValueError(
                "Target variable should not be continuous when using MDLP."
            )

        self.feat_names = feat_names
        self.map_feat_transformer = {}
        # Drop thresholds from any earlier fit; otherwise features that are no
        # longer in ``feat_names`` would still be discretised by ``transform``.
        self.map_thresholds = {}

        for feat in feat_names:
            mdlp = deepcopy(self.mdlp)

            mdlp.fit(dataframe[[feat]], dataframe[[target]])
            self.map_thresholds[feat] = mdlp.cut_points_[0]

        return self
def extract_thresholds_from_dtree(
    dtree: "Union[DecisionTreeClassifier, DecisionTreeRegressor]",
    length_df: int,
) -> List[np.ndarray]:
    """A helper function that extracts the decision thresholds of a fitted decision tree

    Args:
        dtree: A fitted decision tree model object; its ``tree_.threshold`` and
            ``tree_.feature`` arrays are read.
        length_df (int): number of feature columns the tree was fitted on, i.e. how
            many feature indices (``0 .. length_df - 1``) to report thresholds for

    Returns:
        a list with one numpy array per feature index holding the sorted, unique split
        thresholds used for that feature; the array is empty for a feature the tree
        never split on
    """
    tree_threshold = np.asarray(dtree.tree_.threshold)
    tree_feature = np.asarray(dtree.tree_.feature)

    # ``np.unique`` over an empty selection already yields an empty array, so
    # features that never appear in ``tree_feature`` need no special case.
    return [
        np.unique(tree_threshold[tree_feature == feat]) for feat in range(length_df)
    ]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
class Dummy(AbstractSupervisedDiscretiserMethod):
    """Minimal concrete subclass used to exercise the abstract base class in tests."""

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ):
        raise NotImplementedError("This is not implemented")

    def learn(self, get_iris_data):
        """Call the base-class ``fit`` directly with the iris petal-width feature."""
        base_kwargs = {
            "feat_names": ["petal width (cm)"],
            "dataframe": get_iris_data,
            "target_continuous": False,
            "target": "target",
        }
        super().fit(**base_kwargs)

    def learn_transform(self, get_iris_data):
        """Call the base-class ``fit_transform`` directly with the iris petal-width feature."""
        base_kwargs = {
            "feat_names": ["petal width (cm)"],
            "dataframe": get_iris_data,
            "target_continuous": False,
            "target": "target",
        }
        super().fit_transform(**base_kwargs)


@pytest.fixture
def get_dummy_class():
    """A fresh ``Dummy`` discretiser instance."""
    dummy = Dummy()
    return dummy


@pytest.fixture
def get_iris_data():
    """Iris features as a dataframe with the class label in a ``target`` column."""
    bunch = load_iris()
    frame = pd.DataFrame(bunch["data"], columns=bunch["feature_names"])
    frame["target"] = bunch["target"]
    return frame


@pytest.fixture
def get_diabete_data():
    """Diabetes features as a dataframe with the regression target in a ``target`` column."""
    bunch = load_diabetes()
    frame = pd.DataFrame(bunch["data"], columns=bunch["feature_names"])
    frame["target"] = bunch["target"]
    return frame


@pytest.fixture
def categorical_data(get_iris_data):
    """One iris feature plus its categorical target."""
    wanted = ["petal width (cm)", "target"]
    return get_iris_data[wanted]


@pytest.fixture
def continuous_data(get_diabete_data):
    """One diabetes feature plus its continuous target."""
    wanted = ["s6", "target"]
    return get_diabete_data[wanted]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
class TestBaseClass:
    """Checks that the abstract discretiser refuses to fit or fit_transform."""

    def test_fit_not_implemented(self, get_iris_data, get_dummy_class):
        dummy = get_dummy_class
        fit_kwargs = {
            "feat_names": ["petal width (cm)"],
            "dataframe": get_iris_data,
            "target_continuous": False,
            "target": "target",
        }

        # Base-class ``fit`` reached through ``super()``.
        with pytest.raises(NotImplementedError):
            dummy.learn(get_iris_data)

        # The subclass' own ``fit`` override.
        with pytest.raises(NotImplementedError):
            dummy.fit(**fit_kwargs)

    def test_fit_transform_not_implemented(self, get_iris_data, get_dummy_class):
        dummy = get_dummy_class
        with pytest.raises(NotImplementedError):
            dummy.learn_transform(get_iris_data)
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from causalnex.discretiser.discretiser_strategy import ( + DecisionTreeSupervisedDiscretiserMethod, +) + + +class TestDecisionTree: + def test_single_continuous(self, continuous_data): + diabete = continuous_data.copy(deep=True) + + dt_single = DecisionTreeSupervisedDiscretiserMethod( + tree_params={"max_depth": 2}, + mode="single", + ) + tree_discretiser = dt_single.fit( + feat_names=["s6"], + dataframe=diabete, + target_continuous=True, + target="target", + ) + discretiser_output = tree_discretiser.transform(diabete[["s6"]]).values + + ground_truth = np.array( + [ + [1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 3, 0, 0], + [1, 2, 0, 2, 1, 0, 1, 1, 0, 1, 1, 1, 1], + [1, 2, 1, 0, 2, 1, 0, 0, 0, 1, 1, 1, 1], + [1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1], + [1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1], + [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1], + [1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 1, 1], + [0, 1, 0, 1, 2, 2, 1, 0, 1, 2, 1, 1, 1], + [3, 2, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 2], + [2, 0, 1, 1, 0, 2, 1, 1, 2, 1, 1, 3, 1], + [0, 1, 0, 1, 2, 1, 0, 1, 1, 2, 1, 1, 2], + [0, 1, 0, 1, 0, 2, 1, 2, 1, 1, 0, 2, 3], + [1, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1], + [1, 1, 2, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2], + [2, 1, 0, 1, 1, 1, 0, 2, 2, 2, 1, 1, 0], + [0, 1, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 0], + [2, 2, 1, 1, 1, 2, 1, 2, 0, 0, 1, 0, 0], + [0, 2, 1, 2, 2, 1, 2, 2, 1, 1, 0, 2, 1], + [1, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1, 0], + [2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], + [1, 1, 1, 0, 2, 0, 0, 1, 0, 1, 0, 
0, 1], + [1, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 1, 1], + [1, 1, 0, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1], + [2, 1, 1, 1, 2, 1, 1, 2, 1, 0, 0, 2, 1], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3], + [0, 0, 1, 2, 1, 0, 0, 0, 2, 2, 1, 2, 1], + [2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1], + [0, 0, 1, 1, 0, 2, 1, 0, 1, 1, 0, 0, 0], + [2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 1, 2, 2], + [2, 1, 1, 0, 1, 2, 1, 0, 1, 2, 1, 1, 1], + [2, 1, 1, 1, 2, 1, 1, 0, 1, 0, 2, 1, 3], + [0, 1, 1, 2, 1, 1, 0, 0, 1, 2, 1, 1, 1], + ] + ) # ground truth is generated by manually use DecionTree to extract thresholds + + assert (ground_truth == discretiser_output.reshape(-1, 13)).all() + + def test_single_categorical(self, categorical_data): + df = categorical_data.copy(deep=True) + ground_truth = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2], + [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) # ground truth is generated by manually use DecionTree to extract thresholds + + dt_single = DecisionTreeSupervisedDiscretiserMethod( + tree_params={"max_depth": 2}, + mode="single", + ) + tree_discretiser = dt_single.fit( + feat_names=["petal width (cm)"], + dataframe=df, + target_continuous=False, + target="target", + ) + discretiser_output = tree_discretiser.transform(df[["petal width (cm)"]]).values + assert (ground_truth == discretiser_output.reshape(-1, 15)).all() + + def test_invalid_mode(self): + with pytest.raises(KeyError): + DecisionTreeSupervisedDiscretiserMethod( + tree_params={"max_depth": 2}, mode="invalid" + ) + + def test_transform_no_feature(self, get_iris_data, caplog): + ground_truth = get_iris_data[["sepal 
width (cm)"]] + dt_multi = DecisionTreeSupervisedDiscretiserMethod( + mode="multi", + split_unselected_feat=False, + tree_params={"max_depth": 3, "random_state": 2020}, + ) + tree_discretiser = dt_multi.fit( + feat_names=["sepal length (cm)", "petal length (cm)"], + dataframe=get_iris_data, + target_continuous=False, + target="target", + ) + + output = tree_discretiser.transform(get_iris_data[["sepal width (cm)"]]) + + assert "The column is left unchanged" in caplog.text + assert all(ground_truth == output) + + def test_keep_unselected_feature(self, get_iris_data): + ground_truth = np.array( + [ + [4, 2, 3, 3, 4, 6, 4, 4, 1, 3, 4, 4, 2, 2, 6], + [6, 6, 4, 5, 5, 4, 4, 4, 3, 4, 2, 4, 4, 4, 3], + [3, 4, 6, 6, 3, 3, 4, 4, 2, 4, 4, 0, 3, 4, 5], + [2, 5, 3, 4, 3, 3, 3, 3, 0, 1, 1, 3, 0, 1, 1], + [0, 2, 0, 1, 1, 3, 2, 1, 0, 1, 3, 1, 1, 1, 1], + [2, 1, 2, 1, 1, 0, 0, 1, 1, 2, 4, 3, 0, 2, 1], + [1, 2, 1, 0, 1, 2, 1, 1, 1, 1, 3, 1, 2, 1, 2], + [2, 1, 1, 1, 4, 3, 1, 2, 1, 1, 3, 2, 5, 1, 0], + [3, 1, 1, 1, 3, 3, 1, 2, 1, 2, 1, 5, 1, 1, 1], + [2, 4, 3, 2, 3, 3, 3, 1, 3, 3, 2, 1, 2, 4, 2], + ] + ) # ground truth is generated by manually use DecionTree to extract thresholds + + dt_multi = DecisionTreeSupervisedDiscretiserMethod( + tree_params={"max_depth": 3, "random_state": 2020}, + mode="multi", + split_unselected_feat=True, + ) + tree_discretiser = dt_multi.fit( + feat_names=[ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ], + dataframe=get_iris_data, + target_continuous=False, + target="target", + ) + + output = tree_discretiser.transform(get_iris_data[["sepal width (cm)"]]).values + assert (ground_truth == output.reshape(-1, 15)).all() + + def test_multi_fit(self, get_iris_data): + ground_truth_petal_length = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2], + [2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2], + [2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) # ground truth is generated by manually use DecionTree to extract thresholds + + ground_truth_petal_width = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2], + [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) # ground truth is generated by manually use DecionTree to extract thresholds + + iris = get_iris_data.copy(deep=True) + + dt_multi = DecisionTreeSupervisedDiscretiserMethod( + tree_params={"max_depth": 3, "random_state": 2020}, mode="multi" + ) + + tree_discretiser = dt_multi.fit( + feat_names=[ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ], + dataframe=iris, + target_continuous=False, + target="target", + ) + + discretiser_petal_length = tree_discretiser.transform( + iris[["petal length (cm)"]] + ).values + discretiser_petal_width = tree_discretiser.transform( + iris[["petal width (cm)"]] + ).values + assert ( + ground_truth_petal_length == discretiser_petal_length.reshape(-1, 15) + ).all() + assert ( + ground_truth_petal_width == discretiser_petal_width.reshape(-1, 15) + ).all() + + def test_no_unselected_feature(self, get_iris_data): + ground_truth = get_iris_data[["sepal width (cm)"]] + dt_multi = DecisionTreeSupervisedDiscretiserMethod( + tree_params={"max_depth": 3, "random_state": 2020}, + mode="multi", + 
split_unselected_feat=False, + ) + tree_discretiser = dt_multi.fit( + feat_names=[ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ], + dataframe=get_iris_data, + target_continuous=False, + target="target", + ) + output = tree_discretiser.transform(get_iris_data[["sepal width (cm)"]]) + + assert all(ground_truth == output) + + def test_default_args(self): + dt_multi = DecisionTreeSupervisedDiscretiserMethod() + params = dt_multi.get_params() + assert params["tree_params"]["max_depth"] == 2 + + def test_transform_all_single(self, get_iris_data): + data = get_iris_data.copy(deep=True) + sepal_length = np.array( + [ + [1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 2], + [2, 1, 1, 2, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0], + [0, 1, 1, 2, 1, 1, 2, 1, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 1, 3, 3, 3, 2, 3, 2, 3, 1, 3, 1], + [1, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 3, 2, 3], + [3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2], + [2, 2, 2, 1, 2, 2, 2, 3, 1, 2, 3, 2, 3, 3, 3], + [3, 1, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 2], + [3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2], + [3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2], + ] + ) + sepal_width = np.array( + [ + [2, 1, 1, 1, 2, 3, 2, 2, 0, 1, 2, 2, 1, 1, 3], + [3, 3, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1], + [1, 2, 3, 3, 1, 1, 2, 2, 1, 2, 2, 0, 1, 2, 2], + [1, 2, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0], + [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0], + [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 0], + [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1], + [1, 0, 0, 0, 2, 1, 0, 1, 0, 0, 1, 1, 2, 0, 0], + [1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 2, 0, 0, 0], + [1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 2, 1], + ] + ) + petal_length = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1], + [1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 
1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2], + [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) + petal_width = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2], + [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) + + dt_multi = DecisionTreeSupervisedDiscretiserMethod( + mode="single", tree_params={"max_depth": 2, "random_state": 2020} + ) + tree_discretiser = dt_multi.fit( + feat_names=[ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ], + dataframe=data, + target="target", + target_continuous=False, + ) + output_df = tree_discretiser.transform(data) + + assert ( + output_df["sepal length (cm)"].values.reshape(-1, 15) == sepal_length + ).all() + assert ( + output_df["sepal width (cm)"].values.reshape(-1, 15) == sepal_width + ).all() + assert ( + output_df["petal length (cm)"].values.reshape(-1, 15) == petal_length + ).all() + assert ( + output_df["petal width (cm)"].values.reshape(-1, 15) == petal_width + ).all() + + def test_transform_all_multi(self, get_iris_data): + data = get_iris_data.copy(deep=True) + sepal_length = data["sepal length (cm)"] + sepal_width = data["sepal width (cm)"] + petal_length = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 
0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2], + [2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2], + [2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) + petal_width = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2], + [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) + + dt_multi = DecisionTreeSupervisedDiscretiserMethod( + mode="multi", tree_params={"max_depth": 3, "random_state": 2020} + ) + tree_discretiser = dt_multi.fit( + feat_names=[ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ], + dataframe=data, + target="target", + target_continuous=False, + ) + output_df = tree_discretiser.transform(data) + + assert (output_df["sepal length (cm)"].values == sepal_length).all() + assert (output_df["sepal width (cm)"].values == sepal_width).all() + assert ( + output_df["petal length (cm)"].values.reshape(-1, 15) == petal_length + ).all() + assert ( + output_df["petal width (cm)"].values.reshape(-1, 15) == petal_width + ).all() diff --git a/tests/discretiser/test_mdlp.py b/tests/discretiser/test_mdlp.py new file mode 100644 index 0000000..730a452 --- /dev/null +++ b/tests/discretiser/test_mdlp.py @@ -0,0 +1,108 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
+from importlib import reload + +import numpy as np +import pytest +from mock import patch + +from causalnex.discretiser import discretiser_strategy +from causalnex.discretiser.discretiser_strategy import MDLPSupervisedDiscretiserMethod + + +class TestMDLP: + def test_output(self, get_iris_data): + ground_truth = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 0], + [0, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2], + [2, 2, 2, 2, 1, 1, 1, 1, 2, 0, 2, 2, 2, 1, 1], + [1, 2, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2], + [2, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2], + [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2], + ] + ) # ground_truth is generated by manually use MDLP to extract thresholds + + discretiser = MDLPSupervisedDiscretiserMethod( + {"min_depth": 0, "random_state": 2020, "min_split": 1e-3, "dtype": int} + ) + + discretiser.fit( + feat_names=["sepal length (cm)"], + dataframe=get_iris_data, + target_continuous=False, + target="target", + ) + + output = discretiser.transform(get_iris_data[["sepal length (cm)"]]).values + assert (output.reshape(-1, 15) == ground_truth).all() + + def test_target_continuous(self, get_iris_data): + + discretiser = MDLPSupervisedDiscretiserMethod( + {"min_depth": 0, "random_state": 17, "min_split": 1e-3, "dtype": int} + ) + + with pytest.raises(ValueError): + discretiser.fit( + feat_names=["sepal length (cm)"], + dataframe=get_iris_data, + target_continuous=True, + target="target", + ) + + def test_warning_import(self, caplog): + with patch.dict("sys.modules", {"mdlp.discretization": None}): + reload(discretiser_strategy) + reload(discretiser_strategy) + assert "MDLP was not imported successfully" in caplog.text + + def test_import_error(self): + with patch.dict("sys.modules", {"mdlp.discretization": None}): + 
reload(discretiser_strategy) + with pytest.raises(ImportError): + discretiser_strategy.MDLPSupervisedDiscretiserMethod( + { + "min_depth": 0, + "random_state": 2020, + "min_split": 1e-3, + "dtype": int, + } + ) + reload(discretiser_strategy) + + def test_default_args(self): + discretiser = MDLPSupervisedDiscretiserMethod() + params = discretiser.get_params() + assert params["mdlp_args"]["min_depth"] == 0 + assert params["mdlp_args"]["min_split"] == 1e-3 + assert params["mdlp_args"]["dtype"] == int From 17f22850d2fe73a219bd165cdacd032b93365fdb Mon Sep 17 00:00:00 2001 From: hiepnguyen034 Date: Tue, 11 May 2021 21:55:33 +0700 Subject: [PATCH 2/4] Feature/sklearn compatibility (#153) Co-authored-by: philip_pilgerstorfer Co-authored-by: Zain Patel --- RELEASE.md | 5 +- causalnex/discretiser/discretiser_strategy.py | 1 + causalnex/network/sklearn/__init__.py | 36 +++ causalnex/network/sklearn/models.py | 294 +++++++++++++++++ tests/conftest.py | 36 +++ tests/discretiser/test_decision_tree.py | 1 - tests/test_network_model.py | 298 ++++++++++++++++++ 7 files changed, 668 insertions(+), 3 deletions(-) create mode 100644 causalnex/network/sklearn/__init__.py create mode 100644 causalnex/network/sklearn/models.py create mode 100644 tests/test_network_model.py diff --git a/RELEASE.md b/RELEASE.md index 9e8e5f3..f900934 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,12 +1,13 @@ # Upcoming release # Release 0.10.0 +* Add supervised discretisation strategies using Decision Tree and MDLP algorithms. +* Add `BayesianNetworkClassifier` an sklearn compatible class for fitting and predicting probabilities in a BN. +* Fixes cyclical import of `causalnex.plots`, as per #106. 
* Add utility function to extract Markov blanket from a Bayesian Network * Support receiving a list of inputs for `InferenceEngine` with a multiprocessing option -* Fixes cyclical import of `causalnex.plots`, as per #106 * Add supervised discretisation strategies using Decision Tree and MDLP algorithms - # Release 0.9.2 * Remove Boston housing dataset from "sklearn tutorial", see #91 for more information. * Update pylint version to 2.7 diff --git a/causalnex/discretiser/discretiser_strategy.py b/causalnex/discretiser/discretiser_strategy.py index 8431ec1..7b50c29 100644 --- a/causalnex/discretiser/discretiser_strategy.py +++ b/causalnex/discretiser/discretiser_strategy.py @@ -225,6 +225,7 @@ class MDLPSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod): >>> ) >>> discretised_data = discretiser.transform(data[["sepal length (cm)"]]) >>> discretised_data.values.ravel() + array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 2, 2, 1, 2, diff --git a/causalnex/network/sklearn/__init__.py b/causalnex/network/sklearn/__init__.py new file mode 100644 index 0000000..b6d8ad2 --- /dev/null +++ b/causalnex/network/sklearn/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +``causalnex.network.sklearn`` provides functionality to learn joint probability +distribution of networks with sklearn compatibility. +""" + +__all__ = ["BayesianNetworkClassifier"] + +from .models import BayesianNetworkClassifier diff --git a/causalnex/network/sklearn/models.py b/causalnex/network/sklearn/models.py new file mode 100644 index 0000000..3e76408 --- /dev/null +++ b/causalnex/network/sklearn/models.py @@ -0,0 +1,294 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module contains the implementation of ``BayesianNetworkClassifier``. 
+ +``BayesianNetworkClassifier`` is a class that supports learning CPDs from input data +and making predictions +""" + +import logging +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin + +from causalnex.discretiser import Discretiser +from causalnex.discretiser.discretiser_strategy import ( + DecisionTreeSupervisedDiscretiserMethod, + MDLPSupervisedDiscretiserMethod, +) +from causalnex.network import BayesianNetwork +from causalnex.structure import StructureModel + + +class BayesianNetworkClassifier(BaseEstimator, ClassifierMixin): + """ + A class that supports discretising features and probability fitting with scikit-learn syntax + + Example: + :: + # Dataset is from https://archive.ics.uci.edu/ml/datasets/student+performance + >>> import pandas as pd + >>> import numpy as np + >>> from sklearn.preprocessing import LabelEncoder + >>> from causalnex.discretiser import Discretiser + >>> from causalnex.network.sklearn import BayesianNetworkClassifier + >>> from sklearn.model_selection import train_test_split + >>> data = pd.read_csv('student-por.csv', delimiter=';') + >>> drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian'] + >>> data = data.drop(columns=drop_col) + >>> non_numeric_columns = list(data.select_dtypes(exclude=[np.number]).columns) + >>> le = LabelEncoder() + >>> for col in non_numeric_columns: + >>> data[col] = le.fit_transform(data[col]) + >>> data["G3"] = Discretiser(method="fixed", + numeric_split_points=[10]).transform(data["G3"].values) + >>> label = data["G3"] + >>> data.drop(['G3'], axis=1, inplace=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + data, label, test_size=0.1, random_state=7) + >>> edge_list = [('address', 'absences'), + ('Pstatus', 'famrel'), + ('Pstatus', 'absences'), + ('studytime', 'G1'), + ('G1', 'G2'), + ('failures', 'absences'), + ('failures', 'G1'), + ('schoolsup', 'G1'), + ('paid', 
'absences'), + ('higher', 'famrel'), + ('higher', 'G1'), + ('internet', 'absences'), + ('G2', 'G3')] + >>> discretiser_param = { + 'absences': {'method':"fixed", + 'numeric_split_points':[1, 10] + }, + 'G1': {'method':"fixed", + 'numeric_split_points':[10] + }, + 'G2': {'method':"fixed", + 'numeric_split_points':[10] + } + } + >>> discretiser_alg = {'absences': 'unsupervised', + 'G1': 'unsupervised', + 'G2': 'unsupervised' + } + >>> bayesian_param = {'method':"BayesianEstimator", 'bayes_prior':"K2"} + >>> clf = BayesianNetworkClassifier(edge_list, discretiser_alg, discretiser_param, bayesian_param) + >>> clf.fit(X_train, y_train) + >>> clf.predict(X_test) + array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, + 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, + 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0]) + + """ + + def __init__( + self, + list_of_edges: List[Tuple[str]], + discretiser_alg: Optional[Dict[str, str]] = None, + discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None, + probability_kwargs: Dict[str, Dict[str, Any]] = None, + return_prob: bool = False, + ): + """ + Args: + list_of_edges (list): Edge list to construct graph + - if True: return pandas dataframe with predicted probability for each state + - if False: return a 1-D prediction array + discretiser_alg (dict): Specify a supervised algorithm to discretise + each feature in the data. Available options for the dictionary values + are ['unsupervised', 'tree', 'mdlp'] + - if 'unsupervised': discretise the data using unsupervised method + - if 'tree': discretise the data using decision tree method + - if 'mdlp': discretise the data using MDLP method + discretiser_kwargs (dict): Keyword arguments for discretisation methods. + Only applicable if discretiser_alg is not None. 
+ probability_kwargs (dict): keyword arguments for the probability model + return_prob (bool): choose to return predictions or probability + + Raises: + KeyError: If an incorrect argument is passed + ValueError: If the keys in discretiser_alg and discretiser_kwargs differ + """ + + probability_kwargs = probability_kwargs or { + "method": "BayesianEstimator", + "bayes_prior": "K2", + } + + if discretiser_alg is None: + logging.info( + "No discretiser algorithm was given " + "The training data will not be discretised" + ) + discretiser_alg = {} + + discretiser_kwargs = discretiser_kwargs or {} + + self._validate_discretiser(discretiser_alg, discretiser_kwargs) + + self.structure = StructureModel(list_of_edges) + self.bn = BayesianNetwork(self.structure) + self.return_prob = return_prob + self.probability_kwargs = probability_kwargs + self.discretiser_kwargs = discretiser_kwargs + self.discretiser_alg = discretiser_alg + self._target_name = None + self._discretise_data = None + + @staticmethod + def _validate_discretiser(discretiser_alg, discretiser_kwargs): + unavailable_discretiser_algs = { + k: v not in ["unsupervised", "tree", "mdlp"] + for k, v in discretiser_alg.items() + } + + if any(unavailable_discretiser_algs.values()): + raise KeyError( + "Some discretiser algorithms are not supported: `{:}`. " + "Please choose in ['unsupervised', 'tree', 'mdlp']".format( + { + k: discretiser_alg[k] + for k, v in unavailable_discretiser_algs.items() + if v + } + ) + ) + + if set(discretiser_kwargs) != set(discretiser_alg): + raise ValueError( + "discretiser_alg and discretiser_kwargs should have the same keys" + ) + + def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Helper method to discretise input data using parameters in + `discretiser_kwargs` and `discretiser_alg`. 
+ The splitting thresholds are extracted from the training data + + Args: + X (pd.DataFrame): a dataframe to be discretised + + Returns: + a discretised version of the input dataframe + """ + + X = X.copy() + + for col in self.discretiser_alg.keys(): + + if self.discretiser_alg[col] == "unsupervised": + + if self.discretiser_kwargs[col]["method"] == "fixed": + X[col] = Discretiser(**self.discretiser_kwargs[col]).transform( + X[col].values + ) + else: + discretiser = Discretiser(**self.discretiser_kwargs[col]).fit( + self._discretise_data[col].values + ) + X[col] = discretiser.transform(X[col].values) + + else: + if self.discretiser_alg[col] == "tree": + discretiser = DecisionTreeSupervisedDiscretiserMethod( + mode="single", tree_params=self.discretiser_kwargs[col] + ) + + elif self.discretiser_alg[col] == "mdlp": + discretiser = MDLPSupervisedDiscretiserMethod( + self.discretiser_kwargs[col] + ) + + discretiser.fit( + dataframe=self._discretise_data, + feat_names=[col], + target=self._target_name, + target_continuous=False, + ) + + X[col] = discretiser.transform(X[[col]]) + + return X + + def fit(self, X: pd.DataFrame, y: pd.Series) -> "BayesianNetworkClassifier": + """ + Build a Bayesian Network classifier from a set of training data. + The method first discretises the feature using parameters in `discretiser_kwargs` + and `discretiser_alg`. Next, it learns all the possible nodes that each feature + can have. Finally, it learns the CPDs of the Bayesian Network. 
+ + Args: + X (pd.DataFrame): input training data + y (pd.Series): categorical label for each row of X + + Returns: + self + """ + self._discretise_data = X.copy() + self._discretise_data[y.name] = y + self._target_name = y.name + X = self._discretise_features(X) + + X[y.name] = y + self.bn = self.bn.fit_node_states(X) + self.bn = self.bn.fit_cpds(X, **self.probability_kwargs) + + return self + + def predict(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]: + """ + Return predictions for the input data + + Args: + X (pd.DataFrame): A dataframe of shape (num_row, num_features) for model to predict + + Returns: + Model's prediction: A numpy array of shape (num_row,) + + Raises: + ValueError: if CPDs are empty + + """ + if self.bn.cpds == {}: + raise ValueError("No CPDs found. The model has not been fitted") + + X = self._discretise_features(X) + + if self.return_prob: + pred = self.bn.predict_probability(X, self._target_name) + else: + pred = self.bn.predict(X, self._target_name).to_numpy().reshape(-1) + + return pred diff --git a/tests/conftest.py b/tests/conftest.py index 9996b06..03702a5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,7 +32,9 @@ import pandas as pd import pytest from pgmpy.models import BayesianModel +from sklearn.datasets import load_iris +from causalnex.discretiser import Discretiser from causalnex.network import BayesianNetwork from causalnex.structure import StructureModel from causalnex.structure.notears import from_pandas @@ -1034,3 +1036,37 @@ def adjacency_mat_num_stability() -> np.ndarray: ] ) return W + + +@pytest.fixture +def iris_test_data() -> pd.DataFrame: + """ + Iris dataset to test sklearn wrappers + """ + iris = load_iris() + X, y = iris["data"], iris["target"] + names = iris["feature_names"] + df = pd.DataFrame(X, columns=names) + df["type"] = y + df["sepal length (cm)"] = Discretiser( + method="quantile", num_buckets=3 + ).fit_transform(df["sepal length (cm)"].values) + + return df + + +@pytest.fixture 
+def iris_edge_list(): + """ + Edge list to construct bayesian network for iris data + """ + edge_list = [ + ("sepal width (cm)", "sepal length (cm)"), + ("petal length (cm)", "sepal length (cm)"), + ("petal length (cm)", "sepal width (cm)"), + ("petal width (cm)", "petal length (cm)"), + ("type", "sepal width (cm)"), + ("type", "petal width (cm)"), + ] + + return edge_list diff --git a/tests/discretiser/test_decision_tree.py b/tests/discretiser/test_decision_tree.py index a246e58..a688722 100644 --- a/tests/discretiser/test_decision_tree.py +++ b/tests/discretiser/test_decision_tree.py @@ -178,7 +178,6 @@ def test_keep_unselected_feature(self, get_iris_data): target_continuous=False, target="target", ) - output = tree_discretiser.transform(get_iris_data[["sepal width (cm)"]]).values assert (ground_truth == output.reshape(-1, 15)).all() diff --git a/tests/test_network_model.py b/tests/test_network_model.py new file mode 100644 index 0000000..be1075f --- /dev/null +++ b/tests/test_network_model.py @@ -0,0 +1,298 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. 
The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import pytest + +from causalnex.network.sklearn import BayesianNetworkClassifier + + +class TestBayesianCPDs: + def test_default_params(self): + edge_list = [ + ("b", "a"), + ("b", "c"), + ("d", "a"), + ("d", "c"), + ("d", "b"), + ("e", "c"), + ("e", "b"), + ] + clf = BayesianNetworkClassifier(edge_list) + params = clf.get_params() + assert params["discretiser_alg"] == {} + assert params["probability_kwargs"]["method"] == "BayesianEstimator" + assert params["probability_kwargs"]["bayes_prior"] == "K2" + assert params["discretiser_kwargs"] == {} + + def test_predict_quantile(self, iris_test_data, iris_edge_list): + df = iris_test_data.copy() + ground_truth = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 1, 1, 1, 2, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [2, 1, 2, 1, 1, 1, 1, 2, 1, 1], + [1, 1, 1, 2, 1, 2, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 2, 2, 2], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [2, 2, 2, 2, 2, 2, 1, 2, 2, 2], + [2, 2, 2, 2, 2, 2, 2, 2, 1, 2], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ] + ) + + discretiser_params = { + "sepal width (cm)": {"method": "quantile", "num_buckets": 3}, + "petal length (cm)": {"method": "quantile", "num_buckets": 3}, + "petal width (cm)": {"method": "quantile", "num_buckets": 3}, + } + + label = df["sepal length (cm)"] + 
df.drop(["sepal length (cm)"], axis=1, inplace=True) + clf = BayesianNetworkClassifier( + iris_edge_list, + discretiser_kwargs=discretiser_params, + discretiser_alg={ + "sepal width (cm)": "unsupervised", + "petal length (cm)": "unsupervised", + "petal width (cm)": "unsupervised", + }, + ) + clf.fit(df, label) + output = clf.predict(df) + assert np.array_equal(output.reshape(15, -1), ground_truth) + + def test_predict_fixed(self, iris_test_data, iris_edge_list): + df = iris_test_data.copy() + + ground_truth = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 1, 1, 1, 2, 0, 1, 1], + [0, 2, 1, 1, 0, 2, 2, 1, 1, 1], + [2, 1, 1, 1, 1, 2, 1, 2, 1, 0], + [1, 1, 1, 1, 2, 2, 2, 1, 2, 1], + [1, 2, 1, 0, 1, 2, 1, 1, 0, 1], + [2, 1, 2, 1, 2, 2, 1, 1, 1, 2], + [2, 1, 2, 1, 1, 2, 2, 2, 1, 1], + [2, 1, 1, 1, 2, 2, 1, 2, 1, 2], + [1, 2, 1, 1, 1, 2, 2, 2, 2, 2], + [2, 2, 1, 2, 2, 2, 1, 2, 2, 2], + ] + ) + + discretiser_params = { + "sepal width (cm)": {"method": "fixed", "numeric_split_points": [3]}, + "petal length (cm)": {"method": "fixed", "numeric_split_points": [3.7]}, + "petal width (cm)": {"method": "fixed", "numeric_split_points": [1.2]}, + } + + label = df["sepal length (cm)"] + df.drop(["sepal length (cm)"], axis=1, inplace=True) + clf = BayesianNetworkClassifier( + iris_edge_list, + discretiser_kwargs=discretiser_params, + discretiser_alg={ + "sepal width (cm)": "unsupervised", + "petal length (cm)": "unsupervised", + "petal width (cm)": "unsupervised", + }, + ) + clf.fit(df, label) + output = clf.predict(df) + assert np.array_equal(output.reshape(15, -1), ground_truth) + + def test_return_probability(self, iris_test_data, iris_edge_list): + df = iris_test_data.copy() + + discretiser_params = { + "sepal width (cm)": {"method": "fixed", "numeric_split_points": [3]}, + "petal length (cm)": {"method": "fixed", 
"numeric_split_points": [3.7]}, + "petal width (cm)": {"method": "fixed", "numeric_split_points": [1.2]}, + } + + label = df["sepal length (cm)"] + df.drop(["sepal length (cm)"], axis=1, inplace=True) + clf = BayesianNetworkClassifier( + iris_edge_list, + discretiser_kwargs=discretiser_params, + discretiser_alg={ + "sepal width (cm)": "unsupervised", + "petal length (cm)": "unsupervised", + "petal width (cm)": "unsupervised", + }, + return_prob=True, + ) + clf.fit(df, label) + output = clf.predict(df.iloc[0:1]) + assert len(list(output)) == 3 + assert math.isclose( + output["sepal length (cm)_0"].values, 0.764706, abs_tol=1e-3 + ) + assert math.isclose( + output["sepal length (cm)_1"].values, 0.215686, abs_tol=1e-3 + ) + + def test_no_fit(self, iris_test_data, iris_edge_list): + df = iris_test_data.copy() + df.drop(["sepal length (cm)"], axis=1, inplace=True) + clf = BayesianNetworkClassifier(iris_edge_list) + with pytest.raises( + ValueError, + match="No CPDs found. The model has not been fitted", + ): + clf.predict(df) + + def test_dt_discretiser(self, iris_test_data, iris_edge_list): + df = iris_test_data.copy() + ground_truth = np.array( + [ + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [2, 2, 2, 1, 1, 1, 2, 0, 1, 1], + [0, 1, 1, 1, 1, 2, 1, 1, 1, 1], + [2, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 2, 1, 1, 1], + [1, 1, 1, 0, 1, 1, 1, 1, 0, 1], + [2, 1, 2, 2, 2, 2, 1, 2, 2, 2], + [2, 2, 2, 1, 1, 2, 2, 2, 2, 1], + [2, 1, 2, 1, 2, 2, 1, 1, 2, 2], + [2, 2, 2, 1, 2, 2, 2, 2, 1, 2], + [2, 2, 1, 2, 2, 2, 1, 2, 2, 1], + ] + ) + supervised_param = { + "sepal width (cm)": {"max_depth": 2, "random_state": 2020}, + "petal length (cm)": {"max_depth": 2, "random_state": 2020}, + "petal width (cm)": {"max_depth": 2, "random_state": 2020}, + } + + label = df["sepal length (cm)"] + df.drop(["sepal length (cm)"], axis=1, inplace=True) + clf = 
BayesianNetworkClassifier( + iris_edge_list, + discretiser_kwargs=supervised_param, + discretiser_alg={ + "sepal width (cm)": "tree", + "petal length (cm)": "tree", + "petal width (cm)": "tree", + }, + ) + clf.fit(df, label) + output = clf.predict(df) + assert np.array_equal(output.reshape(15, -1), ground_truth) + + def test_mdlp_discretiser(self, iris_test_data, iris_edge_list): + df = iris_test_data.copy() + ground_truth = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0, 1, 0], + [0, 1, 1, 1, 0, 2, 1, 1, 1, 0], + [2, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 2, 1, 1, 2, 1, 1, 1], + [1, 1, 1, 0, 1, 1, 1, 1, 0, 1], + [2, 2, 2, 2, 2, 2, 1, 2, 2, 2], + [2, 2, 2, 1, 2, 2, 2, 2, 2, 1], + [2, 1, 2, 1, 2, 2, 1, 1, 2, 2], + [2, 2, 2, 2, 2, 2, 2, 2, 1, 2], + [2, 2, 2, 2, 2, 2, 1, 2, 2, 2], + ] + ) + supervised_param = { + "sepal width (cm)": {"min_depth": 0, "random_state": 2020}, + "petal length (cm)": {"min_depth": 0, "random_state": 2020}, + "petal width (cm)": {"min_depth": 0, "random_state": 2020}, + } + label = df["sepal length (cm)"] + df.drop(["sepal length (cm)"], axis=1, inplace=True) + clf = BayesianNetworkClassifier( + iris_edge_list, + discretiser_alg={ + "sepal width (cm)": "mdlp", + "petal length (cm)": "mdlp", + "petal width (cm)": "mdlp", + }, + discretiser_kwargs=supervised_param, + ) + clf.fit(df, label) + output = clf.predict(df) + assert np.array_equal(output.reshape(15, -1), ground_truth) + + def test_invalid_algorithm(self, iris_edge_list): + + with pytest.raises( + KeyError, match="Some discretiser algorithms are not supported" + ): + BayesianNetworkClassifier( + iris_edge_list, + discretiser_alg={ + "sepal width (cm)": "invalid", + "petal length (cm)": "invalid", + "petal width (cm)": "mdlp", + }, + ) + + def test_missing_kwargs(self, iris_edge_list): + supervised_param = { + "sepal width 
(cm)": {"min_depth": 0, "random_state": 2020}, + "petal length (cm)": {"min_depth": 0, "random_state": 2020}, + } + discretiser_alg = { + "sepal width (cm)": "tree", + "petal length (cm)": "tree", + "petal width (cm)": "mdlp", + } + with pytest.raises( + ValueError, + match="discretiser_alg and discretiser_kwargs should have the same keys", + ): + BayesianNetworkClassifier( + iris_edge_list, + discretiser_alg=discretiser_alg, + discretiser_kwargs=supervised_param, + ) From c9f306ec4571dbb61586ee97f39c80612eb861e6 Mon Sep 17 00:00:00 2001 From: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> Date: Tue, 11 May 2021 17:54:22 +0100 Subject: [PATCH 3/4] add MANIFEST.in (#160) * add MANIFEST.in * newline * add release note Co-authored-by: philip_pilgerstorfer --- MANIFEST.in | 5 +++++ RELEASE.md | 1 + 2 files changed, 6 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..3f30d1b --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include README.md +include LICENSE.md +include legal_header.txt +include requirements.txt +include test_requirements.txt diff --git a/RELEASE.md b/RELEASE.md index f900934..16ddf87 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -7,6 +7,7 @@ * Add utility function to extract Markov blanket from a Bayesian Network * Support receiving a list of inputs for `InferenceEngine` with a multiprocessing option * Add supervised discretisation strategies using Decision Tree and MDLP algorithms +* Added manifest files to ensure requirements and licenses are packaged # Release 0.9.2 * Remove Boston housing dataset from "sklearn tutorial", see #91 for more information. 
From d5ef28f1a12f3e6c74db6a45e5d8d1a6288d09a3 Mon Sep 17 00:00:00 2001 From: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> Date: Tue, 11 May 2021 18:47:46 +0100 Subject: [PATCH 4/4] Unofficial python 3.9 support (#161) Co-authored-by: philip_pilgerstorfer --- RELEASE.md | 2 ++ causalnex/network/sklearn/models.py | 3 ++- causalnex/structure/pytorch/sklearn/_base.py | 16 ++++++++-------- causalnex/structure/pytorch/sklearn/clf.py | 6 +++--- causalnex/structure/pytorch/sklearn/reg.py | 4 ++-- causalnex/utils/network_utils.py | 2 +- requirements.txt | 6 +++--- tests/structure/test_sklearn.py | 20 ++++++++++++++++++++ 8 files changed, 41 insertions(+), 18 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 16ddf87..eeb3ac1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -8,6 +8,8 @@ * Support receiving a list of inputs for `InferenceEngine` with a multiprocessing option * Add supervised discretisation strategies using Decision Tree and MDLP algorithms * Added manifest files to ensure requirements and licenses are packaged +* Fix estimator issues with sklearn ("unofficial python 3.9 support", doesn't work with `discretiser` option) +* Minor bumps in dependency versions, remove prettytable as dependency # Release 0.9.2 * Remove Boston housing dataset from "sklearn tutorial", see #91 for more information. 
diff --git a/causalnex/network/sklearn/models.py b/causalnex/network/sklearn/models.py index 3e76408..98f925e 100644 --- a/causalnex/network/sklearn/models.py +++ b/causalnex/network/sklearn/models.py @@ -158,7 +158,8 @@ def __init__( self._validate_discretiser(discretiser_alg, discretiser_kwargs) - self.structure = StructureModel(list_of_edges) + self.list_of_edges = list_of_edges + self.structure = StructureModel(self.list_of_edges) self.bn = BayesianNetwork(self.structure) self.return_prob = return_prob self.probability_kwargs = probability_kwargs diff --git a/causalnex/structure/pytorch/sklearn/_base.py b/causalnex/structure/pytorch/sklearn/_base.py index 3c86528..adbee21 100644 --- a/causalnex/structure/pytorch/sklearn/_base.py +++ b/causalnex/structure/pytorch/sklearn/_base.py @@ -76,7 +76,7 @@ def __init__( enforce_dag: bool = False, standardize: bool = False, target_dist_type: str = None, - **kwargs, + notears_mlp_kwargs: Dict = None, ): """ Args: @@ -120,7 +120,7 @@ def __init__( The L-BFGS algorithm used to fit the underlying NOTEARS works best on data all of the same scale so this parameter is reccomended. - kwargs: Extra arguments passed to the NOTEARS from_pandas function. + notears_mlp_kwargs: Additional arguments for the NOTEARS MLP model. target_dist_type: The distribution type of the target. Uses the same aliases as dist_type_schema. 
@@ -162,8 +162,8 @@ def __init__( self.tabu_edges = tabu_edges self.tabu_parent_nodes = tabu_parent_nodes self.tabu_child_nodes = tabu_child_nodes - self._target_dist_type = target_dist_type - self.kwargs = kwargs + self.target_dist_type = target_dist_type + self.notears_mlp_kwargs = notears_mlp_kwargs # sklearn wrapper paramters self.dependent_target = dependent_target @@ -206,14 +206,14 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray ) # if its a continuous target also standardize - if self._target_dist_type == "cont": + if self.target_dist_type == "cont": y = y.copy() self._ss_y = StandardScaler() y[:] = self._ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1) # add the target to the dist_type_schema # NOTE: this must be done AFTER standardize - dist_type_schema[y.name] = self._target_dist_type + dist_type_schema[y.name] = self.target_dist_type # preserve the feature and target colnames self._features = tuple(X.columns) @@ -242,7 +242,7 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray tabu_parent_nodes=tabu_parent_nodes, tabu_child_nodes=self.tabu_child_nodes, use_bias=self.fit_intercept, - **self.kwargs, + **(self.notears_mlp_kwargs or {}), ) # keep thresholding until the DAG constraint is enforced @@ -284,7 +284,7 @@ def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: y_pred = target_dist_type.get_columns(X_hat) # inverse-standardize - if self.standardize and self._target_dist_type == "cont": + if self.standardize and self.target_dist_type == "cont": y_pred = self._ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1) return y_pred diff --git a/causalnex/structure/pytorch/sklearn/clf.py b/causalnex/structure/pytorch/sklearn/clf.py index a1a07a5..752e158 100644 --- a/causalnex/structure/pytorch/sklearn/clf.py +++ b/causalnex/structure/pytorch/sklearn/clf.py @@ -83,7 +83,7 @@ def fit( Fits the sm model using the concat of X and y. 
Raises: - NotImplementedError: If unsupported _target_dist_type provided. + NotImplementedError: If unsupported target_dist_type provided. ValueError: If less than 2 classes provided. Returns: @@ -109,8 +109,8 @@ def fit( ) # store the protected attr _target_dist_type - if self._target_dist_type is None: - self._target_dist_type = "cat" if n_classes > 2 else "bin" + if self.target_dist_type is None: + self.target_dist_type = "cat" if n_classes > 2 else "bin" # fit the NOTEARS model super().fit(X, y) diff --git a/causalnex/structure/pytorch/sklearn/reg.py b/causalnex/structure/pytorch/sklearn/reg.py index e4a8227..2d80358 100644 --- a/causalnex/structure/pytorch/sklearn/reg.py +++ b/causalnex/structure/pytorch/sklearn/reg.py @@ -86,8 +86,8 @@ def fit( """ # store the protected attr _target_dist_type - if self._target_dist_type is None: - self._target_dist_type = "cont" + if self.target_dist_type is None: + self.target_dist_type = "cont" # fit the NOTEARS model super().fit(X, y) diff --git a/causalnex/utils/network_utils.py b/causalnex/utils/network_utils.py index ac19c06..8d33ca6 100644 --- a/causalnex/utils/network_utils.py +++ b/causalnex/utils/network_utils.py @@ -26,7 +26,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" -This module contains the helpfer functions for interacting with Bayesian Network +This module contains the helper functions for interacting with Bayesian Network """ from copy import deepcopy diff --git a/requirements.txt b/requirements.txt index b4c6a17..50752ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,9 +2,9 @@ networkx~=2.5 numpy>=1.14.2, <2.0 pandas>=1.0, <2.0 pathos>=0.2.7, <0.3.0 -pgmpy>=0.1.12,<0.2.0 -prettytable>=0.7.2, <0.8 -scikit-learn>=0.20.2, <0.23.0, !=0.22.2.post1 +pgmpy>=0.1.12, <0.2.0 +scikit-learn>=0.22.0, <0.25.0, !=0.22.2.post1, !=0.24.1; python_version < '3.9' +scikit-learn>=0.24.0, <0.25.0, !=0.24.1; python_version == '3.9' scipy>=1.2.0, <1.6 torch~=1.7 wrapt>=1.11.0, <1.12 diff --git a/tests/structure/test_sklearn.py b/tests/structure/test_sklearn.py index 21790c3..a28c2ef 100644 --- a/tests/structure/test_sklearn.py +++ b/tests/structure/test_sklearn.py @@ -243,6 +243,16 @@ def test_feature_importances(self, hidden_layer_units): # assert that the sign of the coefficient is positive for both nonlinear and linear cases assert coef_["true_feat"] > 0 + def test_sklearn_compatibility_reg(self): + reg = DAGRegressor( + alpha=0.0, + fit_intercept=True, + dependent_target=True, + hidden_layer_units=[0], + standardize=True, + ) + reg.get_params(deep=True) + @pytest.mark.parametrize( "standardize", [ @@ -463,6 +473,16 @@ def test_glm(self, target_dist_type, y): clf.fit(X, y) clf.predict(X) + def test_sklearn_compatibility_clf(self): + clf = DAGClassifier( + alpha=0.0, + fit_intercept=True, + dependent_target=True, + hidden_layer_units=[0], + standardize=True, + ) + clf.get_params(deep=True) + @pytest.mark.parametrize("hidden_layer_units", [None, [1], [5], [5, 3], [10, 10]]) def test_independent_predictions(hidden_layer_units):