Merge pull request #111 from quantumblacklabs/release/0.10.0

Release/0.10.0
mckinsey · May 11, 2021 · b6a399f · b6a399f
2 parents b64dab0 + 2f57e47
commit b6a399f
Show file tree

Hide file tree

Showing 30 changed files with 2,058 additions and 32 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -31,7 +31,7 @@ repos:
  exclude: ^causalnex/ebaybbn
 
 - repo: https://github.com/psf/black
- rev: stable
+ rev: 20.8b1
  hooks:
  - id: black
 

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,5 @@
+include README.md
+include LICENSE.md
+include legal_header.txt
+include requirements.txt
+include test_requirements.txt
diff --git a/RELEASE.md b/RELEASE.md
@@ -1,5 +1,16 @@
 # Upcoming release
 
+# Release 0.10.0
+* Add supervised discretisation strategies using Decision Tree and MDLP algorithms.
+* Add `BayesianNetworkClassifier`, an sklearn-compatible class for fitting and predicting probabilities in a BN.
+* Fixes cyclical import of `causalnex.plots`, as per #106.
+* Add utility function to extract Markov blanket from a Bayesian Network
+* Support receiving a list of inputs for `InferenceEngine` with a multiprocessing option
+* Add supervised discretisation strategies using Decision Tree and MDLP algorithms
+* Added manifest files to ensure requirements and licenses are packaged
+* Fix estimator issues with sklearn ("unofficial python 3.9 support", doesn't work with `discretiser` option)
+* Minor bumps in dependency versions, remove prettytable as dependency
+
 # Release 0.9.2
 * Remove Boston housing dataset from "sklearn tutorial", see #91 for more information.
 * Update pylint version to 2.7

diff --git a/causalnex/__init__.py b/causalnex/__init__.py
@@ -30,6 +30,6 @@
 causalnex toolkit for causal reasoning (Bayesian Networks / Inference)
 """
 
-__version__ = "0.9.2"
+__version__ = "0.10.0"
 
 __all__ = ["structure", "discretiser", "evaluation", "inference", "network", "plots"]
diff --git a/causalnex/discretiser/__init__.py b/causalnex/discretiser/__init__.py
@@ -30,6 +30,14 @@
 ``causalnex.discretiser`` provides functionality to discretise data.
 """
 
-__all__ = ["Discretiser"]
+__all__ = [
+ "Discretiser",
+ "DecisionTreeSupervisedDiscretiserMethod",
+ "MDLPSupervisedDiscretiserMethod",
+]
 
 from .discretiser import Discretiser
+from .discretiser_strategy import (
+ DecisionTreeSupervisedDiscretiserMethod,
+ MDLPSupervisedDiscretiserMethod,
+)
diff --git a/causalnex/discretiser/abstract_discretiser.py b/causalnex/discretiser/abstract_discretiser.py
@@ -0,0 +1,114 @@
+# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
+# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
+# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
+# (either separately or in combination, "QuantumBlack Trademarks") are
+# trademarks of QuantumBlack. The License does not grant you any right or
+# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
+# Trademarks or any confusingly similar mark as a trademark for your product,
+# or use the QuantumBlack Trademarks in any other manner that might cause
+# confusion in the marketplace, including but not limited to in advertising,
+# on websites, or on software.
+#
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tools to help discretise data."""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import List
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator
+
+
+class AbstractSupervisedDiscretiserMethod(BaseEstimator, ABC):
+    """
+    Base class for advanced discretisation methods.
+
+    Instances carry two pieces of state that `transform` relies on:
+
+    - `map_thresholds`: maps a column name to the split points handed to
+      `np.digitize` for that column.
+    - `feat_names`: the feature names the discretiser is responsible for;
+      `None` until assigned (presumably by a subclass `fit` - confirm
+      against the concrete strategies).
+    """
+
+    def __init__(self):
+        self.map_thresholds = {}
+        self.feat_names = None
+
+    @abstractmethod
+    def fit(
+        self,
+        feat_names: List[str],
+        target: str,
+        dataframe: pd.DataFrame,
+        target_continuous: bool,
+    ):
+        """
+        Discretise the features in `feat_names` in such a way that maximises the prediction of `target`.
+
+        Args:
+            feat_names (List[str]): List of feature names to be discretised.
+            target (str): Name of the target variable - the node that adjusts how `feat_names` will be discretised
+            dataframe: The full dataset prior to discretisation.
+            target_continuous (bool): Boolean indicates if target variable is continuous
+        Raises:
+            NotImplementedError: AbstractSupervisedDiscretiserMethod should not be called directly
+
+        """
+        raise NotImplementedError("The method is not implemented")
+
+    def _transform_one_column(self, dataframe_one_column: pd.DataFrame) -> np.ndarray:
+        """
+        Given one "original" feature (continuous), discretise it.
+
+        A column with an entry in `map_thresholds` is binned with `np.digitize`.
+        Any other column is returned unchanged; a warning is logged when the
+        column is not listed in `feat_names`.
+
+        Args:
+            dataframe_one_column: dataframe with a single continuous feature, to be transformed into discrete
+        Returns:
+            Discrete feature, as an np.ndarray of shape (len(df),)
+        """
+        column = dataframe_one_column.columns[0]
+        values = dataframe_one_column.values.reshape(-1)
+
+        if column in self.map_thresholds:
+            return np.digitize(values, self.map_thresholds[column])
+
+        # feat_names is None until it has been assigned. Treat that as "no known
+        # features" so an unfitted instance warns and passes the column through
+        # instead of failing with "argument of type 'NoneType' is not iterable".
+        known_features = () if self.feat_names is None else self.feat_names
+        if column not in known_features:
+            logging.warning(
+                "%s is not in feat_names. The column is left unchanged", column
+            )
+        return values
+
+    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Given one "original" dataframe, discretise it.
+
+        Every column of `data` is passed through `_transform_one_column` and
+        the resulting arrays are reassembled under the same column names.
+        The frame is rebuilt from plain arrays, so it carries a default index
+        rather than the index of `data`.
+
+        Args:
+            data: dataframe with continuous features, to be transformed into discrete
+        Returns:
+            discretised version of the input data
+        """
+        outputs = {col: self._transform_one_column(data[[col]]) for col in data.columns}
+        return pd.DataFrame.from_dict(outputs)
+
+    def fit_transform(self, *args, **kwargs):
+        """
+        Not supported; `fit` and `transform` must be called separately.
+
+        Raises:
+            NotImplementedError: fit_transform is not implemented
+        """
+        raise NotImplementedError(
+            "fit_transform is not implemented. Please use .fit() and .transform() separately"
+        )