Add sklearn binary classifier (mckinsey#90)

* binary dtype folder and __init__ * dtype base class * continuous dtype * binary dtype * update core * make plural * update interface for idx * minor variable name change * notears update * python 3.5 support * fix fstring * remove categorical methods, docstrings * formatting and docstrings * remove redundant cat code * isort * indexerror check * defensive check tests * datatype loss tests * more test coverage * more tests and formatting * fix test import * remove double test * linting * docstring and pylint * docstring fix Co-authored-by: Zain Patel <zain.patel@quantumblack.com> * fix long string Co-authored-by: Zain Patel <zain.patel@quantumblack.com> * docstring fix Co-authored-by: Zain Patel <zain.patel@quantumblack.com> * remove relative imports * docstring fix * dict comprehension * list comprehension and neatness * remove unused import to __init__ * fix test * remove unused return interface * new sklearn folder structure * sklearn class outline * new dtype interface * docstring clarification * inverse link function * add binary f1score tests * one datatype instance per feature * rename dtype -> disttype, attach dists to nodes * fix tests * fix linting * fix preserve node dtyper * fix tests * fix tests * fix tests * final docstring and test fixes * lint fix * test_fix, warning * linting * fix test * fix tests * reduce threshold of test * docstring clarification * _target_dist_type injection * docstring updates + clf fit outline * old docstring deprecation * clf predict_proba and predict * return bugfix * docstring update * import fix, linting, clf fit finished * args docstring and None schema * raise better error * black linting * linting * revert to public interface * remove warning * remove useless suppression and import * add useless change to resolve merge conflict * update inits * standardization and data reconstruction * remove unused imports * fix clf .predict() * regressor fit_predict * remove useless regressor predict * test import fix * fix warnings * 
pass series name thru * fix schema pass thru * better dict comprehension Co-authored-by: Zain Patel <zain.patel@quantumblack.com> * import and comment fixes * update to .format() * fix sklearn is fitted test * more dtype schema insertion * DAGRegressor test fix * dag regressor test * more linting * big test restructure * combined test suite * error string update * more test coverage * linting and isort * move test to combined test * return float64 preds * moar clf tests * remove untestable (multiclass) code * class number error test * black reformat * docstrings, pylint * fix test bug * standard scaler for _base * pull classes direct from LabelEncoder * update tutorial Co-authored-by: Zain Patel <zain.patel@quantumblack.com>
vishalbelsare · Sep 18, 2020 · 04bceb4 · 04bceb4
1 parent 1701a45
commit 04bceb4
Show file tree

Hide file tree

Showing 13 changed files with 862 additions and 187 deletions.
diff --git a/causalnex/structure/__init__.py b/causalnex/structure/__init__.py
@@ -30,7 +30,14 @@
 ``causalnex.structure`` provides functionality to define or learn structure.
 """
 
-__all__ = ["StructureModel", "notears", "dynotears", "data_generators", "DAGRegressor"]
+__all__ = [
+ "StructureModel",
+ "notears",
+ "dynotears",
+ "data_generators",
+ "DAGRegressor",
+ "DAGClassifier",
+]
 
-from .sklearn import DAGRegressor
+from .pytorch import DAGClassifier, DAGRegressor
 from .structuremodel import StructureModel
diff --git a/causalnex/structure/pytorch/__init__.py b/causalnex/structure/pytorch/__init__.py
@@ -30,7 +30,8 @@
 ``causalnex.structure.pytorch`` provides functionality to define or learn structure using pytorch.
 """
 
-__all__ = ["from_numpy", "from_pandas", "NotearsMLP"]
+__all__ = ["from_numpy", "from_pandas", "NotearsMLP", "DAGRegressor", "DAGClassifier"]
 
 from .core import NotearsMLP
 from .notears import from_numpy, from_pandas
+from .sklearn import DAGClassifier, DAGRegressor
diff --git a/causalnex/structure/pytorch/core.py b/causalnex/structure/pytorch/core.py
@@ -181,6 +181,31 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # [n, d] -> [n, d]
  x = x.squeeze(dim=2) # [n, d]
  return x
 
+ def reconstruct_data(self, X: np.ndarray) -> np.ndarray:
+ """
+ Performs X_hat reconstruction,
+ then converts latent space to original data space via link function.
+
+ Args:
+ X: input data used to reconstruct
+
+ Returns:
+ reconstructed data
+ """
+
+ with torch.no_grad():
+ # convert the predict data to pytorch tensor
+ X = torch.from_numpy(X).float().to(self.device)
+
+ # perform forward reconstruction
+ X_hat = self(X)
+
+ # recover each one of the latent space projections
+ for dist_type in self.dist_types:
+ X_hat = dist_type.inverse_link_function(X_hat)
+
+ return np.asarray(X_hat.cpu().detach().numpy().astype(np.float64))
+
  @property
  def bias(self) -> Union[np.ndarray, None]:
  """

diff --git a/causalnex/structure/pytorch/dist_type/_base.py b/causalnex/structure/pytorch/dist_type/_base.py
@@ -61,3 +61,19 @@ def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor:
  Scalar pytorch tensor of the reconstruction loss between X and X_hat.
  """
  raise NotImplementedError("Must implement the loss() method")
+
+ @abstractmethod
+ def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor:
+ """
+ Convert the transformed data from the latent space to the original dtype
+ using the inverse link function.
+
+ Args:
+ X_hat: Reconstructed data in the latent space.
+
+ Returns:
+ Modified X_hat.
+ MUST be same shape as passed in data.
+ Projects the self.idx column from the latent space to the dist_type space.
+ """
+ raise NotImplementedError("Must implement the inverse_link_function() method")
diff --git a/causalnex/structure/pytorch/dist_type/binary.py b/causalnex/structure/pytorch/dist_type/binary.py
@@ -58,6 +58,20 @@ def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor:
  return nn.functional.binary_cross_entropy_with_logits(
  input=X_hat[:, self.idx],
  target=X[:, self.idx],
- reduce=True,
  reduction="mean",
  )
+
+ def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor:
+ """
+ Inverse-logit (sigmoid) inverse link function for binary data.
+
+ Args:
+ X_hat: Reconstructed data in the latent space.
+
+ Returns:
+ Modified X_hat.
+ MUST be same shape as passed in data.
+ Projects the self.idx column from the latent space to the dist_type space.
+ """
+ X_hat[:, self.idx] = torch.sigmoid(X_hat[:, self.idx])
+ return X_hat
diff --git a/causalnex/structure/pytorch/dist_type/continuous.py b/causalnex/structure/pytorch/dist_type/continuous.py
@@ -54,3 +54,17 @@ def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor:
  return (0.5 / X.shape[0]) * torch.sum(
  (X_hat[:, self.idx] - X[:, self.idx]) ** 2
  )
+
+ def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor:
+ """
+ Identity inverse link function for continuous data.
+
+ Args:
+ X_hat: Reconstructed data in the latent space.
+
+ Returns:
+ Modified X_hat.
+ MUST be same shape as passed in data.
+ Projects the self.idx column from the latent space to the dist_type space.
+ """
+ return X_hat
diff --git a/causalnex/structure/pytorch/sklearn/__init__.py b/causalnex/structure/pytorch/sklearn/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
+# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
+# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
+# (either separately or in combination, "QuantumBlack Trademarks") are
+# trademarks of QuantumBlack. The License does not grant you any right or
+# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
+# Trademarks or any confusingly similar mark as a trademark for your product,
+# or use the QuantumBlack Trademarks in any other manner that might cause
+# confusion in the marketplace, including but not limited to in advertising,
+# on websites, or on software.
+#
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+``causalnex.structure.pytorch.sklearn`` provides sklearn style functionality to NOTEARS.
+"""
+
+__all__ = ["DAGRegressor", "DAGClassifier"]
+
+from .clf import DAGClassifier
+from .reg import DAGRegressor
diff --git a/causalnex/structure/sklearn.py → causalnex/structure/pytorch/sklearn/_base.py b/causalnex/structure/sklearn.py → causalnex/structure/pytorch/sklearn/_base.py
@@ -26,57 +26,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-This module contains the implementation of ``DAGRegressor``.
+This module contains the implementation of ``DAGBase``.
 
-``DAGRegressor`` is a class which wraps the StructureModel in an sklearn interface for regression.
+``DAGBase`` is a class which provides an interface and common function for sklearn style NOTEARS functions.
 """
-
 import copy
 import warnings
-from typing import Iterable, List, Union
+from abc import ABCMeta, abstractmethod
+from typing import Dict, Iterable, List, Union
 
 import numpy as np
 import pandas as pd
-import torch
-from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.base import BaseEstimator
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_is_fitted, check_X_y
 
 from causalnex.plots import EDGE_STYLE, NODE_STYLE, plot_structure
 from causalnex.structure.pytorch import notears
 
 
-class DAGRegressor(
- BaseEstimator, RegressorMixin
+class DAGBase(
+ BaseEstimator, metaclass=ABCMeta
 ): # pylint: disable=too-many-instance-attributes
  """
- Regressor wrapper of the StructureModel.
+ Base class for all sklearn wrappers of the StructureModel.
  Implements the sklearn .fit and .predict interface.
- Currently only supports linear NOTEARS fitting by the DAG.
-
- Example:
- ::
- >>> from causalnex.sklearn import DAGRegressor
- >>>
- >>> smr = DAGRegressor(threshold=0.1)
- >>> smr.fit(X_train, y_train)
- >>>
- >>> y_preds = smr.predict(X_test)
- >>> type(y_preds)
- np.ndarray
- >>>
- >>> type(smr.feature_importances_)
- np.ndarray
- ::
-
- Attributes:
- feature_importances_ (np.ndarray): An array of edge weights corresponding
- positionally to the feature X.
  """
 
  # pylint: disable=too-many-arguments
  def __init__(
  self,
+ dist_type_schema: Dict[Union[str, int], str] = None,
  alpha: float = 0.0,
  beta: float = 0.0,
  fit_intercept: bool = True,
@@ -92,6 +72,12 @@ def __init__(
  ):
  """
  Args:
+ dist_type_schema: The dist type schema corresponding to the X data passed to fit or predict.
+ It maps the pandas column name in X to the string alias of a dist type.
+ If X is a np.ndarray, it maps the positional index to the string alias of a dist type.
+ A list of alias names can be found in ``dist_type/__init__.py``.
+ If None, assumes that all data in X is continuous.
+
  alpha: l1 loss weighting. When using nonlinear layers this is only applied
  to the first layer.
 
@@ -140,6 +126,7 @@ def __init__(
  self.beta = beta
  self.fit_intercept = fit_intercept
  self.hidden_layer_units = hidden_layer_units
+ self.dist_type_schema = dist_type_schema
  self.threshold = threshold
  self.tabu_edges = tabu_edges
  self.tabu_parent_nodes = tabu_parent_nodes
@@ -160,31 +147,56 @@ def __init__(
  self.enforce_dag = enforce_dag
  self.standardize = standardize
 
- def fit(
- self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]
- ) -> "DAGRegressor":
+ @abstractmethod
+ def _target_dist_type(self) -> str:
+ """
+ NOTE:
+ When extending this class override this method to return a dist_type alias
+ """
+ raise NotImplementedError("Must implement _target_dist_type()")
+
+ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]):
  """
  Fits the sm model using the concat of X and y.
  """
 
  # defensive X, y checks
  check_X_y(X, y, y_numeric=True)
 
- # force as DataFrame and Series (for later calculations)
+ # force X, y to DataFrame, Series for later calculations
  X = pd.DataFrame(X)
  y = pd.Series(y)
  # force name so that name != None (causes errors in notears)
  y.name = y.name or "__target"
 
+ # if self.dist_type_schema is None, assume all columns are continuous
+ dist_type_schema = self.dist_type_schema or {col: "cont" for col in X.columns}
+
  if self.standardize:
- self.ss_X = StandardScaler()
- self.ss_y = StandardScaler()
- X = pd.DataFrame(self.ss_X.fit_transform(X), columns=X.columns)
- y = pd.Series(
- self.ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1),
- name=y.name,
+ # only standardize the continuous dist type columns.
+ self.continuous_col_idxs = [
+ X.columns.get_loc(col)
+ for col, alias in dist_type_schema.items()
+ if alias == "cont"
+ ]
+
+ # copy X to prevent changes to underlying array data
+ X = X.copy()
+ self._ss_X = StandardScaler()
+ X.iloc[:, self.continuous_col_idxs] = self._ss_X.fit_transform(
+ X.iloc[:, self.continuous_col_idxs]
  )
 
+ # if it's a continuous target, also standardize
+ if self._target_dist_type() == "cont":
+ y = y.copy()
+ self._ss_y = StandardScaler()
+ y[:] = self._ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1)
+
+ # add the target to the dist_type_schema
+ # NOTE: this must be done AFTER standardize
+ dist_type_schema[y.name] = self._target_dist_type()
+
  # preserve the feature and target colnames
  self._features = tuple(X.columns)
  self._target = y.name
@@ -203,6 +215,7 @@ def fit(
  # fit the structured model
  self.graph_ = notears.from_pandas(
  X,
+ dist_type_schema=dist_type_schema,
  lasso_beta=self.alpha,
  ridge_beta=self.beta,
  hidden_layer_units=self.hidden_layer_units,
@@ -220,42 +233,39 @@ def fit(
 
  return self
 
- def _predict_from_parents(self, X: Union[pd.DataFrame, np.ndarray]):
-
- # extract the base solver
- structure_learner = self.graph_.graph["structure_learner"]
-
- # convert the predict data to pytorch tensor
- X = torch.from_numpy(X).float().to(structure_learner.device)
- # need to concat y onto X so that the dimensions are the same
- y = torch.zeros(X.shape[0], 1).float().to(structure_learner.device)
- X = torch.cat([X, y], dim=1)
-
- # perform forward reconstruction
- X_hat = structure_learner(X)
-
- # FUTURE NOTE: with dtypes the projection from latent -> dtype goes here
-
- # extract the desired y column, return as array
- y_pred = X_hat[:, -1]
- return y_pred.cpu().detach().numpy()
-
  def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
  """
- Get the predictions of the structured model.
- This is done by multiplying the edge weights with the feature i.e. X @ W
+ Uses the fitted NOTEARS algorithm to reconstruct y from known X data.
+
+ Returns:
+ Predicted y values for each row of X.
  """
  # force convert to ndarray
  X = np.asarray(X)
  if self.standardize:
- X = self.ss_X.transform(X)
+ X = X.copy()
+ X[:, self.continuous_col_idxs] = self._ss_X.transform(
+ X[:, self.continuous_col_idxs]
+ )
+
+ # insert dummy y column
+ y_fill = np.zeros(shape=(X.shape[0], 1))
+ X = np.hstack([X, y_fill])
 
  # check that the model has been fit
  check_is_fitted(self, "graph_")
 
- y_pred = np.asarray(self._predict_from_parents(X))
- if self.standardize:
- y_pred = self.ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
+ # extract the base solver
+ structure_learner = self.graph_.graph["structure_learner"]
+ # use base solver to reconstruct data
+ X_hat = structure_learner.reconstruct_data(X)
+ # pull off reconstructed y column
+ y_pred = X_hat[:, -1]
+
+ # inverse-standardize
+ if self.standardize and self._target_dist_type() == "cont":
+ y_pred = self._ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
+
  return y_pred
 
  def get_edges_to_node(self, name: str, data: str = "weight") -> pd.Series:
@@ -321,9 +331,7 @@ def plot_dag(self, enforce_dag: bool = False, filename: str = "./graph.png"):
  # pylint: disable=import-outside-toplevel
  from IPython.display import Image
  except ImportError as e:
- raise ImportError(
- "DAGRegressor.plot_dag method requires IPython installed."
- ) from e
+ raise ImportError("plot_dag method requires IPython installed.") from e
 
  check_is_fitted(self, "graph_")