Skip to content

Commit

Permalink
Add sklearn binary classifier (mckinsey#90)
Browse files Browse the repository at this point in the history
* binary dtype folder and __init__

* dtype base class

* continuous dtype

* binary dtype

* update core

* make plural

* update interface for idx

* minor variable name change

* notears update

* python 3.5 support

* fix fstring

* remove categorical methods, docstrings

* formatting and docstrings

* remove redundant cat code

* isort

* indexerror check

* defensive check tests

* datatype loss tests

* more test coverage

* more tests and formatting

* fix test import

* remove double test

* linting

* docstring and pylint

* docstring fix

Co-authored-by: Zain Patel <zain.patel@quantumblack.com>

* fix long string

Co-authored-by: Zain Patel <zain.patel@quantumblack.com>

* docstring fix

Co-authored-by: Zain Patel <zain.patel@quantumblack.com>

* remove relative imports

* docstring fix

* dict comprehension

* list comprehension and neatness

* remove unused import to __init__

* fix test

* remove unused return interface

* new sklearn folder structure

* sklearn class outline

* new dtype interface

* docstring clarification

* inverse link function

* add binary f1score tests

* one datatype instance per feature

* rename dtype -> disttype, attach dists to nodes

* fix tests

* fix linting

* fix preserve node dtyper

* fix tests

* fix tests

* fix tests

* final docstring and test fixes

* lint fix

* test_fix, warning

* linting

* fix test

* fix tests

* reduce threshold of test

* docstring clarification

* _target_dist_type injection

* docstring updates + clf fit outline

* old docstring deprecation

* clf predict_proba and predict

* return bugfix

* docstring update

* import fix, linting, clf fit finished

* args docstring and None schema

* raise better error

* black linting

* linting

* revert to public interface

* remove warning

* remove useless suppression and import

* add useless change to resolve merge conflict

* update inits

* standardization and data reconstruction

* remove unused imports

* fix clf .predict()

* regressor fit_predict

* remove useless regressor predict

* test import fix

* fix warnings

* pass series name thru

* fix schema pass thru

* better dict comprehension

Co-authored-by: Zain Patel <zain.patel@quantumblack.com>

* import and comment fixes

* update to .format()

* fix sklearn is fitted test

* more dtype schema insertion

* DAGRegressor test fix

* dag regressor test

* more linting

* big test restructure

* combined test suite

* error string update

* more test coverage

* linting and isort

* move test to combined test

* return float64 preds

* moar clf tests

* remove untestable (multiclass) code

* class number error test

* black reformat

* docstrings, pylint

* fix test bug

* standard scaler for _base

* pull classes direct from LabelEncoder

* update tutorial

Co-authored-by: Zain Patel <zain.patel@quantumblack.com>
  • Loading branch information
angeldrothqb and mzjp2 authored Sep 18, 2020
1 parent 1701a45 commit 04bceb4
Show file tree
Hide file tree
Showing 13 changed files with 862 additions and 187 deletions.
11 changes: 9 additions & 2 deletions causalnex/structure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,14 @@
``causalnex.structure`` provides functionality to define or learn structure.
"""

__all__ = ["StructureModel", "notears", "dynotears", "data_generators", "DAGRegressor"]
__all__ = [
"StructureModel",
"notears",
"dynotears",
"data_generators",
"DAGRegressor",
"DAGClassifier",
]

from .sklearn import DAGRegressor
from .pytorch import DAGClassifier, DAGRegressor
from .structuremodel import StructureModel
3 changes: 2 additions & 1 deletion causalnex/structure/pytorch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
``causalnex.structure.pytorch`` provides functionality to define or learn structure using pytorch.
"""

__all__ = ["from_numpy", "from_pandas", "NotearsMLP"]
__all__ = ["from_numpy", "from_pandas", "NotearsMLP", "DAGRegressor", "DAGClassifier"]

from .core import NotearsMLP
from .notears import from_numpy, from_pandas
from .sklearn import DAGClassifier, DAGRegressor
25 changes: 25 additions & 0 deletions causalnex/structure/pytorch/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,31 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # [n, d] -> [n, d]
x = x.squeeze(dim=2) # [n, d]
return x

def reconstruct_data(self, X: np.ndarray) -> np.ndarray:
    """
    Performs X_hat reconstruction,
    then converts latent space to original data space via link function.

    Args:
        X: input data used to reconstruct. Any array-like that numpy can
            convert to a float32 array is accepted (ndarray, nested lists,
            non-contiguous or reversed views).

    Returns:
        reconstructed data as a float64 numpy array
    """
    # torch.from_numpy only accepts ndarrays without negative strides, so
    # normalise the input to a contiguous float32 array before wrapping it.
    X = np.ascontiguousarray(X, dtype=np.float32)

    with torch.no_grad():
        # perform forward reconstruction on the model's device
        X_hat = self(torch.from_numpy(X).to(self.device))

        # recover each one of the latent space projections
        for dist_type in self.dist_types:
            X_hat = dist_type.inverse_link_function(X_hat)

    return X_hat.detach().cpu().numpy().astype(np.float64)

@property
def bias(self) -> Union[np.ndarray, None]:
"""
Expand Down
16 changes: 16 additions & 0 deletions causalnex/structure/pytorch/dist_type/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,19 @@ def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor:
Scalar pytorch tensor of the reconstruction loss between X and X_hat.
"""
raise NotImplementedError("Must implement the loss() method")

@abstractmethod
def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor:
    """
    Map reconstructed data from the latent space back to the original dtype
    by applying the inverse link function.

    Args:
        X_hat: Reconstructed data in the latent space.

    Returns:
        Modified X_hat.
        MUST be same shape as passed in data.
        Projects the self.idx column from the latent space to the dist_type space.
    """
    message = "Must implement the inverse_link_function() method"
    raise NotImplementedError(message)
16 changes: 15 additions & 1 deletion causalnex/structure/pytorch/dist_type/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,20 @@ def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor:
return nn.functional.binary_cross_entropy_with_logits(
input=X_hat[:, self.idx],
target=X[:, self.idx],
reduce=True,
reduction="mean",
)

def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor:
    """
    Sigmoid (inverse-logit) inverse link function for binary data.

    Args:
        X_hat: Reconstructed data in the latent space.

    Returns:
        The X_hat tensor that was passed in, with its self.idx column
        overwritten in place by the sigmoid of that column.
        The shape is unchanged.
    """
    latent_column = X_hat[:, self.idx]
    X_hat[:, self.idx] = torch.sigmoid(latent_column)
    return X_hat
14 changes: 14 additions & 0 deletions causalnex/structure/pytorch/dist_type/continuous.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,17 @@ def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor:
return (0.5 / X.shape[0]) * torch.sum(
(X_hat[:, self.idx] - X[:, self.idx]) ** 2
)

def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor:
    """
    Identity inverse link function for continuous data.

    Args:
        X_hat: Reconstructed data in the latent space.

    Returns:
        X_hat exactly as it was passed in; no projection is applied.
    """
    return X_hat
35 changes: 35 additions & 0 deletions causalnex/structure/pytorch/sklearn/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
# or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.
"""
``causalnex.structure.pytorch.sklearn`` provides sklearn style functionality to NOTEARS.
"""

__all__ = ["DAGRegressor", "DAGClassifier"]

from .clf import DAGClassifier
from .reg import DAGRegressor
Original file line number Diff line number Diff line change
Expand Up @@ -26,57 +26,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains the implementation of ``DAGRegressor``.
This module contains the implementation of ``DAGBase``.
``DAGRegressor`` is a class which wraps the StructureModel in an sklearn interface for regression.
``DAGBase`` is a class which provides an interface and common function for sklearn style NOTEARS functions.
"""

import copy
import warnings
from typing import Iterable, List, Union
from abc import ABCMeta, abstractmethod
from typing import Dict, Iterable, List, Union

import numpy as np
import pandas as pd
import torch
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted, check_X_y

from causalnex.plots import EDGE_STYLE, NODE_STYLE, plot_structure
from causalnex.structure.pytorch import notears


class DAGRegressor(
BaseEstimator, RegressorMixin
class DAGBase(
BaseEstimator, metaclass=ABCMeta
): # pylint: disable=too-many-instance-attributes
"""
Regressor wrapper of the StructureModel.
Base class for all sklearn wrappers of the StructureModel.
Implements the sklearn .fit and .predict interface.
Currently only supports linear NOTEARS fitting by the DAG.
Example:
::
>>> from causalnex.sklearn import DAGRegressor
>>>
>>> smr = DAGRegressor(threshold=0.1)
>>> smr.fit(X_train, y_train)
>>>
>>> y_preds = smr.predict(X_test)
>>> type(y_preds)
np.ndarray
>>>
>>> type(smr.feature_importances_)
np.ndarray
::
Attributes:
feature_importances_ (np.ndarray): An array of edge weights corresponding
positionally to the feature X.
"""

# pylint: disable=too-many-arguments
def __init__(
self,
dist_type_schema: Dict[Union[str, int], str] = None,
alpha: float = 0.0,
beta: float = 0.0,
fit_intercept: bool = True,
Expand All @@ -92,6 +72,12 @@ def __init__(
):
"""
Args:
dist_type_schema: The dist type schema corresponding to the X data passed to fit or predict.
It maps the pandas column name in X to the string alias of a dist type.
If X is a np.ndarray, it maps the positional index to the string alias of a dist type.
A list of alias names can be found in ``dist_type/__init__.py``.
If None, assumes that all data in X is continuous.
alpha: l1 loss weighting. When using nonlinear layers this is only applied
to the first layer.
Expand Down Expand Up @@ -140,6 +126,7 @@ def __init__(
self.beta = beta
self.fit_intercept = fit_intercept
self.hidden_layer_units = hidden_layer_units
self.dist_type_schema = dist_type_schema
self.threshold = threshold
self.tabu_edges = tabu_edges
self.tabu_parent_nodes = tabu_parent_nodes
Expand All @@ -160,31 +147,56 @@ def __init__(
self.enforce_dag = enforce_dag
self.standardize = standardize

def fit(
self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]
) -> "DAGRegressor":
@abstractmethod
def _target_dist_type(self) -> str:
    """
    Return the dist_type alias of the target variable.

    NOTE:
        Subclasses must override this method to return a dist_type alias.
    """
    message = "Must implement _target_dist_type()"
    raise NotImplementedError(message)

def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]):
"""
Fits the sm model using the concat of X and y.
"""

# defensive X, y checks
check_X_y(X, y, y_numeric=True)

# force as DataFrame and Series (for later calculations)
# force X, y to DataFrame, Series for later calculations
X = pd.DataFrame(X)
y = pd.Series(y)
# force name so that name != None (causes errors in notears)
y.name = y.name or "__target"

# if self.dist_type_schema is None, assume all columns are continuous
dist_type_schema = self.dist_type_schema or {col: "cont" for col in X.columns}

if self.standardize:
self.ss_X = StandardScaler()
self.ss_y = StandardScaler()
X = pd.DataFrame(self.ss_X.fit_transform(X), columns=X.columns)
y = pd.Series(
self.ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1),
name=y.name,
# only standardize the continuous dist type columns.
self.continuous_col_idxs = [
X.columns.get_loc(col)
for col, alias in dist_type_schema.items()
if alias == "cont"
]

# copy X to prevent changes to underlying array data
X = X.copy()
self._ss_X = StandardScaler()
X.iloc[:, self.continuous_col_idxs] = self._ss_X.fit_transform(
X.iloc[:, self.continuous_col_idxs]
)

# if it's a continuous target also standardize
if self._target_dist_type() == "cont":
y = y.copy()
self._ss_y = StandardScaler()
y[:] = self._ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1)

# add the target to the dist_type_schema
# NOTE: this must be done AFTER standardize
dist_type_schema[y.name] = self._target_dist_type()

# preserve the feature and target colnames
self._features = tuple(X.columns)
self._target = y.name
Expand All @@ -203,6 +215,7 @@ def fit(
# fit the structured model
self.graph_ = notears.from_pandas(
X,
dist_type_schema=dist_type_schema,
lasso_beta=self.alpha,
ridge_beta=self.beta,
hidden_layer_units=self.hidden_layer_units,
Expand All @@ -220,42 +233,39 @@ def fit(

return self

def _predict_from_parents(self, X: Union[pd.DataFrame, np.ndarray]):
    """
    Reconstruct the target column from the feature data using the fitted
    structure learner stored on ``self.graph_``.

    Args:
        X: feature data; it is handed to ``torch.from_numpy``.

    Returns:
        The last column of the reconstruction as a numpy array.
    """
    # extract the base solver
    learner = self.graph_.graph["structure_learner"]
    device = learner.device

    # convert the predict data to pytorch tensor
    features = torch.from_numpy(X).float().to(device)
    # a zero-filled y column is appended so that the dimensions are the same
    placeholder = torch.zeros(features.shape[0], 1).float().to(device)

    # perform forward reconstruction
    reconstruction = learner(torch.cat([features, placeholder], dim=1))

    # FUTURE NOTE: with dtypes the projection from latent -> dtype goes here

    # extract the desired y column, return as array
    return reconstruction[:, -1].cpu().detach().numpy()

def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""
Get the predictions of the structured model.
This is done by multiplying the edge weights with the feature i.e. X @ W
Uses the fitted NOTEARS algorithm to reconstruct y from known X data.
Returns:
Predicted y values for each row of X.
"""
# force convert to ndarray
X = np.asarray(X)
if self.standardize:
X = self.ss_X.transform(X)
X = X.copy()
X[:, self.continuous_col_idxs] = self._ss_X.transform(
X[:, self.continuous_col_idxs]
)

# insert dummy y column
y_fill = np.zeros(shape=(X.shape[0], 1))
X = np.hstack([X, y_fill])

# check that the model has been fit
check_is_fitted(self, "graph_")

y_pred = np.asarray(self._predict_from_parents(X))
if self.standardize:
y_pred = self.ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
# extract the base solver
structure_learner = self.graph_.graph["structure_learner"]
# use base solver to reconstruct data
X_hat = structure_learner.reconstruct_data(X)
# pull off reconstructed y column
y_pred = X_hat[:, -1]

# inverse-standardize
if self.standardize and self._target_dist_type() == "cont":
y_pred = self._ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)

return y_pred

def get_edges_to_node(self, name: str, data: str = "weight") -> pd.Series:
Expand Down Expand Up @@ -321,9 +331,7 @@ def plot_dag(self, enforce_dag: bool = False, filename: str = "./graph.png"):
# pylint: disable=import-outside-toplevel
from IPython.display import Image
except ImportError as e:
raise ImportError(
"DAGRegressor.plot_dag method requires IPython installed."
) from e
raise ImportError("plot_dag method requires IPython installed.") from e

check_is_fitted(self, "graph_")

Expand Down
Loading

0 comments on commit 04bceb4

Please sign in to comment.