From aefa6e9fb767b4aef8230cce79f937c22ed4273d Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Mon, 15 Jun 2020 11:13:33 -0400 Subject: [PATCH] Enable model serialization --- econml/_ortho_learner.py | 14 +- econml/_rlearner.py | 153 +++---- econml/dml.py | 40 +- econml/drlearner.py | 221 +++++----- econml/metalearners.py | 12 +- econml/ortho_forest.py | 6 +- econml/ortho_iv.py | 753 +++++++++++++++++---------------- econml/tests/test_dml.py | 8 + econml/tests/test_drlearner.py | 8 + econml/tests/test_ortho_iv.py | 8 + econml/utilities.py | 38 ++ 11 files changed, 678 insertions(+), 583 deletions(-) diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 65854a3ce..23844a7e7 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -29,7 +29,7 @@ class in this module implements the general logic in a very versatile way from warnings import warn from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose, inverse_onehot, broadcast_unit_treatments, reshape_treatmentwise_effects, - StatsModelsLinearRegression, LassoCVWrapper) + StatsModelsLinearRegression, _EncoderWrapper) from sklearn.model_selection import KFold, StratifiedKFold, check_cv from sklearn.linear_model import LinearRegression, LassoCV from sklearn.preprocessing import (PolynomialFeatures, LabelEncoder, OneHotEncoder, @@ -551,12 +551,10 @@ def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None): else: to_split = Z # just stratify on Z - z_ohe = OneHotEncoder(categories='auto', sparse=False) - Z = z_ohe.fit_transform(reshape(Z, (-1, 1)))[:, 1:] + z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first') + Z = z_ohe.fit_transform(reshape(Z, (-1, 1))) self.z_transformer = FunctionTransformer( - func=(lambda Z: - z_ohe.transform( - reshape(z_enc.transform(Z.ravel()), (-1, 1)))[:, 1:]), + func=_EncoderWrapper(z_ohe, z_enc).encode, validate=False) else: # stratify on T if discrete, and fine to pass T as second arg to KFold.split even when not @@ -582,9 +580,7 @@ def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None): if self._discrete_treatment: self._d_t = shape(T)[1:] self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - reshape(T, (-1, 1)))), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) nuisances, fitted_models, fitted_inds, scores = _crossfit(self._model_nuisance, folds, diff --git a/econml/_rlearner.py b/econml/_rlearner.py index 6763d14db..153ff260e 100644 --- a/econml/_rlearner.py +++ b/econml/_rlearner.py @@ -34,6 +34,82 @@ from ._ortho_learner import _OrthoLearner +class _ModelNuisance: + """ + Nuisance model fits the model_y and model_t at fit time and at predict time + calculates the residual Y and residual T based on the fitted models and returns + the residuals as two nuisance parameters. + """ + + def __init__(self, model_y, model_t): + self._model_y = clone(model_y, safe=False) + self._model_t = clone(model_t, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert Z is None, "Cannot accept instrument!" 
+ self._model_t.fit(X, W, T, sample_weight=sample_weight) + self._model_y.fit(X, W, Y, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_y, 'score'): + Y_score = self._model_y.score(X, W, Y, sample_weight=sample_weight) + else: + Y_score = None + if hasattr(self._model_t, 'score'): + T_score = self._model_t.score(X, W, T, sample_weight=sample_weight) + else: + T_score = None + return Y_score, T_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_y.predict(X, W) + T_pred = self._model_t.predict(X, W) + if (X is None) and (W is None): # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + return Y_res, T_res + + +class _ModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final): + self._model_final = clone(model_final, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + class _RLearner(_OrthoLearner): """ Base class for CATE learners that residualize treatment and outcome and run residual on residual regression. @@ -202,82 +278,9 @@ def predict(self, X): def __init__(self, model_y, model_t, model_final, discrete_treatment, categories, n_splits, random_state): - class ModelNuisance: - """ - Nuisance model fits the model_y and model_t at fit time and at predict time - calculates the residual Y and residual T based on the fitted models and returns - the residuals as two nuisance parameters. - """ - - def __init__(self, model_y, model_t): - self._model_y = clone(model_y, safe=False) - self._model_t = clone(model_t, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert Z is None, "Cannot accept instrument!" 
- self._model_t.fit(X, W, T, sample_weight=sample_weight) - self._model_y.fit(X, W, Y, sample_weight=sample_weight) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - if hasattr(self._model_y, 'score'): - Y_score = self._model_y.score(X, W, Y, sample_weight=sample_weight) - else: - Y_score = None - if hasattr(self._model_t, 'score'): - T_score = self._model_t.score(X, W, T, sample_weight=sample_weight) - else: - T_score = None - return Y_score, T_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_y.predict(X, W) - T_pred = self._model_t.predict(X, W) - if (X is None) and (W is None): # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred.reshape(T.shape) - return Y_res, T_res - - class ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final): - self._model_final = clone(model_final, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) - return self - def predict(self, X=None): - return self._model_final.predict(X) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred)**2) - - super().__init__(ModelNuisance(model_y, model_t), - ModelFinal(model_final), + super().__init__(_ModelNuisance(model_y, model_t), + _ModelFinal(model_final), discrete_treatment=discrete_treatment, discrete_instrument=False, # no instrument, so doesn't matter categories=categories, diff --git a/econml/dml.py b/econml/dml.py index 3b5986bb3..acf6d585f 100644 --- a/econml/dml.py +++ b/econml/dml.py @@ -37,7 +37,7 @@ import copy from warnings import warn from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose, inverse_onehot, - broadcast_unit_treatments, reshape_treatmentwise_effects, + broadcast_unit_treatments, reshape_treatmentwise_effects, add_intercept, StatsModelsLinearRegression, LassoCVWrapper, check_high_dimensional) from econml.sklearn_extensions.linear_model import MultiOutputDebiasedLasso, WeightedLassoCVWrapper from econml.sklearn_extensions.ensemble import SubsampledHonestForest @@ -126,14 +126,13 @@ def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick else: self._fit_cate_intercept = fit_cate_intercept if self._fit_cate_intercept: - add_intercept = FunctionTransformer(lambda F: - hstack([np.ones((F.shape[0], 1)), F]), - validate=True) + add_intercept_trans = 
FunctionTransformer(add_intercept, + validate=True) if featurizer: self._featurizer = Pipeline([('featurize', self._original_featurizer), - ('add_intercept', add_intercept)]) + ('add_intercept', add_intercept_trans)]) else: - self._featurizer = add_intercept + self._featurizer = add_intercept_trans else: self._featurizer = self._original_featurizer @@ -704,6 +703,21 @@ def fit(self, Y, T, X=None, W=None, sample_weight=None, sample_var=None, inferen return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, inference=inference) +class _RandomFeatures(TransformerMixin): + def __init__(self, dim, bw, random_state): + self._dim = dim + self._bw = bw + self._random_state = check_random_state(random_state) + + def fit(self, X): + self.omegas = self._random_state.normal(0, 1 / self._bw, size=(shape(X)[1], self._dim)) + self.biases = self._random_state.uniform(0, 2 * np.pi, size=(1, self._dim)) + return self + + def transform(self, X): + return np.sqrt(2 / self._dim) * np.cos(np.matmul(X, self.omegas) + self.biases) + + class KernelDMLCateEstimator(DMLCateEstimator): """ A specialized version of the linear Double ML Estimator that uses random fourier features. @@ -764,21 +778,9 @@ class KernelDMLCateEstimator(DMLCateEstimator): def __init__(self, model_y=WeightedLassoCVWrapper(), model_t='auto', fit_cate_intercept=True, dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, random_state=None): - class RandomFeatures(TransformerMixin): - def __init__(self, random_state): - self._random_state = check_random_state(random_state) - - def fit(self, X): - self.omegas = self._random_state.normal(0, 1 / bw, size=(shape(X)[1], dim)) - self.biases = self._random_state.uniform(0, 2 * np.pi, size=(1, dim)) - return self - - def transform(self, X): - return np.sqrt(2 / dim) * np.cos(np.matmul(X, self.omegas) + self.biases) - super().__init__(model_y=model_y, model_t=model_t, model_final=ElasticNetCV(fit_intercept=False), - featurizer=RandomFeatures(random_state), + featurizer=_RandomFeatures(dim, bw, random_state), fit_cate_intercept=fit_cate_intercept, discrete_treatment=discrete_treatment, categories=categories, diff --git a/econml/drlearner.py b/econml/drlearner.py index 289952f8c..421295cd4 100644 --- a/econml/drlearner.py +++ b/econml/drlearner.py @@ -48,6 +48,116 @@ def _filter_none_kwargs(**kwargs): return out_kwargs +class _ModelNuisance: + def __init__(self, model_propensity, model_regression, min_propensity): + self._model_propensity = model_propensity + self._model_regression = model_regression + self._min_propensity = min_propensity + + def _combine(self, X, W): + return np.hstack([arr for arr in [X, W] if arr is not None]) + + def fit(self, Y, T, X=None, W=None, *, sample_weight=None): + if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): + raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " + "instead got {1}.".format(len(X), Y.shape)) + if (X is None) and (W is None): + raise AttributeError("At least one of X or W has to not be None!") + if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + XW = self._combine(X, W) + filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight) + self._model_propensity.fit(XW, inverse_onehot(T), **filtered_kwargs) + self._model_regression.fit(np.hstack([XW, T]), Y, **filtered_kwargs) + return self + + def score(self, Y, T, X=None, W=None, *, 
sample_weight=None):
+ XW = self._combine(X, W)
+ filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight)
+
+ if hasattr(self._model_propensity, 'score'):
+ propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs)
+ else:
+ propensity_score = None
+ if hasattr(self._model_regression, 'score'):
+ regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs)
+ else:
+ regression_score = None
+
+ return propensity_score, regression_score
+
+ def predict(self, Y, T, X=None, W=None, *, sample_weight=None):
+ XW = self._combine(X, W)
+ propensities = np.maximum(self._model_propensity.predict_proba(XW), self._min_propensity)
+ n = T.shape[0]
+ # Doubly-robust pseudo-outcomes: the regression prediction for each arm, plus an
+ # inverse-propensity-weighted residual correction for the units observed in that arm
+ Y_pred = np.zeros((T.shape[0], T.shape[1] + 1))
+ T_counter = np.zeros(T.shape)
+ Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
+ Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0]
+ for t in np.arange(T.shape[1]):
+ T_counter = np.zeros(T.shape)
+ T_counter[:, t] = 1
+ Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
+ Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1]
+ return Y_pred.reshape(Y.shape + (T.shape[1] + 1,))
+
+
+class _ModelFinal:
+ # Coding Remark: The reasoning around the multitask_model_final could have been simplified if
+ # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want
+ # to allow even for model_final objects whose fit(X, y) can accept X=None
+ # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor
+ # checks that X is 2D array.
+ def __init__(self, model_final, featurizer, multitask_model_final):
+ self._model_final = clone(model_final, safe=False)
+ self._featurizer = clone(featurizer, safe=False)
+ self._multitask_model_final = multitask_model_final
+ return
+
+ def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None):
+ Y_pred, = nuisances
+ self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton)
+ if (X is not None) and (self._featurizer is not None):
+ X = self._featurizer.fit_transform(X)
+ filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var)
+ if self._multitask_model_final:
+ ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm
+ if self.d_y: # need to squeeze out singleton so that we fit on 2D array
+ ys = ys.squeeze(1)
+ self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs)
+ else:
+ self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0],
+ **filtered_kwargs)
+ for t in np.arange(1, Y_pred.shape[-1])]
+ return self
+
+ def predict(self, X=None):
+ if (X is not None) and (self._featurizer is not None):
+ X = self._featurizer.transform(X)
+ if self._multitask_model_final:
+ pred = self.model_cate.predict(X)
+ if self.d_y: # need to reintroduce singleton Y dimension
+ return pred[:, np.newaxis, :]
+ return pred
+ else:
+ preds = np.array([mdl.predict(X) for mdl in self.models_cate])
+ return np.moveaxis(preds, 0, -1) # move treatment dim to end
+
+ def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None):
+ if (X is not None) and (self._featurizer is not None):
+ X = self._featurizer.transform(X)
+ Y_pred, = nuisances
+ if self._multitask_model_final:
+ return
np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2, + weights=sample_weight, axis=0)) + else: + return np.mean([np.average((Y_pred[..., t] - Y_pred[..., 0] - + self.models_cate[t - 1].predict(X))**2, + weights=sample_weight, axis=0) + for t in np.arange(1, Y_pred.shape[-1])]) + + class DRLearner(_OrthoLearner): """ CATE estimator that uses doubly-robust correction techniques to account for @@ -257,116 +367,9 @@ def __init__(self, model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', m categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - def __init__(self, model_propensity, model_regression): - self._model_propensity = model_propensity - self._model_regression = model_regression - - def _combine(self, X, W): - return np.hstack([arr for arr in [X, W] if arr is not None]) - - def fit(self, Y, T, X=None, W=None, *, sample_weight=None): - if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): - raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " - "instead got {1}.".format(len(X), Y.shape)) - if (X is None) and (W is None): - raise AttributeError("At least one of X or W has to not be None!") - if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - XW = self._combine(X, W) - filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight) - self._model_propensity.fit(XW, inverse_onehot(T), **filtered_kwargs) - self._model_regression.fit(np.hstack([XW, T]), Y, **filtered_kwargs) - return self - - def score(self, Y, T, X=None, W=None, *, sample_weight=None): - XW = self._combine(X, W) - filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight) - - if hasattr(self._model_propensity, 'score'): - propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) - else: - propensity_score = None - if hasattr(self._model_regression, 'score'): - regression_score = self._model_regression.fit(np.hstack([XW, T]), Y, **filtered_kwargs) - else: - regression_score = None - - return propensity_score, regression_score - - def predict(self, Y, T, X=None, W=None, *, sample_weight=None): - XW = self._combine(X, W) - propensities = np.maximum(self._model_propensity.predict_proba(XW), min_propensity) - n = T.shape[0] - Y_pred = np.zeros((T.shape[0], T.shape[1] + 1)) - T_counter = np.zeros(T.shape) - Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) - Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0] - for t in np.arange(T.shape[1]): - T_counter = np.zeros(T.shape) - T_counter[:, t] = 1 - Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) - Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1] - return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)) - - class ModelFinal: - # Coding Remark: The reasoning around the multitask_model_final could have been simplified if - # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want - # to allow even for model_final objects whose fit(X, y) can accept X=None - # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor - # checks that X is 2D array. 
- def __init__(self, model_final, featurizer, multitask_model_final): - self._model_final = clone(model_final, safe=False) - self._featurizer = clone(featurizer, safe=False) - self._multitask_model_final = multitask_model_final - return - - def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): - Y_pred, = nuisances - self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton) - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.fit_transform(X) - filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var) - if self._multitask_model_final: - ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm - if self.d_y: # need to squeeze out singleton so that we fit on 2D array - ys = ys.squeeze(1) - self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs) - else: - self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0], - **filtered_kwargs) - for t in np.arange(1, Y_pred.shape[-1])] - return self - - def predict(self, X=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - if self._multitask_model_final: - pred = self.model_cate.predict(X) - if self.d_y: # need to reintroduce singleton Y dimension - return pred[:, np.newaxis, :] - return pred - else: - preds = np.array([mdl.predict(X) for mdl in self.models_cate]) - return np.moveaxis(preds, 0, -1) # move treatment dim to end - - def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - Y_pred, = nuisances - if self._multitask_model_final: - return np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2, - weights=sample_weight, axis=0)) - else: - return np.mean([np.average((Y_pred[..., t] - Y_pred[..., 0] - - self.models_cate[t - 1].predict(X))**2, - weights=sample_weight, axis=0) - for t in np.arange(1, Y_pred.shape[-1])]) - self._multitask_model_final = multitask_model_final - super().__init__(ModelNuisance(model_propensity, model_regression), - ModelFinal(model_final, featurizer, multitask_model_final), + super().__init__(_ModelNuisance(model_propensity, model_regression, min_propensity), + _ModelFinal(model_final, featurizer, multitask_model_final), n_splits=n_splits, discrete_treatment=True, discrete_instrument=False, # no instrument, so doesn't matter categories=categories, diff --git a/econml/metalearners.py b/econml/metalearners.py index edfc8bb3b..31a2090d9 100644 --- a/econml/metalearners.py +++ b/econml/metalearners.py @@ -16,7 +16,7 @@ from sklearn.utils import check_array, check_X_y from sklearn.preprocessing import OneHotEncoder, FunctionTransformer from .utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects, - inverse_onehot, transpose) + inverse_onehot, transpose, _EncoderWrapper) class TLearner(TreatmentExpansionMixin, LinearCateEstimator): @@ -40,8 +40,7 @@ def __init__(self, models, categories='auto'): categories = [categories] # OneHotEncoder expects a 2D array with features per column self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform(T.reshape(-1, 1))), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) super().__init__() @@ 
-129,7 +128,7 @@ def __init__(self, overall_model, categories='auto'): # We might want to revisit, though, since it's linearly determined by the others self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False) self.transformer = FunctionTransformer( - func=(lambda T: self._one_hot_encoder.transform(T.reshape(-1, 1))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder, drop_first=True).encode, validate=False) super().__init__() @@ -233,8 +232,7 @@ def __init__(self, models, categories = [categories] # OneHotEncoder expects a 2D array with features per column self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform(T.reshape(-1, 1))), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) super().__init__() @@ -357,7 +355,7 @@ def __init__(self, models, categories = [categories] # OneHotEncoder expects a 2D array with features per column self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') self.transformer = FunctionTransformer( - func=(lambda T: self._one_hot_encoder.transform(T.reshape(-1, 1))), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) super().__init__() diff --git a/econml/ortho_forest.py b/econml/ortho_forest.py index 61e30ebe0..e319c9f83 100644 --- a/econml/ortho_forest.py +++ b/econml/ortho_forest.py @@ -38,7 +38,8 @@ from .cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin from .causal_tree import CausalTree from .inference import Inference -from .utilities import reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, cross_product, inverse_onehot +from .utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, + cross_product, inverse_onehot, _EncoderWrapper) def _build_tree_in_parallel(Y, T, X, W, @@ -798,8 +799,7 @@ def fit(self, Y, T, X, W=None, inference=None): self.second_stage_nuisance_estimator = DiscreteTreatmentOrthoForest.nuisance_estimator_generator( self.propensity_model_final, self.model_Y_final, self.n_T, self.random_state, second_stage=True) self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform(T.reshape(-1, 1))), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) # Call `fit` from parent class return super().fit(Y, T, X, W=W, inference=inference) diff --git a/econml/ortho_iv.py b/econml/ortho_iv.py index 7841a2ba1..0c417e114 100644 --- a/econml/ortho_iv.py +++ b/econml/ortho_iv.py @@ -21,7 +21,7 @@ from ._ortho_learner import _OrthoLearner from .dml import _FinalWrapper -from .utilities import (hstack, StatsModelsLinearRegression, inverse_onehot) +from .utilities import (hstack, StatsModelsLinearRegression, inverse_onehot, add_intercept) from .inference import StatsModelsInference from .cate_estimator import StatsModelsCateEstimatorMixin @@ -92,48 +92,50 @@ def predict(self, X, Z=None): return self._model.predict(self._combine(X, Z, n_samples, fitting=False)) +class _BaseDMLATEIVModelFinal: + def __init__(self): + self._first_stage = LinearRegression(fit_intercept=False) + self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), + fit_cate_intercept=True, featurizer=None, use_weight_trick=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Z_res.ndim == 1: + Z_res = Z_res.reshape(-1, 1) + # DMLATEIV is just like 2SLS; first regress T_res on 
Z_res, then regress Y_res on predicted T_res + T_res_pred = self._first_stage.fit(Z_res, T_res, + sample_weight=sample_weight).predict(Z_res) + # TODO: allow the final model to actually use X? Then we'd need to rename the class + # since we would actually be calculating a CATE rather than ATE. + self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) + return self + + def predict(self, X=None): + # TODO: allow the final model to actually use X? + return self._model_final.predict(X=None) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + # TODO: allow the final model to actually use X? + effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + class _BaseDMLATEIV(_OrthoLearner): def __init__(self, model_nuisance, discrete_instrument=False, discrete_treatment=False, categories='auto', n_splits=2, random_state=None): - class ModelFinal: - def __init__(self): - self._first_stage = LinearRegression(fit_intercept=False) - self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), - fit_cate_intercept=True, featurizer=None, use_weight_trick=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res, Z_res = nuisances - if Z_res.ndim == 1: - Z_res = Z_res.reshape(-1, 1) - # DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res - T_res_pred = self._first_stage.fit(Z_res, T_res, - sample_weight=sample_weight).predict(Z_res) - # TODO: allow the final model to actually use X? Then we'd need to rename the class - # since we would actually be calculating a CATE rather than ATE. - self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) - return self - - def predict(self, X=None): - # TODO: allow the final model to actually use X? - return self._model_final.predict(X=None) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res, Z_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - # TODO: allow the final model to actually use X? 
- effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred) ** 2) - - super().__init__(model_nuisance, ModelFinal(), + + super().__init__(model_nuisance, _BaseDMLATEIVModelFinal(), discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, categories=categories, n_splits=n_splits, random_state=random_state) @@ -198,6 +200,48 @@ def score(self, Y, T, Z, W=None): return super().score(Y, T, W=W, Z=Z) +class _DMLATEIVModelNuisance: + def __init__(self, model_Y_X, model_T_X, model_Z_X): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_Z_X = clone(model_Z_X, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_X.fit(W, Y, sample_weight=sample_weight) + self._model_T_X.fit(W, T, sample_weight=sample_weight) + self._model_Z_X.fit(W, Z, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(W, Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_X, 'score'): + T_X_score = self._model_T_X.score(W, T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_Z_X, 'score'): + Z_X_score = self._model_Z_X.score(W, Z, sample_weight=sample_weight) + else: + Z_X_score = None + return Y_X_score, T_X_score, Z_X_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(W) + T_pred = self._model_T_X.predict(W) + Z_pred = self._model_Z_X.predict(W) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + Z_res = Z - Z_pred.reshape(Z.shape) + return Y_res, T_res, Z_res + + class DMLATEIV(_BaseDMLATEIV): """ Implementation of the orthogonal/double ml method for ATE estimation with @@ -216,105 +260,157 @@ def __init__(self, model_Y_X, model_T_X, model_Z_X, discrete_treatment=False, discrete_instrument=False, categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - def __init__(self, model_Y_X, model_T_X, model_Z_X): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_Z_X = clone(model_Z_X, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - self._model_Y_X.fit(W, Y, sample_weight=sample_weight) - self._model_T_X.fit(W, T, sample_weight=sample_weight) - self._model_Z_X.fit(W, Z, sample_weight=sample_weight) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - if hasattr(self._model_Y_X, 'score'): - Y_X_score = self._model_Y_X.score(W, Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_X, 'score'): - T_X_score = self._model_T_X.score(W, T, sample_weight=sample_weight) - else: - T_X_score = None - if hasattr(self._model_Z_X, 'score'): 
- Z_X_score = self._model_Z_X.score(W, Z, sample_weight=sample_weight) - else: - Z_X_score = None - return Y_X_score, T_X_score, Z_X_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(W) - T_pred = self._model_T_X.predict(W) - Z_pred = self._model_Z_X.predict(W) - if W is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) - Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred.reshape(T.shape) - Z_res = Z - Z_pred.reshape(Z.shape) - return Y_res, T_res, Z_res - super().__init__(ModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), - model_T_X=_FirstStageWrapper(model_T_X, discrete_target=discrete_treatment), - model_Z_X=_FirstStageWrapper(model_Z_X, discrete_target=discrete_instrument)), + + super().__init__(_DMLATEIVModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), + model_T_X=_FirstStageWrapper( + model_T_X, discrete_target=discrete_treatment), + model_Z_X=_FirstStageWrapper( + model_Z_X, discrete_target=discrete_instrument)), discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, random_state=random_state) +class _ProjectedDMLATEIVModelNuisance: + def __init__(self, model_Y_X, model_T_X, model_T_XZ): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_X.fit(W, Y, sample_weight=sample_weight) + self._model_T_X.fit(W, T, sample_weight=sample_weight) + self._model_T_XZ.fit(W, Z, T, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(W, Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_X, 'score'): + T_X_score = self._model_T_X.score(W, T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(W, Z, T, sample_weight=sample_weight) + else: + T_XZ_score = None + return Y_X_score, T_X_score, T_XZ_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(W) + TX_pred = self._model_T_X.predict(W) + TXZ_pred = self._model_T_XZ.predict(W, Z) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - TX_pred.reshape(T.shape) + Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res, Z_res + + class ProjectedDMLATEIV(_BaseDMLATEIV): def __init__(self, model_Y_X, model_T_X, model_T_XZ, discrete_treatment=False, discrete_instrument=False, categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - def __init__(self, model_Y_X, model_T_X, model_T_XZ): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, 
"DML ATE IV does not accept features" - self._model_Y_X.fit(W, Y, sample_weight=sample_weight) - self._model_T_X.fit(W, T, sample_weight=sample_weight) - self._model_T_XZ.fit(W, Z, T, sample_weight=sample_weight) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - if hasattr(self._model_Y_X, 'score'): - Y_X_score = self._model_Y_X.score(W, Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_X, 'score'): - T_X_score = self._model_T_X.score(W, T, sample_weight=sample_weight) - else: - T_X_score = None - if hasattr(self._model_T_XZ, 'score'): - T_XZ_score = self._model_T_XZ.score(W, Z, T, sample_weight=sample_weight) - else: - T_XZ_score = None - return Y_X_score, T_X_score, T_XZ_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(W) - TX_pred = self._model_T_X.predict(W) - TXZ_pred = self._model_T_XZ.predict(W, Z) - if W is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - TX_pred.reshape(T.shape) - Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) - return Y_res, T_res, Z_res - - super().__init__(ModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), - model_T_X=_FirstStageWrapper(model_T_X, discrete_target=discrete_treatment), - model_T_XZ=_FirstStageWrapper(model_T_XZ, discrete_target=discrete_treatment)), - discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, - categories=categories, - n_splits=n_splits, random_state=random_state) + + super().__init__(_ProjectedDMLATEIVModelNuisance( + model_Y_X=_FirstStageWrapper( + model_Y_X, discrete_target=False), + model_T_X=_FirstStageWrapper( + model_T_X, discrete_target=discrete_treatment), + model_T_XZ=_FirstStageWrapper( + model_T_XZ, discrete_target=discrete_treatment)), + discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + categories=categories, + n_splits=n_splits, random_state=random_state) + + +class _BaseDMLIVModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + # TODO: would it be useful to extend to handle controls ala vanilla DML? 
+ assert W is None, "DML IV does not accept controls" + self._model_Y_X.fit(X, Y, sample_weight=sample_weight) + self._model_T_X.fit(X, T, sample_weight=sample_weight) + self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(X, Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_X, 'score'): + T_X_score = self._model_T_X.score(X, T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(X, Z, T, sample_weight=sample_weight) + else: + T_XZ_score = None + return Y_X_score, T_X_score, T_XZ_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(X) + TXZ_pred = self._model_T_XZ.predict(X, Z) + TX_pred = self._model_T_X.predict(X) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res + + +class _BaseDMLIVModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final): + self._model_final = clone(model_final, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred)**2) class _BaseDMLIV(_OrthoLearner): @@ -389,88 +485,8 @@ class _BaseDMLIV(_OrthoLearner): def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, discrete_instrument=False, discrete_treatment=False, categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - """ - Nuisance model fits the three models at fit time and at predict time - returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. - """ - - def __init__(self, model_Y_X, model_T_X, model_T_XZ): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - # TODO: would it be useful to extend to handle controls ala vanilla DML? 
- assert W is None, "DML IV does not accept controls" - self._model_Y_X.fit(X, Y, sample_weight=sample_weight) - self._model_T_X.fit(X, T, sample_weight=sample_weight) - self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - if hasattr(self._model_Y_X, 'score'): - Y_X_score = self._model_Y_X.score(X, Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_X, 'score'): - T_X_score = self._model_T_X.score(X, T, sample_weight=sample_weight) - else: - T_X_score = None - if hasattr(self._model_T_XZ, 'score'): - T_XZ_score = self._model_T_XZ.score(X, Z, T, sample_weight=sample_weight) - else: - T_XZ_score = None - return Y_X_score, T_X_score, T_XZ_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(X) - TXZ_pred = self._model_T_XZ.predict(X, Z) - TX_pred = self._model_T_X.predict(X) - if X is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) - return Y_res, T_res - - class ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final): - self._model_final = clone(model_final, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) - return self - - def predict(self, X=None): - return self._model_final.predict(X) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred)**2) - - super().__init__(ModelNuisance(model_Y_X, model_T_X, model_T_XZ), ModelFinal(model_final), + super().__init__(_BaseDMLIVModelNuisance(model_Y_X, model_T_X, model_T_XZ), + _BaseDMLIVModelFinal(model_final), discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, categories=categories, n_splits=n_splits, random_state=random_state) @@ -818,6 +834,111 @@ def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, categories=categories) +class _BaseDRIVModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. 
+ """ + + def __init__(self, model_final, featurizer, + discrete_treatment, discrete_instrument, + fit_cate_intercept, cov_clip, opt_reweighted): + self._model_final = clone(model_final, safe=False) + self._fit_cate_intercept = fit_cate_intercept + self._original_featurizer = clone(featurizer, safe=False) + self._discrete_treatment = discrete_treatment + self._discrete_instrument = discrete_instrument + if self._fit_cate_intercept: + add_intercept_trans = FunctionTransformer(add_intercept, + validate=True) + if featurizer: + self._featurizer = Pipeline([('featurize', self._original_featurizer), + ('add_intercept', add_intercept_trans)]) + else: + self._featurizer = add_intercept_trans + else: + self._featurizer = self._original_featurizer + self._cov_clip = cov_clip + self._opt_reweighted = opt_reweighted + + def _effect_estimate(self, nuisances): + prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] + + # Estimate final model of theta(X) by minimizing the square loss: + # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 + # We clip the covariance so that it is bounded away from zero, so as to reduce variance + # at the expense of some small bias. For points with very small covariance we revert + # to the model-based preliminary estimate and do not add the correction term. + cov_sign = np.sign(cov) + cov_sign[cov_sign == 0] = 1 + clipped_cov = cov_sign * np.clip(np.abs(cov), + self._cov_clip, np.inf) + return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + self.d_y = Y.shape[1:] + self.d_t = nuisances[1].shape[1:] + self.d_z = nuisances[3].shape[1:] + + # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, + # instruments, and outcomes + if self.d_y and self.d_y[0] > 2: + raise AttributeError("DRIV only supports a single outcome") + + if self.d_t and self.d_t[0] > 1: + if self._discrete_treatment: + raise AttributeError("DRIV only supports binary treatments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous treatments") + + if self.d_z and self.d_z[0] > 1: + if self._discrete_instrument: + raise AttributeError("DRIV only supports binary instruments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous instruments") + + theta_dr, clipped_cov = self._effect_estimate(nuisances) + + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.fit_transform(X) + # TODO: how do we incorporate the sample_weight and sample_var passed into this method + # as arguments? + if self._opt_reweighted: + self._model_final.fit(X, theta_dr, sample_weight=clipped_cov.ravel()**2) + else: + self._model_final.fit(X, theta_dr) + return self + + def predict(self, X=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + # TODO: is there a good way to incorporate the other nuisance terms in the score? 
+ _, T_res, Y_res, _, _ = nuisances + + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + class _BaseDRIV(_OrthoLearner): """ @@ -892,112 +1013,19 @@ def __init__(self, discrete_instrument=False, discrete_treatment=False, categories='auto', n_splits=2, random_state=None): - class ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final, featurizer, fit_cate_intercept): - self._model_final = clone(model_final, safe=False) - self._fit_cate_intercept = fit_cate_intercept - self._original_featurizer = clone(featurizer, safe=False) - if self._fit_cate_intercept: - add_intercept = FunctionTransformer(lambda F: - hstack([np.ones((F.shape[0], 1)), F]), - validate=True) - if featurizer: - self._featurizer = Pipeline([('featurize', self._original_featurizer), - ('add_intercept', add_intercept)]) - else: - self._featurizer = add_intercept - else: - self._featurizer = self._original_featurizer - - @staticmethod - def _effect_estimate(nuisances): - prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] - - # Estimate final model of theta(X) by minimizing the square loss: - # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 - # We clip the covariance so that it is bounded away from zero, so as to reduce variance - # at the expense of some small bias. For points with very small covariance we revert - # to the model-based preliminary estimate and do not add the correction term. 
- cov_sign = np.sign(cov) - cov_sign[cov_sign == 0] = 1 - clipped_cov = cov_sign * np.clip(np.abs(cov), - self.cov_clip, np.inf) - return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - self.d_y = Y.shape[1:] - self.d_t = nuisances[1].shape[1:] - self.d_z = nuisances[3].shape[1:] - - # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, - # instruments, and outcomes - if self.d_y and self.d_y[0] > 2: - raise AttributeError("DRIV only supports a single outcome") - - if self.d_t and self.d_t[0] > 1: - if discrete_treatment: - raise AttributeError("DRIV only supports binary treatments") - else: - raise AttributeError("DRIV only supports single-dimensional continuous treatments") - - if self.d_z and self.d_z[0] > 1: - if discrete_instrument: - raise AttributeError("DRIV only supports binary instruments") - else: - raise AttributeError("DRIV only supports single-dimensional continuous instruments") - - theta_dr, clipped_cov = self._effect_estimate(nuisances) - - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.fit_transform(X) - # TODO: how do we incorporate the sample_weight and sample_var passed into this method - # as arguments? - if opt_reweighted: - self._model_final.fit(X, theta_dr, sample_weight=clipped_cov.ravel()**2) - else: - self._model_final.fit(X, theta_dr) - return self - - def predict(self, X=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - # TODO: is there a good way to incorporate the other nuisance terms in the score? - _, T_res, Y_res, _, _ = nuisances - - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred) ** 2) self.fit_cate_intercept = fit_cate_intercept self.bias_part_of_coef = fit_cate_intercept self.cov_clip = cov_clip self.opt_reweighted = opt_reweighted - super().__init__(nuisance_models, ModelFinal(model_final, featurizer, fit_cate_intercept), + super().__init__(nuisance_models, _BaseDRIVModelFinal(model_final, + featurizer, + discrete_treatment, + discrete_instrument, + fit_cate_intercept, + cov_clip, + opt_reweighted), discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, random_state=random_state) @@ -1101,6 +1129,59 @@ def cate_feature_names(self, input_feature_names=None): raise AttributeError("Featurizer does not have a method: get_feature_names!") +class _IntentToTreatDRIVModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. 
+ """ + + def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + self._prel_model_effect = clone(prel_model_effect, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert W is None, "IntentToTreatDRIV does not accept controls" + self._model_Y_X.fit(X, Y, sample_weight=sample_weight) + self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + self._prel_model_effect.fit(Y, inverse_onehot(T), inverse_onehot(Z), X=X) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(X, Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(X, Z, T, sample_weight=sample_weight) + else: + T_XZ_score = None + if hasattr(self._prel_model_effect, 'score'): + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + effect_score = self._prel_model_effect.score(Y, inverse_onehot(T), inverse_onehot(Z), X=X) + else: + effect_score = None + + return Y_X_score, T_XZ_score, effect_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(X) + T_pred_zero = self._model_T_XZ.predict(X, np.zeros(Z.shape)) + T_pred_one = self._model_T_XZ.predict(X, np.ones(Z.shape)) + delta = (T_pred_one - T_pred_zero) / 2 + T_pred_mean = (T_pred_one + T_pred_zero) / 2 + prel_theta = self._prel_model_effect.effect(X) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred_mean.reshape(T.shape) + return prel_theta, T_res, Y_res, 2 * Z - 1, delta + + class _IntentToTreatDRIV(_BaseDRIV): """ Helper class for the DRIV algorithm for the intent-to-treat A/B test setting @@ -1118,59 +1199,9 @@ def __init__(self, model_Y_X, model_T_XZ, """ """ - class ModelNuisance: - """ - Nuisance model fits the three models at fit time and at predict time - returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. 
- """ - - def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - self._prel_model_effect = clone(prel_model_effect, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert W is None, "IntentToTreatDRIV does not accept controls" - self._model_Y_X.fit(X, Y, sample_weight=sample_weight) - self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) - # we need to undo the one-hot encoding for calling effect, - # since it expects raw values - self._prel_model_effect.fit(Y, inverse_onehot(T), inverse_onehot(Z), X=X) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - if hasattr(self._model_Y_X, 'score'): - Y_X_score = self._model_Y_X.score(X, Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_XZ, 'score'): - T_XZ_score = self._model_T_XZ.score(X, Z, T, sample_weight=sample_weight) - else: - T_XZ_score = None - if hasattr(self._prel_model_effect, 'score'): - # we need to undo the one-hot encoding for calling effect, - # since it expects raw values - effect_score = self._prel_model_effect.score(Y, inverse_onehot(T), inverse_onehot(Z), X=X) - else: - effect_score = None - - return Y_X_score, T_XZ_score, effect_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(X) - T_pred_zero = self._model_T_XZ.predict(X, np.zeros(Z.shape)) - T_pred_one = self._model_T_XZ.predict(X, np.ones(Z.shape)) - delta = (T_pred_one - T_pred_zero) / 2 - T_pred_mean = (T_pred_one + T_pred_zero) / 2 - prel_theta = self._prel_model_effect.effect(X) - if X is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred_mean.reshape(T.shape) - return prel_theta, T_res, Y_res, 2 * Z - 1, delta # TODO: check that Y, T, Z do not have multiple columns - super().__init__(ModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), model_effect, + super().__init__(_IntentToTreatDRIVModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), + model_effect, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index fc1587238..8db38bf9d 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -3,6 +3,7 @@ import unittest import pytest +import pickle from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures @@ -129,6 +130,9 @@ def make_random(is_discrete, d): if not(multi) and d_y > 1: continue + # ensure we can serialize the unfit estimator + pickle.dumps(est) + for inf in infs: with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t, is_discrete=is_discrete, est=est, inf=inf): @@ -139,6 +143,10 @@ def make_random(is_discrete, d): continue est.fit(Y, T, X, W, inference=inf) + + # ensure we can pickle the fit estimator + pickle.dumps(est) + # make sure we can call the marginal_effect and effect methods const_marg_eff = est.const_marginal_effect(X) marg_eff = est.marginal_effect(T, X) diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index 89f6f2517..ce391849a 100644 --- a/econml/tests/test_drlearner.py +++ 
@@ -4,6 +4,7 @@
 import numpy as np
 import unittest
 import pytest
+import pickle
 from sklearn.base import TransformerMixin
 from numpy.random import normal, multivariate_normal, binomial
 from sklearn.exceptions import DataConversionWarning
@@ -109,6 +110,9 @@ def make_random(is_discrete, d):
                                model_final=StatsModelsLinearRegression(),
                                multitask_model_final=True)]:

+                # ensure that we can serialize unfit estimator
+                pickle.dumps(est)
+
                 # TODO: add stratification to bootstrap so that we can use it even with discrete treatments
                 infs = [None]
                 if isinstance(est, LinearDRLearner):
@@ -118,6 +122,10 @@ def make_random(is_discrete, d):
                     with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t,
                                       is_discrete=is_discrete, est=est, inf=inf):
                         est.fit(Y, T, X, W, inference=inf)
+
+                        # ensure that we can serialize fit estimator
+                        pickle.dumps(est)
+
                         # make sure we can call the marginal_effect and effect methods
                         const_marg_eff = est.const_marginal_effect(
                             X)
diff --git a/econml/tests/test_ortho_iv.py b/econml/tests/test_ortho_iv.py
index 6aa11e7c3..5c2ab5fbf 100644
--- a/econml/tests/test_ortho_iv.py
+++ b/econml/tests/test_ortho_iv.py
@@ -3,6 +3,7 @@
 import unittest
 import pytest
+import pickle
 from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures
@@ -133,6 +134,9 @@ def const_marg_eff_shape(n, d_x, d_y, d_t_final):
                 if not(multi) and d_y > 1 or d_t > 1 or d_z > 1:
                     continue

+                # ensure we can serialize unfit estimator
+                pickle.dumps(est)
+
                 for inf in infs:
                     with self.subTest(d_z=d_z, d_x=d_q, d_y=d_y, d_t=d_t,
                                       discrete_t=discrete_t, discrete_z=discrete_z,
@@ -174,6 +178,10 @@ def score():
                         const_marginal_effect_shape = const_marg_eff_shape(n, d_x, d_y, d_t_final)

                         fit()
+
+                        # ensure we can serialize fit estimator
+                        pickle.dumps(est)
+
                         # make sure we can call the marginal_effect and effect methods
                         const_marg_eff = est.const_marginal_effect(X)
                         marg_eff = est.marginal_effect(T, X)
diff --git a/econml/utilities.py b/econml/utilities.py
index f583a0b21..4343d7e90 100644
--- a/econml/utilities.py
+++ b/econml/utilities.py
@@ -411,6 +411,23 @@ def t(X):
     return _apply(t, X)


+def add_intercept(X):
+    """
+    Adds an intercept feature to an array by prepending a column of ones.
+
+    Parameters
+    ----------
+    X : array-like
+        Input array. Must be 2D.
+
+    Returns
+    -------
+    arr : ndarray
+        `X` with a column of ones prepended
+    """
+    return hstack([np.ones((X.shape[0], 1)), X])
+
+
 def reshape_Y_T(Y, T):
     """
     Reshapes Y and T when Y.ndim = 2 and/or T.ndim = 1.
@@ -1371,3 +1388,24 @@ def predict(self, XZ):
     @property
     def coef_(self):
         return np.concatenate((model.coef_ for model in self.models))
+
+
+class _EncoderWrapper:
+    """
+    Wraps a OneHotEncoder (and optionally also a LabelEncoder).
+
+    Useful mainly so that the `encode` method can be used in a FunctionTransformer,
+    which would otherwise need a lambda (which can't be pickled).
+    """
+
+    def __init__(self, one_hot_encoder, label_encoder=None, drop_first=False):
+        self._label_encoder = label_encoder
+        self._one_hot_encoder = one_hot_encoder
+        self._drop_first = drop_first
+
+    def encode(self, arr):
+        if self._label_encoder:
+            arr = self._label_encoder.transform(arr.ravel())
+
+        result = self._one_hot_encoder.transform(reshape(arr, (-1, 1)))
+        return result[:, 1:] if self._drop_first else result
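
Note on the covariance clipping in the removed inline ModelFinal (and, presumably unchanged, in the module-level _BaseDRIVModelFinal this patch introduces): the clip is sign-preserving, bounding each covariance's magnitude away from zero at the cost of a small bias, and treating exact zeros as positive. A standalone sketch of just that step; the function name clip_away_from_zero is hypothetical, for illustration only:

import numpy as np

def clip_away_from_zero(cov, cov_clip):
    # keep the sign of each entry, but bound its magnitude below by cov_clip;
    # entries that are exactly zero are mapped to +cov_clip
    cov_sign = np.sign(cov)
    cov_sign[cov_sign == 0] = 1
    return cov_sign * np.clip(np.abs(cov), cov_clip, np.inf)

print(clip_away_from_zero(np.array([-0.3, -1e-6, 0.0, 2.0]), 0.1))
# [-0.3 -0.1  0.1  2. ]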
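
The serialization fixes in this patch share one technique: pickle cannot serialize lambdas or locally-defined functions (it stores functions by qualified name), but it can serialize bound methods of module-level classes, which is why lambdas handed to FunctionTransformer are replaced by methods of small wrapper classes such as _EncoderWrapper. A minimal sketch of the difference, with _Doubler as a hypothetical stand-in for such a wrapper:

import pickle
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# a lambda-based transformer fails to pickle
ft_lambda = FunctionTransformer(func=lambda arr: arr * 2, validate=False)
try:
    pickle.dumps(ft_lambda)
except (pickle.PicklingError, AttributeError) as e:
    print("not picklable:", e)

class _Doubler:
    def encode(self, arr):
        return arr * 2

# a bound-method-based transformer round-trips fine
ft_method = FunctionTransformer(func=_Doubler().encode, validate=False)
restored = pickle.loads(pickle.dumps(ft_method))
assert np.array_equal(restored.transform(np.arange(3)), np.arange(3) * 2)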
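
The new tests check serialization both before and after fitting. A sketch of the same round trip outside the test harness: the fit(Y, T, X, W) call pattern is taken from the tests above, but the estimator name LinearDMLCateEstimator, its constructor arguments, and the synthetic data are assumptions about the library at this revision rather than part of the patch:

import pickle
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from econml.dml import LinearDMLCateEstimator  # assumed import for this era of the codebase

np.random.seed(123)
X = np.random.normal(size=(500, 2))
W = np.random.normal(size=(500, 2))
T = np.random.binomial(1, 0.5, size=(500,))
Y = T * X[:, 0] + np.random.normal(scale=0.1, size=500)

est = LinearDMLCateEstimator(model_y=LinearRegression(), model_t=LogisticRegression(),
                             discrete_treatment=True)
pickle.dumps(est)  # the unfit estimator serializes

est.fit(Y, T, X, W)
est2 = pickle.loads(pickle.dumps(est))  # and so does the fit one
assert np.allclose(est.effect(X), est2.effect(X))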