Skip to content

Commit

Permalink
Replace all occurrences of Pandas' get_dummies() with skLearn One…
Browse files Browse the repository at this point in the history
…HotEncoder (#1135)

* For consistency and to avoid future issues, replace all occurrences of Pandas' get_dummies() with skLearn's OneHotEncoder. Encoder lifespan: encoders are reused for new estimate_effect() calls, and existing encoders are replaced on CausalEstimator.fit(). Additional uses of get_dummies() in the do-Sampler Propensity Scores utilities, which had no side effects or encoding-consistency issues, are also replaced for consistency.

Signed-off-by: DAVID RAWLINSON <dave@causalwizard.app>

* Add categorical encoding consistency tests for CausalEstimators. Fix bug in arg order for RegressionEstimator._do().

Signed-off-by: DAVID RAWLINSON <dave@causalwizard.app>

---------

Signed-off-by: DAVID RAWLINSON <dave@causalwizard.app>
Co-authored-by: DAVID RAWLINSON <dave@causalwizard.app>
  • Loading branch information
drawlinson and DAVID RAWLINSON authored Mar 26, 2024
1 parent dfbbbca commit 65f3031
Show file tree
Hide file tree
Showing 14 changed files with 409 additions and 113 deletions.
37 changes: 35 additions & 2 deletions dowhy/causal_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import dowhy.interpreters as interpreters
from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
from dowhy.utils.api import parse_state
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -112,6 +113,35 @@ def __init__(
self._bootstrap_estimates = None
self._bootstrap_null_estimates = None

self._encoders = Encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.
It's important that data is consistently encoded otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.
Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders.reset()

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.
:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
return self._encoders.encode(data, encoder_name)

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates according to the effect modifiers' value
Expand All @@ -124,7 +154,7 @@ def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optio
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
Expand Down Expand Up @@ -234,7 +264,10 @@ def _estimate_conditional_effects(
effect_modifier_names[i] = prefix + str(em)
# Grouping by effect modifiers and computing effect separately
by_effect_mods = data.groupby(effect_modifier_names)
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)

def cond_est_fn(x):
return self._do(self._treatment_value, x) - self._do(self._control_value, x)

conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
# Deleting the temporary categorical columns
for em in effect_modifier_names:
Expand Down
5 changes: 3 additions & 2 deletions dowhy/causal_estimators/causalml.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check the backdoor variables being used
Expand All @@ -127,7 +128,7 @@ def fit(
# Get the data of the observed common causes
self._observed_common_causes = data[self._observed_common_causes_names]
# One hot encode the data if they are categorical
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = []

Expand All @@ -138,7 +139,7 @@ def fit(
self._instrumental_variable_names = self._target_estimand.instrumental_variables
if self._instrumental_variable_names:
self._instrumental_variables = data[self._instrumental_variable_names]
self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True)
self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables")
else:
self._instrumental_variables = []

Expand Down
3 changes: 2 additions & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def fit(
"""
self.exact_match_cols = exact_match_cols

self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check if the treatment is one-dimensional
Expand All @@ -146,7 +147,7 @@ def fit(
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
Expand Down
9 changes: 5 additions & 4 deletions dowhy/causal_estimators/econml.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)
# Save parameters for later refuter fitting
self._econml_fit_params = kwargs
Expand Down Expand Up @@ -148,12 +149,12 @@ def fit(
# Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
# the latter can be used by other estimator methods later
self._effect_modifiers = data[effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self._effect_modifier_names = effect_modifier_names
self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
if self._observed_common_causes_names:
self._observed_common_causes = data[self._observed_common_causes_names]
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
Expand All @@ -165,7 +166,7 @@ def fit(
self.estimating_instrument_names = parse_state(self.iv_instrument_name)
if self.estimating_instrument_names:
self._estimating_instruments = data[self.estimating_instrument_names]
self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
else:
self._estimating_instruments = None

Expand Down Expand Up @@ -277,7 +278,7 @@ def _estimate_confidence_intervals(self, confidence_level=None, method=None):
"""Returns None if the confidence interval has not been calculated."""
return self.effect_intervals

def _do(self, x):
def _do(self, x, data_df=None):
raise NotImplementedError

def construct_symbolic_estimator(self, estimand):
Expand Down
1 change: 1 addition & 0 deletions dowhy/causal_estimators/instrumental_variable_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.estimating_instrument_names = self._target_estimand.instrumental_variables
Expand Down
4 changes: 3 additions & 1 deletion dowhy/causal_estimators/propensity_score_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
Expand All @@ -103,7 +104,8 @@ def fit(
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")

else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Propensity score based methods are not applicable"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.rd_variable = data[self.rd_variable_name]
Expand Down
81 changes: 6 additions & 75 deletions dowhy/causal_estimators/regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import statsmodels.api as sm

from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
from dowhy.utils.encoding import one_hot_encode


class RegressionEstimator(CausalEstimator):
Expand Down Expand Up @@ -71,53 +70,6 @@ def __init__(

self.model = None

# Data encoders
# encoder_drop_first will not encode the first category value with a bit in 1-hot encoding.
# It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
# Set to False to include a bit for each value of every categorical variable.
self.encoder_drop_first = True
self.reset_encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.
It's important that data is consistently encoded otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.
Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders = {
"treatment": None,
"observed_common_causes": None,
"effect_modifiers": None,
}

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.
:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
existing_encoder = self._encoders.get(encoder_name)
encoded_variables, encoder = one_hot_encode(
data,
drop_first=self.encoder_drop_first,
encoder=existing_encoder,
)

# Remember encoder
self._encoders[encoder_name] = encoder
return encoded_variables

def fit(
self,
data: pd.DataFrame,
Expand Down Expand Up @@ -170,7 +122,7 @@ def estimate_effect(
need_conditional_estimates = self.need_conditional_estimates
# TODO make treatment_value and control value also as local parameters
# All treatments are set to the same constant value
effect_estimate = self._do(data, treatment_value) - self._do(data, control_value)
effect_estimate = self._do(treatment_value, data) - self._do(control_value, data)
conditional_effect_estimates = None
if need_conditional_estimates:
conditional_effect_estimates = self._estimate_conditional_effects(
Expand All @@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df):
est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
return est.value

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
:param effect_modifiers: Variables on which to compute separate
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self._effect_modifiers = effect_modifier_names
if effect_modifier_names is not None:
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
else:
self._effect_modifier_names = []

self.need_conditional_estimates = (
self.need_conditional_estimates
if self.need_conditional_estimates != "auto"
else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
)

def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")

Expand Down Expand Up @@ -295,6 +222,10 @@ def predict(self, data_df):
interventional_outcomes = self.predict_fn(data_df, self.model, new_features)
return interventional_outcomes

def _do(self, data_df: pd.DataFrame, treatment_val):
def _do(
self,
treatment_val,
data_df: pd.DataFrame,
):
interventional_outcomes = self.interventional_outcomes(data_df, treatment_val)
return interventional_outcomes.mean()
7 changes: 5 additions & 2 deletions dowhy/causal_estimators/two_stage_regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

if len(self._target_estimand.treatment_variable) > 1:
Expand Down Expand Up @@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame):
treatment_vals = data_df[self._target_estimand.treatment_variable]
if len(self._observed_common_causes_names) > 0:
observed_common_causes_vals = data_df[self._observed_common_causes_names]
observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")

if self._effect_modifier_names:
effect_modifiers_vals = data_df[self._effect_modifier_names]
effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")

if type(treatment_vals) is not np.ndarray:
treatment_vals = treatment_vals.to_numpy()
if treatment_vals.shape[0] != data_df.shape[0]:
Expand Down
Loading

0 comments on commit 65f3031

Please sign in to comment.