Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace all occurrences of Pandas' get_dummies() with sklearn's OneHotEncoder #1135

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions dowhy/causal_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import dowhy.interpreters as interpreters
from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
from dowhy.utils.api import parse_state
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -112,6 +113,35 @@ def __init__(
self._bootstrap_estimates = None
self._bootstrap_null_estimates = None

self._encoders = Encoders()

def reset_encoders(self):
    """Discard all data encoders so fresh ones are created on the next `fit()`.

    Consistent encoding matters: categorical variables are one-hot encoded, and
    the mapping from original values to encoded columns must be identical between
    model fitting and inference time, otherwise models produce inconsistent
    output. Encoders are reset whenever `fit()` is called again, since the data
    is assumed to have changed. A separate encoder is maintained for each subset
    of variables (treatment, common causes and effect modifiers).
    """
    self._encoders.reset()

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.
:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
return self._encoders.encode(data, encoder_name)

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
Expand All @@ -124,7 +154,7 @@ def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optio
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
Expand Down Expand Up @@ -234,7 +264,10 @@ def _estimate_conditional_effects(
effect_modifier_names[i] = prefix + str(em)
# Grouping by effect modifiers and computing effect separately
by_effect_mods = data.groupby(effect_modifier_names)
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)

def cond_est_fn(x):
return self._do(self._treatment_value, x) - self._do(self._control_value, x)

conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
# Deleting the temporary categorical columns
for em in effect_modifier_names:
Expand Down
5 changes: 3 additions & 2 deletions dowhy/causal_estimators/causalml.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check the backdoor variables being used
Expand All @@ -127,7 +128,7 @@ def fit(
# Get the data of the unobserved confounders
self._observed_common_causes = data[self._observed_common_causes_names]
# One hot encode the data if they are categorical
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = []

Expand All @@ -138,7 +139,7 @@ def fit(
self._instrumental_variable_names = self._target_estimand.instrumental_variables
if self._instrumental_variable_names:
self._instrumental_variables = data[self._instrumental_variable_names]
self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True)
self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables")
else:
self._instrumental_variables = []

Expand Down
3 changes: 2 additions & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def fit(
"""
self.exact_match_cols = exact_match_cols

self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check if the treatment is one-dimensional
Expand All @@ -146,7 +147,7 @@ def fit(
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
Expand Down
9 changes: 5 additions & 4 deletions dowhy/causal_estimators/econml.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)
# Save parameters for later refutter fitting
self._econml_fit_params = kwargs
Expand Down Expand Up @@ -148,12 +149,12 @@ def fit(
# Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
# the latter can be used by other estimator methods later
self._effect_modifiers = data[effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self._effect_modifier_names = effect_modifier_names
self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
if self._observed_common_causes_names:
self._observed_common_causes = data[self._observed_common_causes_names]
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
Expand All @@ -165,7 +166,7 @@ def fit(
self.estimating_instrument_names = parse_state(self.iv_instrument_name)
if self.estimating_instrument_names:
self._estimating_instruments = data[self.estimating_instrument_names]
self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
else:
self._estimating_instruments = None

Expand Down Expand Up @@ -277,7 +278,7 @@ def _estimate_confidence_intervals(self, confidence_level=None, method=None):
"""Returns None if the confidence interval has not been calculated."""
return self.effect_intervals

def _do(self, x):
def _do(self, x, data_df=None):
    """Interventional-outcome hook; not supported by this estimator.

    :param x: Treatment value for the intervention do(T = x).
    :param data_df: Optional data on which to evaluate the intervention.
        # NOTE(review): added for signature compatibility with sibling
        # estimators' `_do(treatment_val, data_df)` — confirm against callers.
    :raises NotImplementedError: always; subclasses/other estimators provide this.
    """
    raise NotImplementedError

def construct_symbolic_estimator(self, estimand):
Expand Down
1 change: 1 addition & 0 deletions dowhy/causal_estimators/instrumental_variable_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.estimating_instrument_names = self._target_estimand.instrumental_variables
Expand Down
4 changes: 3 additions & 1 deletion dowhy/causal_estimators/propensity_score_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
Expand All @@ -103,7 +104,8 @@ def fit(
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")

else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Propensity score based methods are not applicable"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.rd_variable = data[self.rd_variable_name]
Expand Down
81 changes: 6 additions & 75 deletions dowhy/causal_estimators/regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import statsmodels.api as sm

from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
from dowhy.utils.encoding import one_hot_encode


class RegressionEstimator(CausalEstimator):
Expand Down Expand Up @@ -71,53 +70,6 @@ def __init__(

self.model = None

# Data encoders
# encoder_drop_first will not encode the first category value with a bit in 1-hot encoding.
# It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
# Set to False to include a bit for each value of every categorical variable.
self.encoder_drop_first = True
self.reset_encoders()

def reset_encoders(self):
    """Drop all stored data encoders; they are re-created on the next `fit()`.

    Data must be encoded consistently or models will produce inconsistent
    output: categorical variables are one-hot encoded, and the value-to-column
    mapping must match between fitting and inference. Encoders are reset when
    `fit()` is called again, as the data is assumed to have changed. One encoder
    slot exists per variable subset (treatment, common causes, effect modifiers).
    """
    # dict.fromkeys defaults every value to None, i.e. "no encoder yet".
    self._encoders = dict.fromkeys(
        ("treatment", "observed_common_causes", "effect_modifiers")
    )

def _encode(self, data: pd.DataFrame, encoder_name: str):
    """One-hot encode categorical columns of `data` with a reusable named encoder.

    Returns a new dataframe containing all original data and the encoded
    columns; numerical data passes through unchanged. `encoder_name` selects a
    previously stored encoder if one exists, otherwise a new encoder is fitted
    and remembered so subsequent calls encode identically.

    :param data: Data to encode.
    :param encoder_name: The name for the encoder to be used.
    :returns: The encoded data.
    """
    # None tells one_hot_encode to fit a fresh encoder for this variable subset.
    prior_encoder = self._encoders.get(encoder_name)
    encoded_variables, fitted_encoder = one_hot_encode(
        data, drop_first=self.encoder_drop_first, encoder=prior_encoder
    )
    # Keep the (possibly newly fitted) encoder for consistent future encodings.
    self._encoders[encoder_name] = fitted_encoder
    return encoded_variables

def fit(
self,
data: pd.DataFrame,
Expand Down Expand Up @@ -170,7 +122,7 @@ def estimate_effect(
need_conditional_estimates = self.need_conditional_estimates
# TODO make treatment_value and control value also as local parameters
# All treatments are set to the same constant value
effect_estimate = self._do(data, treatment_value) - self._do(data, control_value)
effect_estimate = self._do(treatment_value, data) - self._do(control_value, data)
conditional_effect_estimates = None
if need_conditional_estimates:
conditional_effect_estimates = self._estimate_conditional_effects(
Expand All @@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df):
est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
return est.value

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
drawlinson marked this conversation as resolved.
Show resolved Hide resolved
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
:param effect_modifiers: Variables on which to compute separate
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self._effect_modifiers = effect_modifier_names
if effect_modifier_names is not None:
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
else:
self._effect_modifier_names = []

self.need_conditional_estimates = (
self.need_conditional_estimates
if self.need_conditional_estimates != "auto"
else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
)

def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")

Expand Down Expand Up @@ -295,6 +222,10 @@ def predict(self, data_df):
interventional_outcomes = self.predict_fn(data_df, self.model, new_features)
return interventional_outcomes

def _do(self, data_df: pd.DataFrame, treatment_val):
def _do(
self,
treatment_val,
data_df: pd.DataFrame,
):
interventional_outcomes = self.interventional_outcomes(data_df, treatment_val)
return interventional_outcomes.mean()
7 changes: 5 additions & 2 deletions dowhy/causal_estimators/two_stage_regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

if len(self._target_estimand.treatment_variable) > 1:
Expand Down Expand Up @@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame):
treatment_vals = data_df[self._target_estimand.treatment_variable]
if len(self._observed_common_causes_names) > 0:
observed_common_causes_vals = data_df[self._observed_common_causes_names]
observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")

if self._effect_modifier_names:
effect_modifiers_vals = data_df[self._effect_modifier_names]
effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")

if type(treatment_vals) is not np.ndarray:
treatment_vals = treatment_vals.to_numpy()
if treatment_vals.shape[0] != data_df.shape[0]:
Expand Down
Loading
Loading