Commit

Handle pandas categorical types for categorical columns in _causal_analysis.py (#602)

If a treatment column is explicitly given a pandas categorical dtype, the `CausalAnalysis` class fails:

```
~\AppData\Local\Continuum\miniconda3\envs\nhs-hips\lib\site-packages\econml\solutions\causal_analysis\_causal_analysis.py in individualized_policy(self, Xtest, feature_index, n_rows, treatment_costs, alpha)
   1714                 all_costs = np.array([0] + [treatment_costs] * (len(treatment_arr) - 1))
   1715                 # construct index of current treatment
-> 1716                 current_ind = (current_treatment.reshape(-1, 1) ==
   1717                                treatment_arr.reshape(1, -1)) @ np.arange(len(treatment_arr))
   1718                 current_cost = all_costs[current_ind]

~\AppData\Local\Continuum\miniconda3\envs\nhs-hips\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
     67         other = item_from_zerodim(other)
     68 
---> 69         return method(self, other)
     70 
     71     return new_method

~\AppData\Local\Continuum\miniconda3\envs\nhs-hips\lib\site-packages\pandas\core\arrays\categorical.py in func(self, other)
    131         if is_list_like(other) and len(other) != len(self) and not hashable:
    132             # in hashable case we may have a tuple that is itself a category
--> 133             raise ValueError("Lengths must match.")
    134 
    135         if not self.ordered:
```
The solution is to check whether the column's values are of type `pd.core.arrays.categorical.Categorical` and, if so, extract the underlying numpy array using the `to_numpy()` method.
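
A minimal standalone sketch of the failure and of the fix (variable names mirror the traceback; the data is illustrative, not taken from the library):

```
import numpy as np
import pandas as pd

# .values on a column with categorical dtype yields a pandas Categorical,
# not a numpy ndarray
current_treatment = pd.Series(['a', 'b', 'a', 'c'], dtype='category').values
treatment_arr = np.array(['a', 'b', 'c'])

try:
    # comparing a Categorical against a list-like of a different length
    # (4 vs. 3 here) raises instead of broadcasting
    current_treatment == treatment_arr
except ValueError as e:
    print(e)  # Lengths must match.

# the fix: convert to a plain numpy array so broadcasting applies, then
# construct the index of each row's current treatment as in the source
current_treatment = current_treatment.to_numpy()
current_ind = (current_treatment.reshape(-1, 1) ==
               treatment_arr.reshape(1, -1)) @ np.arange(len(treatment_arr))
print(current_ind)  # [0 1 0 2]
```
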
gaugup authored Jun 13, 2022
1 parent 5cf6920 commit ac12f54
Showing 2 changed files with 122 additions and 104 deletions.
econml/solutions/causal_analysis/_causal_analysis.py (2 additions, 0 deletions)
@@ -1701,6 +1701,8 @@ def individualized_policy(self, Xtest, feature_index, *, n_rows=None, treatment_
effect = result.estimator.effect_inference(Xtest, T0=orig_df['Current treatment'], T1=rec)
# we now need to construct the delta in the cost between the two treatments and translate the effect
current_treatment = orig_df['Current treatment'].values
if isinstance(current_treatment, pd.core.arrays.categorical.Categorical):
current_treatment = current_treatment.to_numpy()
if np.ndim(treatment_costs) >= 2:
# remove third dimenions potentially added
if multi_y: # y was an array, not a vector
econml/tests/test_causal_analysis.py (120 additions, 104 deletions)
@@ -85,6 +85,8 @@ def test_basic_array(self):
# policy value should exceed always treating with any treatment
assert_less_close(np.array(list(always_trt.values())), policy_val)

ind_pol = ca.individualized_policy(X, inds[idx])

# global shape is (d_y, sum(d_t))
assert glo_point_est.shape == coh_point_est.shape == (1, 5)
assert loc_point_est.shape == (2,) + glo_point_est.shape
@@ -128,113 +130,121 @@ def test_basic_array(self):

def test_basic_pandas(self):
for classification in [False, True]:
y = pd.Series(np.random.choice([0, 1], size=(500,)))
X = pd.DataFrame({'a': np.random.normal(size=500),
'b': np.random.normal(size=500),
'c': np.random.choice([0, 1], size=500),
'd': np.random.choice(['a', 'b', 'c'], size=500)})
n_inds = [0, 1, 2, 3]
t_inds = ['a', 'b', 'c', 'd']
n_cats = [2, 3]
t_cats = ['c', 'd']
n_hinds = [0, 3]
t_hinds = ['a', 'd']
for (inds, cats, hinds) in [(n_inds, n_cats, n_hinds), (t_inds, t_cats, t_hinds)]:
ca = CausalAnalysis(inds, cats, hinds, classification=classification)
ca.fit(X, y)
glo = ca.global_causal_effect()
coh = ca.cohort_causal_effect(X[:2])
loc = ca.local_causal_effect(X[:2])

# global and cohort data should have exactly the same structure, but different values
assert glo.index.equals(coh.index)

# local index should have as many times entries as global as there were rows passed in
assert len(loc.index) == 2 * len(glo.index)

assert glo.index.names == ['feature', 'feature_value']
assert loc.index.names == ['sample'] + glo.index.names

# features; for categoricals they should appear #cats-1 times each
fts = ['a', 'b', 'c', 'd', 'd']

for i in range(len(fts)):
assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

glo_dict = ca._global_causal_effect_dict()
glo_dict2 = ca._global_causal_effect_dict(row_wise=True)

coh_dict = ca._cohort_causal_effect_dict(X[:2])
coh_dict2 = ca._cohort_causal_effect_dict(X[:2], row_wise=True)

loc_dict = ca._local_causal_effect_dict(X[:2])
loc_dict2 = ca._local_causal_effect_dict(X[:2], row_wise=True)

glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

# global shape is (d_y, sum(d_t))
assert glo_point_est.shape == coh_point_est.shape == (1, 5)
assert loc_point_est.shape == (2,) + glo_point_est.shape

# global and cohort row-wise dicts have d_y * d_t entries
assert len(
glo_dict2[_CausalInsightsConstants.RowData]) == len(
coh_dict2[_CausalInsightsConstants.RowData]) == 5
# local dictionary is flattened to n_rows * d_y * d_t
assert len(loc_dict2[_CausalInsightsConstants.RowData]) == 10

pto = ca._policy_tree_output(X, inds[1])
ca._heterogeneity_tree_output(X, inds[1])
ca._heterogeneity_tree_output(X, inds[3])

# continuous treatments have typical treatment values equal to
# the mean of the absolute value of non-zero entries
np.testing.assert_allclose(ca.typical_treatment_value(inds[0]), np.mean(np.abs(X['a'])))
np.testing.assert_allclose(ca.typical_treatment_value(inds[1]), np.mean(np.abs(X['b'])))
# discrete treatments have typical treatment value 1
assert ca.typical_treatment_value(inds[2]) == ca.typical_treatment_value(inds[3]) == 1

# Make sure we handle continuous, binary, and multi-class treatments
# For multiple discrete treatments, one "always treat" value per non-default treatment
for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
pto = ca._policy_tree_output(X, inds[idx])
policy_val = pto.policy_value
always_trt = pto.always_treat
assert isinstance(pto.control_name, str)
assert isinstance(always_trt, dict)
assert np.array(policy_val).shape == ()
assert len(always_trt) == length
for val in always_trt.values():
assert np.array(val).shape == ()

# policy value should exceed always treating with any treatment
assert_less_close(np.array(list(always_trt.values())), policy_val)

if not classification:
# ExitStack can be used as a "do nothing" ContextManager
cm = ExitStack()
else:
cm = self.assertRaises(Exception)
with cm:
inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
assert np.shape(inf.point_estimate) == np.shape(y[:2])
inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[2], y[:2])
assert np.shape(inf.point_estimate) == np.shape(y[:2])
for category in [False, True]:
y = pd.Series(np.random.choice([0, 1], size=(500,)))
X = pd.DataFrame({'a': np.random.normal(size=500),
'b': np.random.normal(size=500),
'c': np.random.choice([0, 1], size=500),
'd': np.random.choice(['a', 'b', 'c'], size=500)})

if category:
X['c'] = X['c'].astype('category')
X['d'] = X['d'].astype('category')

n_inds = [0, 1, 2, 3]
t_inds = ['a', 'b', 'c', 'd']
n_cats = [2, 3]
t_cats = ['c', 'd']
n_hinds = [0, 3]
t_hinds = ['a', 'd']
for (inds, cats, hinds) in [(n_inds, n_cats, n_hinds), (t_inds, t_cats, t_hinds)]:
ca = CausalAnalysis(inds, cats, hinds, classification=classification)
ca.fit(X, y)
glo = ca.global_causal_effect()
coh = ca.cohort_causal_effect(X[:2])
loc = ca.local_causal_effect(X[:2])

# global and cohort data should have exactly the same structure, but different values
assert glo.index.equals(coh.index)

# local index should have as many times entries as global as there were rows passed in
assert len(loc.index) == 2 * len(glo.index)

assert glo.index.names == ['feature', 'feature_value']
assert loc.index.names == ['sample'] + glo.index.names

# features; for categoricals they should appear #cats-1 times each
fts = ['a', 'b', 'c', 'd', 'd']

for i in range(len(fts)):
assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]

glo_dict = ca._global_causal_effect_dict()
glo_dict2 = ca._global_causal_effect_dict(row_wise=True)

coh_dict = ca._cohort_causal_effect_dict(X[:2])
coh_dict2 = ca._cohort_causal_effect_dict(X[:2], row_wise=True)

loc_dict = ca._local_causal_effect_dict(X[:2])
loc_dict2 = ca._local_causal_effect_dict(X[:2], row_wise=True)

glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])

# global shape is (d_y, sum(d_t))
assert glo_point_est.shape == coh_point_est.shape == (1, 5)
assert loc_point_est.shape == (2,) + glo_point_est.shape

# global and cohort row-wise dicts have d_y * d_t entries
assert len(
glo_dict2[_CausalInsightsConstants.RowData]) == len(
coh_dict2[_CausalInsightsConstants.RowData]) == 5
# local dictionary is flattened to n_rows * d_y * d_t
assert len(loc_dict2[_CausalInsightsConstants.RowData]) == 10

pto = ca._policy_tree_output(X, inds[1])
ca._heterogeneity_tree_output(X, inds[1])
ca._heterogeneity_tree_output(X, inds[3])

# continuous treatments have typical treatment values equal to
# the mean of the absolute value of non-zero entries
np.testing.assert_allclose(ca.typical_treatment_value(inds[0]), np.mean(np.abs(X['a'])))
np.testing.assert_allclose(ca.typical_treatment_value(inds[1]), np.mean(np.abs(X['b'])))
# discrete treatments have typical treatment value 1
assert ca.typical_treatment_value(inds[2]) == ca.typical_treatment_value(inds[3]) == 1

# Make sure we handle continuous, binary, and multi-class treatments
# For multiple discrete treatments, one "always treat" value per non-default treatment
for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
pto = ca._policy_tree_output(X, inds[idx])
policy_val = pto.policy_value
always_trt = pto.always_treat
assert isinstance(pto.control_name, str)
assert isinstance(always_trt, dict)
assert np.array(policy_val).shape == ()
assert len(always_trt) == length
for val in always_trt.values():
assert np.array(val).shape == ()

# policy value should exceed always treating with any treatment
assert_less_close(np.array(list(always_trt.values())), policy_val)

ind_pol = ca.individualized_policy(X, inds[idx])

if not classification:
# ExitStack can be used as a "do nothing" ContextManager
cm = ExitStack()
else:
cm = self.assertRaises(Exception)
with cm:
inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
assert np.shape(inf.point_estimate) == np.shape(y[:2])
inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[2], y[:2])
assert np.shape(inf.point_estimate) == np.shape(y[:2])

ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2], row_wise=True)
ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2], row_wise=True)

badargs = [
(n_inds, n_cats, [4]), # hinds out of range
(n_inds, n_cats, ["test"]) # hinds out of range
]
badargs = [
(n_inds, n_cats, [4]), # hinds out of range
(n_inds, n_cats, ["test"]) # hinds out of range
]

for args in badargs:
with self.assertRaises(Exception):
ca = CausalAnalysis(*args)
ca.fit(X, y)
for args in badargs:
with self.assertRaises(Exception):
ca = CausalAnalysis(*args)
ca.fit(X, y)

def test_automl_first_stage(self):
d_y = (1,)
@@ -294,6 +304,8 @@ def test_automl_first_stage(self):
# policy value should exceed always treating with any treatment
assert_less_close(np.array(list(always_trt.values())), policy_val)

ind_pol = ca.individualized_policy(X, inds[idx])

# global shape is (d_y, sum(d_t))
assert glo_point_est.shape == coh_point_est.shape == (1, 5)
assert loc_point_est.shape == (2,) + glo_point_est.shape
@@ -436,6 +448,8 @@ def test_final_models(self):
# policy value should exceed always treating with any treatment
assert_less_close(np.array(list(always_trt.values())), policy_val)

ind_pol = ca.individualized_policy(X, inds[idx])

if not classification:
# ExitStack can be used as a "do nothing" ContextManager
cm = ExitStack()
@@ -526,6 +540,8 @@ def test_forest_with_pandas(self):
# policy value should exceed always treating with any treatment
assert_less_close(np.array(list(always_trt.values())), policy_val)

ind_pol = ca.individualized_policy(X, inds[idx])

def test_warm_start(self):
for classification in [True, False]:
# dgp
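
With the fix in place, `individualized_policy` also handles treatment columns that carry a pandas categorical dtype. A usage sketch mirroring the updated test (random illustrative data; the import path assumes econml's public `CausalAnalysis` entry point):

```
import numpy as np
import pandas as pd
from econml.solutions.causal_analysis import CausalAnalysis

y = pd.Series(np.random.choice([0, 1], size=(500,)))
X = pd.DataFrame({'a': np.random.normal(size=500),
                  'b': np.random.normal(size=500),
                  'c': np.random.choice([0, 1], size=500),
                  'd': np.random.choice(['a', 'b', 'c'], size=500)})
# explicitly categorical columns previously triggered "Lengths must match."
X['c'] = X['c'].astype('category')
X['d'] = X['d'].astype('category')

ca = CausalAnalysis(['a', 'b', 'c', 'd'], ['c', 'd'], ['a', 'd'], classification=False)
ca.fit(X, y)
ind_pol = ca.individualized_policy(X, 'd')
```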
