Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CATE uplift validation methods #836

Merged
merged 9 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 31 additions & 16 deletions econml/tests/test_drtester.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,15 @@ def test_multi(self):
res = my_dr_tester.evaluate_all(Xval, Xtrain)
res_df = res.summary()

for k in range(3):
if k == 0:
with self.assertRaises(Exception) as exc:
res.plot_cal(k)
self.assertTrue(str(exc.exception) == 'Plotting only supported for treated units (not controls)')
else:
for k in range(4):
if k in [0, 3]:
self.assertRaises(ValueError, res.plot_cal, k)
self.assertRaises(ValueError, res.plot_qini, k)
self.assertRaises(ValueError, res.plot_toc, k)
else: # real treatments, k = 1 or 2
self.assertTrue(res.plot_cal(k) is not None)
self.assertTrue(res.plot_qini(k) is not None)
self.assertTrue(res.plot_toc(k) is not None)

self.assertGreater(res_df.blp_pval.values[0], 0.1) # no heterogeneity
self.assertLess(res_df.blp_pval.values[1], 0.05) # heterogeneity
Expand All @@ -103,6 +105,7 @@ def test_multi(self):
self.assertGreater(res_df.cal_r_squared.values[1], 0) # good R2

self.assertLess(res_df.qini_pval.values[1], res_df.qini_pval.values[0])
self.assertLess(res_df.autoc_pval.values[1], res_df.autoc_pval.values[0])

def test_binary(self):
Xtrain, Dtrain, Ytrain, Xval, Dval, Yval = self._get_data(num_treatments=1)
Expand Down Expand Up @@ -136,17 +139,20 @@ def test_binary(self):
res = my_dr_tester.evaluate_all(Xval, Xtrain)
res_df = res.summary()

for k in range(2):
if k == 0:
with self.assertRaises(Exception) as exc:
res.plot_cal(k)
self.assertTrue(str(exc.exception) == 'Plotting only supported for treated units (not controls)')
else:
for k in range(3):
if k in [0, 2]:
self.assertRaises(ValueError, res.plot_cal, k)
self.assertRaises(ValueError, res.plot_qini, k)
self.assertRaises(ValueError, res.plot_toc, k)
else: # real treatment, k = 1
self.assertTrue(res.plot_cal(k) is not None)
self.assertTrue(res.plot_qini(k) is not None)
self.assertTrue(res.plot_toc(k) is not None)

self.assertLess(res_df.blp_pval.values[0], 0.05) # heterogeneity
self.assertGreater(res_df.cal_r_squared.values[0], 0) # good R2
self.assertLess(res_df.qini_pval.values[0], 0.05) # heterogeneity
self.assertLess(res_df.autoc_pval.values[0], 0.05) # heterogeneity

def test_nuisance_val_fit(self):
Xtrain, Dtrain, Ytrain, Xval, Dval, Yval = self._get_data(num_treatments=1)
Expand Down Expand Up @@ -209,7 +215,7 @@ def test_exceptions(self):
)

# fit nothing
for func in [my_dr_tester.evaluate_blp, my_dr_tester.evaluate_cal, my_dr_tester.evaluate_qini]:
for func in [my_dr_tester.evaluate_blp, my_dr_tester.evaluate_cal, my_dr_tester.evaluate_uplift]:
with self.assertRaises(Exception) as exc:
func()
if func.__name__ == 'evaluate_cal':
Expand All @@ -226,7 +232,7 @@ def test_exceptions(self):
for func in [
my_dr_tester.evaluate_blp,
my_dr_tester.evaluate_cal,
my_dr_tester.evaluate_qini,
my_dr_tester.evaluate_uplift,
my_dr_tester.evaluate_all
]:
with self.assertRaises(Exception) as exc:
Expand All @@ -241,7 +247,7 @@ def test_exceptions(self):

for func in [
my_dr_tester.evaluate_cal,
my_dr_tester.evaluate_qini,
my_dr_tester.evaluate_uplift,
my_dr_tester.evaluate_all
]:
with self.assertRaises(Exception) as exc:
Expand All @@ -252,12 +258,21 @@ def test_exceptions(self):
cal_res = my_dr_tester.evaluate_cal(Xval, Xtrain)
self.assertGreater(cal_res.cal_r_squared[0], 0) # good R2

with self.assertRaises(Exception) as exc:
my_dr_tester.evaluate_uplift(metric='blah')
self.assertTrue(
str(exc.exception) == "Unsupported metric - must be one of ['toc', 'qini']"
)

my_dr_tester = DRtester(
model_regression=reg_y,
model_propensity=reg_t,
cate=cate
).fit_nuisance(
Xval, Dval, Yval, Xtrain, Dtrain, Ytrain
)
qini_res = my_dr_tester.evaluate_qini(Xval, Xtrain)
qini_res = my_dr_tester.evaluate_uplift(Xval, Xtrain)
self.assertLess(qini_res.pvals[0], 0.05)

autoc_res = my_dr_tester.evaluate_uplift(Xval, Xtrain, metric='toc')
self.assertLess(autoc_res.pvals[0], 0.05)
78 changes: 46 additions & 32 deletions econml/validate/drtester.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from statsmodels.api import OLS
from statsmodels.tools import add_constant

from .results import CalibrationEvaluationResults, BLPEvaluationResults, QiniEvaluationResults, EvaluationResults
from .utils import calculate_dr_outcomes, calc_qini_coeff
from .results import CalibrationEvaluationResults, BLPEvaluationResults, UpliftEvaluationResults, EvaluationResults
from .utils import calculate_dr_outcomes, calc_uplift


class DRtester:
Expand Down Expand Up @@ -382,7 +382,7 @@ def evaluate_cal(
self.get_cate_preds(Xval, Xtrain)

cal_r_squared = np.zeros(self.n_treat)
df_plot = pd.DataFrame()
plot_data_dict = dict()
for k in range(self.n_treat):
cuts = np.quantile(self.cate_preds_train_[:, k], np.linspace(0, 1, n_groups + 1))
probs = np.zeros(n_groups)
Expand All @@ -409,15 +409,19 @@ def evaluate_cal(
# Calculate R-square calibration score
cal_r_squared[k] = 1 - (cal_score_g / cal_score_o)

df_plot1 = pd.DataFrame({'ind': np.array(range(n_groups)),
'gate': gate, 'se_gate': se_gate,
'g_cate': g_cate, 'se_g_cate': se_g_cate})
df_plot1['tmt'] = self.treatments[k + 1]
df_plot = pd.concat((df_plot, df_plot1))
df_plot = pd.DataFrame({
'ind': np.array(range(n_groups)),
'gate': gate,
'se_gate': se_gate,
'g_cate': g_cate,
'se_g_cate': se_g_cate
})

plot_data_dict[self.treatments[k + 1]] = df_plot

self.cal_res = CalibrationEvaluationResults(
cal_r_squared=cal_r_squared,
df_plot=df_plot,
plot_data_dict=plot_data_dict,
treatments=self.treatments
)

Expand Down Expand Up @@ -480,12 +484,13 @@ def evaluate_blp(

return self.blp_res

def evaluate_qini(
def evaluate_uplift(
self,
Xval: np.array = None,
Xtrain: np.array = None,
percentiles: np.array = np.linspace(5, 95, 50)
) -> QiniEvaluationResults:
percentiles: np.array = np.linspace(5, 95, 50),
metric: str = 'qini'
) -> UpliftEvaluationResults:
"""
Calculates QINI coefficient for the given model as in Radcliffe (2007), where units are ordered by predicted
CATE values and a running measure of the average treatment effect in each cohort is kept as we progress
Expand All @@ -505,10 +510,12 @@ def evaluate_qini(
percentiles: one-dimensional array, default ``np.linspace(5, 95, 50)``
Array of percentiles over which the uplift curve (QINI or TOC) should be constructed. Defaults to 50
evenly spaced percentiles from 5% to 95%.
metric: string, default 'qini'
Which type of uplift curve to evaluate. Must be one of ['toc', 'qini']

Returns
-------
QiniEvaluationResults object showing the results of the QINI fit
UpliftEvaluationResults object showing the fitted results
"""
if not hasattr(self, 'dr_val_'):
raise Exception("Must fit nuisances before evaluating")
Expand All @@ -518,39 +525,44 @@ def evaluate_qini(
raise Exception('CATE predictions not yet calculated - must provide both Xval, Xtrain')
self.get_cate_preds(Xval, Xtrain)

curve_data_dict = dict()
if self.n_treat == 1:
qini, qini_err = calc_qini_coeff(
coeff, err, curve_df = calc_uplift(
self.cate_preds_train_,
self.cate_preds_val_,
self.dr_val_,
percentiles
percentiles,
metric
)
qinis = [qini]
errs = [qini_err]
coeffs = [coeff]
errs = [err]
curve_data_dict[self.treatments[1]] = curve_df
else:
qinis = []
coeffs = []
errs = []
for k in range(self.n_treat):
qini, qini_err = calc_qini_coeff(
coeff, err, curve_df = calc_uplift(
self.cate_preds_train_[:, k],
self.cate_preds_val_[:, k],
self.dr_val_[:, k],
percentiles
percentiles,
metric
)
coeffs.append(coeff)
errs.append(err)
curve_data_dict[self.treatments[k + 1]] = curve_df

qinis.append(qini)
errs.append(qini_err)

pvals = [st.norm.sf(abs(q / e)) for q, e in zip(qinis, errs)]
pvals = [st.norm.sf(abs(q / e)) for q, e in zip(coeffs, errs)]

self.qini_res = QiniEvaluationResults(
params=qinis,
self.uplift_res = UpliftEvaluationResults(
params=coeffs,
errs=errs,
pvals=pvals,
treatments=self.treatments
treatments=self.treatments,
curve_data_dict=curve_data_dict
)

return self.qini_res
return self.uplift_res

def evaluate_all(
self,
Expand All @@ -559,8 +571,8 @@ def evaluate_all(
n_groups: int = 4
) -> EvaluationResults:
"""
Implements the best linear prediction (`evaluate_blp'), calibration (`evaluate_cal') and QINI coefficient
(`evaluate_qini') methods.
Implements the best linear prediction (`evaluate_blp'), calibration (`evaluate_cal'), and uplift curve
(`evaluate_uplift') methods. The uplift evaluation is run for both the QINI and TOC metrics.

Parameters
----------
Expand All @@ -583,12 +595,14 @@ def evaluate_all(

blp_res = self.evaluate_blp()
cal_res = self.evaluate_cal(n_groups=n_groups)
qini_res = self.evaluate_qini()
qini_res = self.evaluate_uplift(metric='qini')
toc_res = self.evaluate_uplift(metric='toc')

self.res = EvaluationResults(
blp_res=blp_res,
cal_res=cal_res,
qini_res=qini_res
qini_res=qini_res,
toc_res=toc_res
)

return self.res
Loading