Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 14 additions & 78 deletions doubleml/double_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

from .utils.resampling import DoubleMLResampling, DoubleMLClusterResampling
from .utils._estimation import _rmse, _aggregate_coefs_and_ses, _var_est, _set_external_predictions
from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, _check_is_partition, \
_check_all_smpls, _check_smpl_split, _check_smpl_split_tpl, _check_benchmarks, _check_external_predictions
from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, \
_check_benchmarks, _check_external_predictions, _check_sample_splitting
from .utils._plots import _sensitivity_contour_plot
from .utils.gain_statistics import gain_statistics

Expand Down Expand Up @@ -289,11 +289,8 @@ def smpls(self):
The partition used for cross-fitting.
"""
if self._smpls is None:
if self._is_cluster_data:
err_msg = 'Sample splitting not specified. Draw samples via .draw_sample splitting().'
else:
err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
'or set external samples via .set_sample_splitting().')
err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
'or set external samples via .set_sample_splitting().')
raise ValueError(err_msg)
return self._smpls

Expand All @@ -302,9 +299,6 @@ def smpls_cluster(self):
"""
The partition of clusters used for cross-fitting.
"""
if self._is_cluster_data:
if self._smpls_cluster is None:
raise ValueError('Sample splitting not specified. Draw samples via .draw_sample splitting().')
return self._smpls_cluster

@property
Expand Down Expand Up @@ -1155,7 +1149,7 @@ def draw_sample_splitting(self):

return self

def set_sample_splitting(self, all_smpls):
def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
"""
Set the sample splitting for DoubleML models.

Expand All @@ -1177,6 +1171,13 @@ def set_sample_splitting(self, all_smpls):
train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting.
``n_folds=1`` and ``n_rep=1`` is always set.

all_smpls_cluster : list or None
Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level
of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and
testing lists. Both training and testing contain an array for each cluster variable, which form a partition of
the clusters.
Default is ``None``.

Returns
-------
self : object
Expand All @@ -1194,8 +1195,6 @@ def set_sample_splitting(self, all_smpls):
>>> ml_m = learner
>>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5)
>>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m)
>>> # simple sample splitting with two folds and without cross-fitting
>>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9])
>>> dml_plr_obj.set_sample_splitting(smpls)
>>> # sample splitting with two folds and cross-fitting
>>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
Expand All @@ -1208,71 +1207,8 @@ def set_sample_splitting(self, all_smpls):
>>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
>>> dml_plr_obj.set_sample_splitting(smpls)
"""
if self._is_cluster_data:
raise NotImplementedError('Externally setting the sample splitting for DoubleML is '
'not yet implemented with clustering.')
if isinstance(all_smpls, tuple):
if not len(all_smpls) == 2:
raise ValueError('Invalid partition provided. '
'Tuple for train_ind and test_ind must consist of exactly two elements.')
all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs)
if (_check_is_partition([all_smpls], self._dml_data.n_obs) &
_check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)):
self._n_rep = 1
self._n_folds = 1
self._smpls = [[all_smpls]]
else:
raise ValueError('Invalid partition provided. '
'Tuple provided that doesn\'t form a partition.')
else:
if not isinstance(all_smpls, list):
raise TypeError('all_smpls must be of list or tuple type. '
f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.')
all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls])
if all_tuple:
if not all([len(tpl) == 2 for tpl in all_smpls]):
raise ValueError('Invalid partition provided. '
'All tuples for train_ind and test_ind must consist of exactly two elements.')
self._n_rep = 1
all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs)
if _check_is_partition(all_smpls, self._dml_data.n_obs):
if ((len(all_smpls) == 1) &
_check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)):
self._n_folds = 1
self._smpls = [all_smpls]
else:
self._n_folds = len(all_smpls)
self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True)
else:
raise ValueError('Invalid partition provided. '
'Tuples provided that don\'t form a partition.')
else:
all_list = all([isinstance(smpl, list) for smpl in all_smpls])
if not all_list:
raise ValueError('Invalid partition provided. '
'all_smpls is a list where neither all elements are tuples '
'nor all elements are lists.')
all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls])
if not all_tuple:
raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.')
all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls])
if not all_pairs:
raise ValueError('Invalid partition provided. '
'All tuples for train_ind and test_ind must consist of exactly two elements.')
n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls])
if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
raise ValueError('Invalid partition provided. '
'Different number of folds for repeated sample splitting.')
all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs)
smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls]

if all(smpls_are_partitions):
self._n_rep = len(all_smpls)
self._n_folds = n_folds_each_smpl[0]
self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True)
else:
raise ValueError('Invalid partition provided. '
'At least one inner list does not form a partition.')
self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data)

self._psi, self._psi_deriv, self._psi_elements, self._var_scaling_factors, \
self._coef, self._se, self._all_coef, self._all_se = self._initialize_arrays()
Expand Down
85 changes: 76 additions & 9 deletions doubleml/irm/qte.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ..utils._estimation import _default_kde
from ..utils.resampling import DoubleMLResampling
from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment
from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_sample_splitting


class DoubleMLQTE:
Expand Down Expand Up @@ -143,16 +143,15 @@ def __init__(self,
raise TypeError('Normalization indicator has to be boolean. ' +
f'Object of type {str(type(self.normalize_ipw))} passed.')

self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}

# perform sample splitting
self._smpls = None
if draw_sample_splitting:
self.draw_sample_splitting()

self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}

# initialize all models
self._modellist_0, self._modellist_1 = self._initialize_models()
# initialize all models
self._modellist_0, self._modellist_1 = self._initialize_models()

def __str__(self):
class_name = self.__class__.__name__
Expand Down Expand Up @@ -204,8 +203,8 @@ def smpls(self):
The partition used for cross-fitting.
"""
if self._smpls is None:
err_msg = ('Sample splitting not specified. Draw samples via .draw_sample splitting(). ' +
'External samples not implemented yet.')
err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
'or set external samples via .set_sample_splitting().')
raise ValueError(err_msg)
return self._smpls

Expand Down Expand Up @@ -465,6 +464,74 @@ def draw_sample_splitting(self):
n_obs=self._dml_data.n_obs,
stratify=self._dml_data.d)
self._smpls = obj_dml_resampling.split_samples()
# initialize all models
self._modellist_0, self._modellist_1 = self._initialize_models()

return self

def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
    """
    Set the sample splitting for DoubleML models.

    The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition.
    After the partition has been stored, the model lists are initialized again.

    Parameters
    ----------
    all_smpls : list or tuple
        If nested list of lists of tuples:
            The outer list needs to provide an entry per repeated sample splitting (length of list is set as
            ``n_rep``).
            The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as
            ``n_folds``). test_ind must form a partition for each inner list.
        If list of tuples:
            The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as
            ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set.
        If tuple:
            Must be a tuple with two elements train_ind and test_ind. Only viable option is to set
            train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting.
            ``n_folds=1`` and ``n_rep=1`` is always set.

    all_smpls_cluster : list or None
        Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level
        of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and
        testing lists. Both training and testing contain an array for each cluster variable, which form a partition of
        the clusters.
        Default is ``None``.

    Returns
    -------
    self : object

    Notes
    -----
    The example below uses ``DoubleMLPLR`` for illustration only; it shows the nesting
    of ``all_smpls`` that is passed to this method.

    Examples
    --------
    >>> import numpy as np
    >>> import doubleml as dml
    >>> from doubleml.datasets import make_plr_CCDDHNR2018
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from sklearn.base import clone
    >>> np.random.seed(3141)
    >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10)
    >>> ml_g = learner
    >>> ml_m = learner
    >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5)
    >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m)
    >>> # sample splitting with two folds and cross-fitting
    >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
    ...          ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])]
    >>> dml_plr_obj.set_sample_splitting(smpls)
    >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2
    >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
    ...           ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
    ...          [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
    ...           ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
    >>> dml_plr_obj.set_sample_splitting(smpls)
    """
    # Validation and normalization of the user-supplied partition is delegated to the shared
    # helper, which also returns the derived number of repetitions and folds.
    self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
        all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data)

    # initialize all models
    # (re-done here because the stored sample splitting has just changed)
    self._modellist_0, self._modellist_1 = self._initialize_models()

    return self

Expand Down
46 changes: 36 additions & 10 deletions doubleml/irm/tests/test_qte.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,36 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde):
ml_g = clone(learner)
ml_m = clone(learner)

input_args = {
"quantiles": quantiles,
"n_folds": n_folds,
"n_rep": n_rep,
"normalize_ipw": normalize_ipw,
"trimming_threshold": 1e-12,
"kde": kde
}

np.random.seed(42)
dml_qte_obj = dml.DoubleMLQTE(obj_dml_data,
ml_g, ml_m,
quantiles=quantiles,
n_folds=n_folds,
n_rep=n_rep,
normalize_ipw=normalize_ipw,
trimming_threshold=1e-12,
kde=kde)
dml_qte_obj = dml.DoubleMLQTE(
obj_dml_data,
ml_g, ml_m,
**input_args
)
unfitted_qte_model = copy.copy(dml_qte_obj)
np.random.seed(42)
dml_qte_obj.fit()

np.random.seed(42)
dml_qte_obj_ext_smpls = dml.DoubleMLQTE(
obj_dml_data,
ml_g, ml_m,
draw_sample_splitting=False,
**input_args
)
dml_qte_obj_ext_smpls.set_sample_splitting(dml_qte_obj.smpls)
np.random.seed(42)
dml_qte_obj_ext_smpls.fit()

np.random.seed(42)
n_obs = len(y)
all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d)
Expand All @@ -80,8 +98,10 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde):
boot_t_stat=None, joint=False, level=0.95)
res_dict = {'coef': dml_qte_obj.coef,
'coef_manual': res_manual['qte'],
'coef_ext_smpls': dml_qte_obj_ext_smpls.coef,
'se': dml_qte_obj.se,
'se_manual': res_manual['se'],
'se_ext_smpls': dml_qte_obj_ext_smpls.se,
'boot_methods': boot_methods,
'ci': ci.to_numpy(),
'ci_manual': ci_manual.to_numpy(),
Expand Down Expand Up @@ -112,13 +132,19 @@ def test_dml_qte_coef(dml_qte_fixture):
assert np.allclose(dml_qte_fixture['coef'],
dml_qte_fixture['coef_manual'],
rtol=1e-9, atol=1e-4)
assert np.allclose(dml_qte_fixture['coef'],
dml_qte_fixture['coef_ext_smpls'],
rtol=1e-9, atol=1e-4)


@pytest.mark.ci
def test_dml_qte_se(dml_qte_fixture):
    """Compare the fitted standard errors with the manual and the external-split results."""
    se_package = dml_qte_fixture['se']
    # Same tolerances for both references; checked in the original order.
    for reference_key in ('se_manual', 'se_ext_smpls'):
        se_reference = dml_qte_fixture[reference_key]
        assert np.allclose(se_package, se_reference, rtol=1e-9, atol=1e-4)


@pytest.mark.ci
Expand Down Expand Up @@ -148,8 +174,8 @@ def test_doubleml_qte_exceptions():
ml_g = RandomForestClassifier(n_estimators=20)
ml_m = RandomForestClassifier(n_estimators=20)

msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\). ' \
'External samples not implemented yet.'
msg = ('Sample splitting not specified. '
r'Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\).')
with pytest.raises(ValueError, match=msg):
dml_obj = dml.DoubleMLQTE(obj_dml_data, ml_g, ml_m, draw_sample_splitting=False)
_ = dml_obj.smpls
Expand Down
Loading