From c61b1e4970d975402a4a19b95f4797c0de28e751 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 19 Jul 2024 08:27:50 +0200 Subject: [PATCH 1/5] update set_sample_splitting documentation --- doubleml/double_ml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 5fe6decf2..00d64d8f8 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1194,8 +1194,6 @@ def set_sample_splitting(self, all_smpls): >>> ml_m = learner >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> # simple sample splitting with two folds and without cross-fitting - >>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]) >>> dml_plr_obj.set_sample_splitting(smpls) >>> # sample splitting with two folds and cross-fitting >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), From 02abf21e0e5c6ab620c6c5c48ec3ce8c9e8a057d Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:38:36 +0200 Subject: [PATCH 2/5] add set_sample_splitting to _checks to simplify doubleml class --- doubleml/double_ml.py | 71 +++------------------------------------ doubleml/utils/_checks.py | 70 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 67 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 00d64d8f8..a77cc7b06 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -15,8 +15,8 @@ from .utils.resampling import DoubleMLResampling, DoubleMLClusterResampling from .utils._estimation import _rmse, _aggregate_coefs_and_ses, _var_est, _set_external_predictions -from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, _check_is_partition, \ - _check_all_smpls, _check_smpl_split, _check_smpl_split_tpl, _check_benchmarks, _check_external_predictions +from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, \ + _check_benchmarks, _check_external_predictions, _check_sample_splitting from .utils._plots import _sensitivity_contour_plot from .utils.gain_statistics import gain_statistics @@ -1206,71 +1206,8 @@ def set_sample_splitting(self, all_smpls): >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] >>> dml_plr_obj.set_sample_splitting(smpls) """ - if self._is_cluster_data: - raise NotImplementedError('Externally setting the sample splitting for DoubleML is ' - 'not yet implemented with clustering.') - if isinstance(all_smpls, tuple): - if not len(all_smpls) == 2: - raise ValueError('Invalid partition provided. ' - 'Tuple for train_ind and test_ind must consist of exactly two elements.') - all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs) - if (_check_is_partition([all_smpls], self._dml_data.n_obs) & - _check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)): - self._n_rep = 1 - self._n_folds = 1 - self._smpls = [[all_smpls]] - else: - raise ValueError('Invalid partition provided. ' - 'Tuple provided that doesn\'t form a partition.') - else: - if not isinstance(all_smpls, list): - raise TypeError('all_smpls must be of list or tuple type. ' - f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.') - all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls]) - if all_tuple: - if not all([len(tpl) == 2 for tpl in all_smpls]): - raise ValueError('Invalid partition provided. ' - 'All tuples for train_ind and test_ind must consist of exactly two elements.') - self._n_rep = 1 - all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs) - if _check_is_partition(all_smpls, self._dml_data.n_obs): - if ((len(all_smpls) == 1) & - _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)): - self._n_folds = 1 - self._smpls = [all_smpls] - else: - self._n_folds = len(all_smpls) - self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True) - else: - raise ValueError('Invalid partition provided. ' - 'Tuples provided that don\'t form a partition.') - else: - all_list = all([isinstance(smpl, list) for smpl in all_smpls]) - if not all_list: - raise ValueError('Invalid partition provided. ' - 'all_smpls is a list where neither all elements are tuples ' - 'nor all elements are lists.') - all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls]) - if not all_tuple: - raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.') - all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls]) - if not all_pairs: - raise ValueError('Invalid partition provided. ' - 'All tuples for train_ind and test_ind must consist of exactly two elements.') - n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls]) - if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]): - raise ValueError('Invalid partition provided. ' - 'Different number of folds for repeated sample splitting.') - all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs) - smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls] - - if all(smpls_are_partitions): - self._n_rep = len(all_smpls) - self._n_folds = n_folds_each_smpl[0] - self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True) - else: - raise ValueError('Invalid partition provided. ' - 'At least one inner list does not form a partition.') + self._smpls, self._n_rep, self._n_folds = _check_sample_splitting( + all_smpls, self._dml_data.n_obs, self._is_cluster_data) self._psi, self._psi_deriv, self._psi_elements, self._var_scaling_factors, \ self._coef, self._se, self._all_coef, self._all_se = self._initialize_arrays() diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index aa736a0af..e153b3992 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -351,3 +351,73 @@ def _check_framework_compatibility(dml_framework_1, dml_framework_2, check_treat def _check_set(x): return {x} if x is not None else {} + + +def _check_sample_splitting(all_smpls, n_obs, is_cluster_data): + if is_cluster_data: + raise NotImplementedError('Externally setting the sample splitting for DoubleML is ' + 'not yet implemented with clustering.') + if isinstance(all_smpls, tuple): + if not len(all_smpls) == 2: + raise ValueError('Invalid partition provided. ' + 'Tuple for train_ind and test_ind must consist of exactly two elements.') + all_smpls = _check_smpl_split_tpl(all_smpls, n_obs) + if (_check_is_partition([all_smpls], n_obs) & + _check_is_partition([(all_smpls[1], all_smpls[0])], n_obs)): + n_rep = 1 + n_folds = 1 + smpls = [[all_smpls]] + else: + raise ValueError('Invalid partition provided. ' + 'Tuple provided that doesn\'t form a partition.') + else: + if not isinstance(all_smpls, list): + raise TypeError('all_smpls must be of list or tuple type. ' + f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.') + all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls]) + if all_tuple: + if not all([len(tpl) == 2 for tpl in all_smpls]): + raise ValueError('Invalid partition provided. ' + 'All tuples for train_ind and test_ind must consist of exactly two elements.') + n_rep = 1 + all_smpls = _check_smpl_split(all_smpls, n_obs) + if _check_is_partition(all_smpls, n_obs): + if ((len(all_smpls) == 1) & + _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], n_obs)): + n_folds = 1 + smpls = [all_smpls] + else: + n_folds = len(all_smpls) + smpls = _check_all_smpls([all_smpls], n_obs, check_intersect=True) + else: + raise ValueError('Invalid partition provided. ' + 'Tuples provided that don\'t form a partition.') + else: + all_list = all([isinstance(smpl, list) for smpl in all_smpls]) + if not all_list: + raise ValueError('Invalid partition provided. ' + 'all_smpls is a list where neither all elements are tuples ' + 'nor all elements are lists.') + all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls]) + if not all_tuple: + raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.') + all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls]) + if not all_pairs: + raise ValueError('Invalid partition provided. ' + 'All tuples for train_ind and test_ind must consist of exactly two elements.') + n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls]) + if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]): + raise ValueError('Invalid partition provided. ' + 'Different number of folds for repeated sample splitting.') + all_smpls = _check_all_smpls(all_smpls, n_obs) + smpls_are_partitions = [_check_is_partition(smpl, n_obs) for smpl in all_smpls] + + if all(smpls_are_partitions): + n_rep = len(all_smpls) + n_folds = n_folds_each_smpl[0] + smpls = _check_all_smpls(all_smpls, n_obs, check_intersect=True) + else: + raise ValueError('Invalid partition provided. ' + 'At least one inner list does not form a partition.') + + return smpls, n_rep, n_folds From 2ba36fde97e387c31d9b15197f7e595ca2d94305 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:00:36 +0200 Subject: [PATCH 3/5] add set_sample_splitting for doubleml with cluster data and exception tests --- doubleml/double_ml.py | 13 ++++-- doubleml/tests/test_exceptions.py | 45 +++++++++++++++++--- doubleml/utils/_checks.py | 69 ++++++++++++++++++++++++------- 3 files changed, 103 insertions(+), 24 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index a77cc7b06..dde61d285 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1155,7 +1155,7 @@ def draw_sample_splitting(self): return self - def set_sample_splitting(self, all_smpls): + def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): """ Set the sample splitting for DoubleML models. @@ -1177,6 +1177,13 @@ def set_sample_splitting(self, all_smpls): train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. ``n_folds=1`` and ``n_rep=1`` is always set. + all_smpls_cluster : list or None + Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level + of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and + testing lists. Both training and testing contain an array for each cluster variable, which form a partition of + the clusters. + Default is ``None``. + Returns ------- self : object @@ -1206,8 +1213,8 @@ def set_sample_splitting(self, all_smpls): >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] >>> dml_plr_obj.set_sample_splitting(smpls) """ - self._smpls, self._n_rep, self._n_folds = _check_sample_splitting( - all_smpls, self._dml_data.n_obs, self._is_cluster_data) + self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( + all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data) self._psi, self._psi_deriv, self._psi_elements, self._var_scaling_factors, \ self._coef, self._se, self._all_coef, self._all_se = self._initialize_arrays() diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 84ac55a14..4bd11d939 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -1,6 +1,7 @@ import pytest import pandas as pd import numpy as np +import copy from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, \ DoubleMLClusterData, DoubleMLPQ, DoubleMLLPQ, DoubleMLCVAR, DoubleMLQTE, DoubleMLDID, \ @@ -652,6 +653,44 @@ def test_doubleml_exception_smpls(): with pytest.raises(ValueError, match=msg): _ = dml_pliv_cluster_no_smpls.smpls + dml_pliv_cluster = DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r) + smpls = dml_plr.smpls + msg = ('For cluster data, all_smpls_cluster must be provided.') + with pytest.raises(ValueError, match=msg): + _ = dml_pliv_cluster.set_sample_splitting(smpls) + + all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster) + all_smpls_cluster.append(all_smpls_cluster[0]) + msg = ('Invalid samples provided. Number of repetitions for all_smpls and all_smpls_cluster must be the same.') + with pytest.raises(ValueError, match=msg): + _ = dml_pliv_cluster.set_sample_splitting( + all_smpls=dml_pliv_cluster.smpls, + all_smpls_cluster=all_smpls_cluster) + + all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster) + all_smpls_cluster[0] = all_smpls_cluster[0][0] + msg = ('Invalid samples provided. Number of folds for all_smpls and all_smpls_cluster must be the same.') + with pytest.raises(ValueError, match=msg): + _ = dml_pliv_cluster.set_sample_splitting( + all_smpls=dml_pliv_cluster.smpls, + all_smpls_cluster=all_smpls_cluster) + + all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster) + all_smpls_cluster[0][0][1][1] = np.append(all_smpls_cluster[0][0][1][1], [11], axis=0) + msg = ('Invalid cluster partition provided. At least one inner list does not form a partition.') + with pytest.raises(ValueError, match=msg): + _ = dml_pliv_cluster.set_sample_splitting( + all_smpls=dml_pliv_cluster.smpls, + all_smpls_cluster=all_smpls_cluster) + + all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster) + all_smpls_cluster[0][0][1][1][1] = 11 + msg = ('Invalid cluster partition provided. At least one inner list does not form a partition.') + with pytest.raises(ValueError, match=msg): + _ = dml_pliv_cluster.set_sample_splitting( + all_smpls=dml_pliv_cluster.smpls, + all_smpls_cluster=all_smpls_cluster) + @pytest.mark.ci def test_doubleml_exception_fit(): @@ -1212,12 +1251,6 @@ def test_doubleml_cluster_not_yet_implemented(): with pytest.raises(NotImplementedError, match=msg): _ = dml_pliv_cluster.bootstrap() - smpls = dml_plr.smpls - msg = ('Externally setting the sample splitting for DoubleML is ' - 'not yet implemented with clustering.') - with pytest.raises(NotImplementedError, match=msg): - _ = dml_pliv_cluster.set_sample_splitting(smpls) - df = dml_cluster_data_pliv.data.copy() df['cluster_var_k'] = df['cluster_var_i'] + df['cluster_var_j'] - 2 dml_cluster_data_multiway = DoubleMLClusterData(df, y_col='Y', d_cols='D', x_cols=['X1', 'X5'], z_cols='Z', diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index e153b3992..7ecadebd4 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -353,17 +353,51 @@ def _check_set(x): return {x} if x is not None else {} -def _check_sample_splitting(all_smpls, n_obs, is_cluster_data): - if is_cluster_data: - raise NotImplementedError('Externally setting the sample splitting for DoubleML is ' - 'not yet implemented with clustering.') +def _check_cluster_partitions(smpls, values): + test_indices = np.concatenate([test_index for test_index in smpls]) + if len(test_indices) != len(values): + return False + if np.any(np.sort(test_indices) != np.sort(values)): + return False + return True + + +def _check_cluster_sample_splitting(all_smpls_cluster, dml_data, n_rep, n_folds): + if all_smpls_cluster is None: + raise ValueError('For cluster data, all_smpls_cluster must be provided.') + + n_rep_cluster = len(all_smpls_cluster) + if n_rep_cluster != n_rep: + raise ValueError('Invalid samples provided. ' + 'Number of repetitions for all_smpls and all_smpls_cluster must be the same.') + + for i_rep in range(n_rep): + n_folds_cluster = len(all_smpls_cluster[i_rep]) + if n_folds_cluster != n_folds: + raise ValueError('Invalid samples provided. ' + 'Number of folds for all_smpls and all_smpls_cluster must be the same.') + for i_cluster in range(dml_data.n_cluster_vars): + this_cluster_var = dml_data.cluster_vars[:, i_cluster] + clusters = np.unique(this_cluster_var) + cluster_partition = [all_smpls_cluster[0][0][0][i_cluster], all_smpls_cluster[0][0][1][i_cluster]] + is_cluster_partition = _check_cluster_partitions(cluster_partition, clusters) + if not is_cluster_partition: + raise ValueError('Invalid cluster partition provided. ' + 'At least one inner list does not form a partition.') + + smpls_cluster = all_smpls_cluster + return smpls_cluster + + +def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_data): + if isinstance(all_smpls, tuple): if not len(all_smpls) == 2: raise ValueError('Invalid partition provided. ' 'Tuple for train_ind and test_ind must consist of exactly two elements.') - all_smpls = _check_smpl_split_tpl(all_smpls, n_obs) - if (_check_is_partition([all_smpls], n_obs) & - _check_is_partition([(all_smpls[1], all_smpls[0])], n_obs)): + all_smpls = _check_smpl_split_tpl(all_smpls, dml_data.n_obs) + if (_check_is_partition([all_smpls], dml_data.n_obs) & + _check_is_partition([(all_smpls[1], all_smpls[0])], dml_data.n_obs)): n_rep = 1 n_folds = 1 smpls = [[all_smpls]] @@ -380,15 +414,15 @@ def _check_sample_splitting(all_smpls, n_obs, is_cluster_data): raise ValueError('Invalid partition provided. ' 'All tuples for train_ind and test_ind must consist of exactly two elements.') n_rep = 1 - all_smpls = _check_smpl_split(all_smpls, n_obs) - if _check_is_partition(all_smpls, n_obs): + all_smpls = _check_smpl_split(all_smpls, dml_data.n_obs) + if _check_is_partition(all_smpls, dml_data.n_obs): if ((len(all_smpls) == 1) & - _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], n_obs)): + _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], dml_data.n_obs)): n_folds = 1 smpls = [all_smpls] else: n_folds = len(all_smpls) - smpls = _check_all_smpls([all_smpls], n_obs, check_intersect=True) + smpls = _check_all_smpls([all_smpls], dml_data.n_obs, check_intersect=True) else: raise ValueError('Invalid partition provided. ' 'Tuples provided that don\'t form a partition.') @@ -409,15 +443,20 @@ def _check_sample_splitting(all_smpls, n_obs, is_cluster_data): if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]): raise ValueError('Invalid partition provided. ' 'Different number of folds for repeated sample splitting.') - all_smpls = _check_all_smpls(all_smpls, n_obs) - smpls_are_partitions = [_check_is_partition(smpl, n_obs) for smpl in all_smpls] + all_smpls = _check_all_smpls(all_smpls, dml_data.n_obs) + smpls_are_partitions = [_check_is_partition(smpl, dml_data.n_obs) for smpl in all_smpls] if all(smpls_are_partitions): n_rep = len(all_smpls) n_folds = n_folds_each_smpl[0] - smpls = _check_all_smpls(all_smpls, n_obs, check_intersect=True) + smpls = _check_all_smpls(all_smpls, dml_data.n_obs, check_intersect=True) else: raise ValueError('Invalid partition provided. ' 'At least one inner list does not form a partition.') - return smpls, n_rep, n_folds + if is_cluster_data: + smpls_cluster = _check_cluster_sample_splitting(all_smpls_cluster, dml_data, n_rep, n_folds) + else: + smpls_cluster = None + + return smpls, smpls_cluster, n_rep, n_folds From b33de373ff48467d02bab3805be140705ef454a3 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:13:28 +0200 Subject: [PATCH 4/5] add tests for set_sample_splitting and cluster data --- doubleml/double_ml.py | 10 +--- doubleml/tests/test_exceptions.py | 3 - doubleml/tests/test_multiway_cluster.py | 74 ++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index dde61d285..e079c9a61 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -289,11 +289,8 @@ def smpls(self): The partition used for cross-fitting. """ if self._smpls is None: - if self._is_cluster_data: - err_msg = 'Sample splitting not specified. Draw samples via .draw_sample splitting().' - else: - err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' + - 'or set external samples via .set_sample_splitting().') + err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' + + 'or set external samples via .set_sample_splitting().') raise ValueError(err_msg) return self._smpls @@ -302,9 +299,6 @@ def smpls_cluster(self): """ The partition of clusters used for cross-fitting. """ - if self._is_cluster_data: - if self._smpls_cluster is None: - raise ValueError('Sample splitting not specified. Draw samples via .draw_sample splitting().') return self._smpls_cluster @property diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 4bd11d939..8743b45e6 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -646,10 +646,7 @@ def test_doubleml_exception_smpls(): dml_plr_no_smpls = DoubleMLPLR(dml_data, ml_l, ml_m, draw_sample_splitting=False) with pytest.raises(ValueError, match=msg): _ = dml_plr_no_smpls.smpls - msg = 'Sample splitting not specified. Draw samples via .draw_sample splitting().' dml_pliv_cluster_no_smpls = DoubleMLPLIV(dml_cluster_data_pliv, ml_l, ml_m, ml_r, draw_sample_splitting=False) - with pytest.raises(ValueError, match=msg): - _ = dml_pliv_cluster_no_smpls.smpls_cluster with pytest.raises(ValueError, match=msg): _ = dml_pliv_cluster_no_smpls.smpls diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py index 1dbff111f..85980f8f0 100644 --- a/doubleml/tests/test_multiway_cluster.py +++ b/doubleml/tests/test_multiway_cluster.py @@ -68,6 +68,21 @@ def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner, score): np.random.seed(3141) dml_pliv_obj.fit() + dml_pliv_obj_ext_smpls = dml.DoubleMLPLIV( + obj_dml_cluster_data, + ml_l, ml_m, ml_r, ml_g, + n_folds=n_folds, + n_rep=n_rep, + score=score, + draw_sample_splitting=False) + + dml_pliv_obj_ext_smpls.set_sample_splitting( + all_smpls=dml_pliv_obj.smpls, + all_smpls_cluster=dml_pliv_obj.smpls_cluster) + + np.random.seed(3141) + dml_pliv_obj_ext_smpls.fit() + np.random.seed(3141) y = obj_dml_cluster_data.y x = obj_dml_cluster_data.x @@ -126,7 +141,9 @@ def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner, score): res_dict = {'coef': dml_pliv_obj.coef, 'se': dml_pliv_obj.se, 'coef_manual': theta, - 'se_manual': se} + 'se_manual': se, + 'coef_ext_smpls': dml_pliv_obj_ext_smpls.coef, + 'se_ext_smpls': dml_pliv_obj_ext_smpls.se} return res_dict @@ -136,6 +153,9 @@ def test_dml_pliv_multiway_cluster_coef(dml_pliv_multiway_cluster_fixture): assert math.isclose(dml_pliv_multiway_cluster_fixture['coef'][0], dml_pliv_multiway_cluster_fixture['coef_manual'], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_pliv_multiway_cluster_fixture['coef'][0], + dml_pliv_multiway_cluster_fixture['coef_ext_smpls'][0], + rel_tol=1e-9, abs_tol=1e-4) @pytest.mark.ci @@ -143,6 +163,9 @@ def test_dml_pliv_multiway_cluster_se(dml_pliv_multiway_cluster_fixture): assert math.isclose(dml_pliv_multiway_cluster_fixture['se'][0], dml_pliv_multiway_cluster_fixture['se_manual'], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_pliv_multiway_cluster_fixture['se'][0], + dml_pliv_multiway_cluster_fixture['se_ext_smpls'][0], + rel_tol=1e-9, abs_tol=1e-4) @pytest.fixture(scope='module') @@ -167,6 +190,20 @@ def dml_pliv_oneway_cluster_fixture(generate_data_iv, learner, score): np.random.seed(3141) dml_pliv_obj.fit() + dml_pliv_obj_ext_smpls = dml.DoubleMLPLIV( + obj_dml_oneway_cluster_data, + ml_l, ml_m, ml_r, ml_g, + n_folds=n_folds, + score=score, + draw_sample_splitting=False) + + dml_pliv_obj_ext_smpls.set_sample_splitting( + all_smpls=dml_pliv_obj.smpls, + all_smpls_cluster=dml_pliv_obj.smpls_cluster) + + np.random.seed(3141) + dml_pliv_obj_ext_smpls.fit() + np.random.seed(3141) y = obj_dml_oneway_cluster_data.y x = obj_dml_oneway_cluster_data.x @@ -210,7 +247,9 @@ def dml_pliv_oneway_cluster_fixture(generate_data_iv, learner, score): res_dict = {'coef': dml_pliv_obj.coef, 'se': dml_pliv_obj.se, 'coef_manual': theta, - 'se_manual': se} + 'se_manual': se, + 'coef_ext_smpls': dml_pliv_obj_ext_smpls.coef, + 'se_ext_smpls': dml_pliv_obj_ext_smpls.se} return res_dict @@ -220,6 +259,9 @@ def test_dml_pliv_oneway_cluster_coef(dml_pliv_oneway_cluster_fixture): assert math.isclose(dml_pliv_oneway_cluster_fixture['coef'][0], dml_pliv_oneway_cluster_fixture['coef_manual'], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_pliv_oneway_cluster_fixture['coef'][0], + dml_pliv_oneway_cluster_fixture['coef_ext_smpls'][0], + rel_tol=1e-9, abs_tol=1e-4) @pytest.mark.ci @@ -227,6 +269,9 @@ def test_dml_pliv_oneway_cluster_se(dml_pliv_oneway_cluster_fixture): assert math.isclose(dml_pliv_oneway_cluster_fixture['se'][0], dml_pliv_oneway_cluster_fixture['se_manual'], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_pliv_oneway_cluster_fixture['se'][0], + dml_pliv_oneway_cluster_fixture['se_ext_smpls'][0], + rel_tol=1e-9, abs_tol=1e-4) @pytest.fixture(scope="module") @@ -247,6 +292,7 @@ def dml_plr_cluster_with_index(generate_data1, learner): dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_l, ml_m, n_folds=n_folds) + np.random.seed(3141) dml_plr_obj.fit() df = data.reset_index() @@ -259,12 +305,28 @@ def dml_plr_cluster_with_index(generate_data1, learner): dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) + np.random.seed(3141) dml_plr_cluster_obj.fit() + dml_plr_cluster_ext_smpls = dml.DoubleMLPLR( + dml_cluster_data, + ml_l, ml_m, + n_folds=n_folds, + draw_sample_splitting=False) + + dml_plr_cluster_ext_smpls.set_sample_splitting( + all_smpls=dml_plr_cluster_obj.smpls, + all_smpls_cluster=dml_plr_cluster_obj.smpls_cluster) + + np.random.seed(3141) + dml_plr_cluster_ext_smpls.fit() + res_dict = {'coef': dml_plr_obj.coef, 'coef_manual': dml_plr_cluster_obj.coef, 'se': dml_plr_obj.se, - 'se_manual': dml_plr_cluster_obj.se} + 'se_manual': dml_plr_cluster_obj.se, + 'coef_ext_smpls': dml_plr_cluster_ext_smpls.coef, + 'se_ext_smpls': dml_plr_cluster_ext_smpls.se} return res_dict @@ -274,6 +336,9 @@ def test_dml_plr_cluster_with_index_coef(dml_plr_cluster_with_index): assert math.isclose(dml_plr_cluster_with_index['coef'][0], dml_plr_cluster_with_index['coef_manual'][0], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_plr_cluster_with_index['coef'][0], + dml_plr_cluster_with_index['coef_ext_smpls'][0], + rel_tol=1e-9, abs_tol=1e-4) @pytest.mark.ci @@ -281,3 +346,6 @@ def test_dml_plr_cluster_with_index_se(dml_plr_cluster_with_index): assert math.isclose(dml_plr_cluster_with_index['se'][0], dml_plr_cluster_with_index['se_manual'][0], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_plr_cluster_with_index['se'][0], + dml_plr_cluster_with_index['se_ext_smpls'][0], + rel_tol=1e-9, abs_tol=1e-4) From 939fc8b3e623df2c75c72c5eeef6152a80f77615 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:40:19 +0200 Subject: [PATCH 5/5] add set_sample_splitting to qte --- doubleml/irm/qte.py | 85 ++++++++++++++++++++++++++++++---- doubleml/irm/tests/test_qte.py | 46 ++++++++++++++---- doubleml/utils/_checks.py | 2 +- 3 files changed, 113 insertions(+), 20 deletions(-) diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 9fd220f19..17894a600 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -13,7 +13,7 @@ from ..utils._estimation import _default_kde from ..utils.resampling import DoubleMLResampling -from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment +from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_sample_splitting class DoubleMLQTE: @@ -143,16 +143,15 @@ def __init__(self, raise TypeError('Normalization indicator has to be boolean. ' + f'Object of type {str(type(self.normalize_ipw))} passed.') + self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)} + self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'} + # perform sample splitting self._smpls = None if draw_sample_splitting: self.draw_sample_splitting() - - self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)} - self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'} - - # initialize all models - self._modellist_0, self._modellist_1 = self._initialize_models() + # initialize all models + self._modellist_0, self._modellist_1 = self._initialize_models() def __str__(self): class_name = self.__class__.__name__ @@ -204,8 +203,8 @@ def smpls(self): The partition used for cross-fitting. """ if self._smpls is None: - err_msg = ('Sample splitting not specified. Draw samples via .draw_sample splitting(). ' + - 'External samples not implemented yet.') + err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' + + 'or set external samples via .set_sample_splitting().') raise ValueError(err_msg) return self._smpls @@ -465,6 +464,74 @@ def draw_sample_splitting(self): n_obs=self._dml_data.n_obs, stratify=self._dml_data.d) self._smpls = obj_dml_resampling.split_samples() + # initialize all models + self._modellist_0, self._modellist_1 = self._initialize_models() + + return self + + def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): + """ + Set the sample splitting for DoubleML models. + + The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition. + + Parameters + ---------- + all_smpls : list or tuple + If nested list of lists of tuples: + The outer list needs to provide an entry per repeated sample splitting (length of list is set as + ``n_rep``). + The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as + ``n_folds``). test_ind must form a partition for each inner list. + If list of tuples: + The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as + ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set. + If tuple: + Must be a tuple with two elements train_ind and test_ind. Only viable option is to set + train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. + ``n_folds=1`` and ``n_rep=1`` is always set. + + all_smpls_cluster : list or None + Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level + of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and + testing lists. Both training and testing contain an array for each cluster variable, which form a partition of + the clusters. + Default is ``None``. + + Returns + ------- + self : object + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.base import clone + >>> np.random.seed(3141) + >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10) + >>> ml_g = learner + >>> ml_m = learner + >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) + >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) + >>> dml_plr_obj.set_sample_splitting(smpls) + >>> # sample splitting with two folds and cross-fitting + >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), + >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] + >>> dml_plr_obj.set_sample_splitting(smpls) + >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2 + >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), + >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], + >>> [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), + >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] + >>> dml_plr_obj.set_sample_splitting(smpls) + """ + self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( + all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data) + + # initialize all models + self._modellist_0, self._modellist_1 = self._initialize_models() return self diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py index 45496dd6c..bdcd695da 100644 --- a/doubleml/irm/tests/test_qte.py +++ b/doubleml/irm/tests/test_qte.py @@ -54,18 +54,36 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde): ml_g = clone(learner) ml_m = clone(learner) + input_args = { + "quantiles": quantiles, + "n_folds": n_folds, + "n_rep": n_rep, + "normalize_ipw": normalize_ipw, + "trimming_threshold": 1e-12, + "kde": kde + } + np.random.seed(42) - dml_qte_obj = dml.DoubleMLQTE(obj_dml_data, - ml_g, ml_m, - quantiles=quantiles, - n_folds=n_folds, - n_rep=n_rep, - normalize_ipw=normalize_ipw, - trimming_threshold=1e-12, - kde=kde) + dml_qte_obj = dml.DoubleMLQTE( + obj_dml_data, + ml_g, ml_m, + **input_args + ) unfitted_qte_model = copy.copy(dml_qte_obj) + np.random.seed(42) dml_qte_obj.fit() + np.random.seed(42) + dml_qte_obj_ext_smpls = dml.DoubleMLQTE( + obj_dml_data, + ml_g, ml_m, + draw_sample_splitting=False, + **input_args + ) + dml_qte_obj_ext_smpls.set_sample_splitting(dml_qte_obj.smpls) + np.random.seed(42) + dml_qte_obj_ext_smpls.fit() + np.random.seed(42) n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) @@ -80,8 +98,10 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde): boot_t_stat=None, joint=False, level=0.95) res_dict = {'coef': dml_qte_obj.coef, 'coef_manual': res_manual['qte'], + 'coef_ext_smpls': dml_qte_obj_ext_smpls.coef, 'se': dml_qte_obj.se, 'se_manual': res_manual['se'], + 'se_ext_smpls': dml_qte_obj_ext_smpls.se, 'boot_methods': boot_methods, 'ci': ci.to_numpy(), 'ci_manual': ci_manual.to_numpy(), @@ -112,6 +132,9 @@ def test_dml_qte_coef(dml_qte_fixture): assert np.allclose(dml_qte_fixture['coef'], dml_qte_fixture['coef_manual'], rtol=1e-9, atol=1e-4) + assert np.allclose(dml_qte_fixture['coef'], + dml_qte_fixture['coef_ext_smpls'], + rtol=1e-9, atol=1e-4) @pytest.mark.ci @@ -119,6 +142,9 @@ def test_dml_qte_se(dml_qte_fixture): assert np.allclose(dml_qte_fixture['se'], dml_qte_fixture['se_manual'], rtol=1e-9, atol=1e-4) + assert np.allclose(dml_qte_fixture['se'], + dml_qte_fixture['se_ext_smpls'], + rtol=1e-9, atol=1e-4) @pytest.mark.ci @@ -148,8 +174,8 @@ def test_doubleml_qte_exceptions(): ml_g = RandomForestClassifier(n_estimators=20) ml_m = RandomForestClassifier(n_estimators=20) - msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\). ' \ - 'External samples not implemented yet.' + msg = ('Sample splitting not specified. ' + r'Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\).') with pytest.raises(ValueError, match=msg): dml_obj = dml.DoubleMLQTE(obj_dml_data, ml_g, ml_m, draw_sample_splitting=False) _ = dml_obj.smpls diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index 7ecadebd4..a5a671103 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -448,7 +448,7 @@ def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_d if all(smpls_are_partitions): n_rep = len(all_smpls) - n_folds = n_folds_each_smpl[0] + n_folds = int(n_folds_each_smpl[0]) smpls = _check_all_smpls(all_smpls, dml_data.n_obs, check_intersect=True) else: raise ValueError('Invalid partition provided. '