DoubleML · SvenKlaassen · Jul 22, 2024 · Jul 19, 2024 · Jul 19, 2024 · Jul 22, 2024
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
@@ -15,8 +15,8 @@
 
 from .utils.resampling import DoubleMLResampling, DoubleMLClusterResampling
 from .utils._estimation import _rmse, _aggregate_coefs_and_ses, _var_est, _set_external_predictions
-from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, _check_is_partition, \
-    _check_all_smpls, _check_smpl_split, _check_smpl_split_tpl, _check_benchmarks, _check_external_predictions
+from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, \
+    _check_benchmarks, _check_external_predictions, _check_sample_splitting
 from .utils._plots import _sensitivity_contour_plot
 from .utils.gain_statistics import gain_statistics
 
@@ -289,11 +289,8 @@ def smpls(self):
         The partition used for cross-fitting.
         """
         if self._smpls is None:
-            if self._is_cluster_data:
-                err_msg = 'Sample splitting not specified. Draw samples via .draw_sample splitting().'
-            else:
-                err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
-                           'or set external samples via .set_sample_splitting().')
+            err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
+                       'or set external samples via .set_sample_splitting().')
             raise ValueError(err_msg)
         return self._smpls
 
@@ -302,9 +299,6 @@ def smpls_cluster(self):
         """
         The partition of clusters used for cross-fitting.
         """
-        if self._is_cluster_data:
-            if self._smpls_cluster is None:
-                raise ValueError('Sample splitting not specified. Draw samples via .draw_sample splitting().')
         return self._smpls_cluster
 
     @property
@@ -1155,7 +1149,7 @@ def draw_sample_splitting(self):
 
         return self
 
-    def set_sample_splitting(self, all_smpls):
+    def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
         """
         Set the sample splitting for DoubleML models.
 
@@ -1177,6 +1171,13 @@ def set_sample_splitting(self, all_smpls):
                 train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting.
                 ``n_folds=1`` and ``n_rep=1`` is always set.
 
+        all_smpls_cluster : list or None
+            Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level
+            of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and
+            testing lists. Both training and testing contain an array for each cluster variable, which form a partition of
+            the clusters.
+            Default is ``None``.
+
         Returns
         -------
         self : object
@@ -1194,8 +1195,6 @@ def set_sample_splitting(self, all_smpls):
         >>> ml_m = learner
         >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5)
         >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m)
-        >>> # simple sample splitting with two folds and without cross-fitting
-        >>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9])
-        >>> dml_plr_obj.set_sample_splitting(smpls)
+        >>> # each tuple is (train_ind, test_ind); test_ind must form a partition across the folds
         >>> # sample splitting with two folds and cross-fitting
         >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
@@ -1208,71 +1207,8 @@ def set_sample_splitting(self, all_smpls):
         >>>           ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
         >>> dml_plr_obj.set_sample_splitting(smpls)
         """
-        if self._is_cluster_data:
-            raise NotImplementedError('Externally setting the sample splitting for DoubleML is '
-                                      'not yet implemented with clustering.')
-        if isinstance(all_smpls, tuple):
-            if not len(all_smpls) == 2:
-                raise ValueError('Invalid partition provided. '
-                                 'Tuple for train_ind and test_ind must consist of exactly two elements.')
-            all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs)
-            if (_check_is_partition([all_smpls], self._dml_data.n_obs) &
-                    _check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)):
-                self._n_rep = 1
-                self._n_folds = 1
-                self._smpls = [[all_smpls]]
-            else:
-                raise ValueError('Invalid partition provided. '
-                                 'Tuple provided that doesn\'t form a partition.')
-        else:
-            if not isinstance(all_smpls, list):
-                raise TypeError('all_smpls must be of list or tuple type. '
-                                f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.')
-            all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls])
-            if all_tuple:
-                if not all([len(tpl) == 2 for tpl in all_smpls]):
-                    raise ValueError('Invalid partition provided. '
-                                     'All tuples for train_ind and test_ind must consist of exactly two elements.')
-                self._n_rep = 1
-                all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs)
-                if _check_is_partition(all_smpls, self._dml_data.n_obs):
-                    if ((len(all_smpls) == 1) &
-                            _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)):
-                        self._n_folds = 1
-                        self._smpls = [all_smpls]
-                    else:
-                        self._n_folds = len(all_smpls)
-                        self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True)
-                else:
-                    raise ValueError('Invalid partition provided. '
-                                     'Tuples provided that don\'t form a partition.')
-            else:
-                all_list = all([isinstance(smpl, list) for smpl in all_smpls])
-                if not all_list:
-                    raise ValueError('Invalid partition provided. '
-                                     'all_smpls is a list where neither all elements are tuples '
-                                     'nor all elements are lists.')
-                all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls])
-                if not all_tuple:
-                    raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.')
-                all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls])
-                if not all_pairs:
-                    raise ValueError('Invalid partition provided. '
-                                     'All tuples for train_ind and test_ind must consist of exactly two elements.')
-                n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls])
-                if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
-                    raise ValueError('Invalid partition provided. '
-                                     'Different number of folds for repeated sample splitting.')
-                all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs)
-                smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls]
-
-                if all(smpls_are_partitions):
-                    self._n_rep = len(all_smpls)
-                    self._n_folds = n_folds_each_smpl[0]
-                    self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True)
-                else:
-                    raise ValueError('Invalid partition provided. '
-                                     'At least one inner list does not form a partition.')
+        self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
+            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data)
 
         self._psi, self._psi_deriv, self._psi_elements, self._var_scaling_factors, \
             self._coef, self._se, self._all_coef, self._all_se = self._initialize_arrays()

diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py
@@ -13,7 +13,7 @@
 
 from ..utils._estimation import _default_kde
 from ..utils.resampling import DoubleMLResampling
-from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment
+from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_sample_splitting
 
 
 class DoubleMLQTE:
@@ -143,16 +143,15 @@ def __init__(self,
             raise TypeError('Normalization indicator has to be boolean. ' +
                             f'Object of type {str(type(self.normalize_ipw))} passed.')
 
+        self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
+        self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
+
         # perform sample splitting
         self._smpls = None
         if draw_sample_splitting:
             self.draw_sample_splitting()
-
-        self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
-        self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
-
-        # initialize all models
-        self._modellist_0, self._modellist_1 = self._initialize_models()
+            # initialize all models
+            self._modellist_0, self._modellist_1 = self._initialize_models()
 
     def __str__(self):
         class_name = self.__class__.__name__
@@ -204,8 +203,8 @@ def smpls(self):
         The partition used for cross-fitting.
         """
         if self._smpls is None:
-            err_msg = ('Sample splitting not specified. Draw samples via .draw_sample splitting(). ' +
-                       'External samples not implemented yet.')
+            err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
+                       'or set external samples via .set_sample_splitting().')
             raise ValueError(err_msg)
         return self._smpls
 
@@ -465,6 +464,74 @@ def draw_sample_splitting(self):
                                                 n_obs=self._dml_data.n_obs,
                                                 stratify=self._dml_data.d)
         self._smpls = obj_dml_resampling.split_samples()
+        # initialize all models
+        self._modellist_0, self._modellist_1 = self._initialize_models()
+
+        return self
+
+    def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
+        """
+        Set the sample splitting for DoubleML models.
+
+        The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition.
+
+        Parameters
+        ----------
+        all_smpls : list or tuple
+            If nested list of lists of tuples:
+                The outer list needs to provide an entry per repeated sample splitting (length of list is set as
+                ``n_rep``).
+                The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as
+                ``n_folds``). test_ind must form a partition for each inner list.
+            If list of tuples:
+                The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as
+                ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set.
+            If tuple:
+                Must be a tuple with two elements train_ind and test_ind. Only viable option is to set
+                train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting.
+                ``n_folds=1`` and ``n_rep=1`` is always set.
+
+        all_smpls_cluster : list or None
+            Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level
+            of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and
+            testing lists. Both training and testing contain an array for each cluster variable, which form a partition of
+            the clusters.
+            Default is ``None``.
+
+        Returns
+        -------
+        self : object
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import doubleml as dml
+        >>> from doubleml.datasets import make_plr_CCDDHNR2018
+        >>> from sklearn.ensemble import RandomForestRegressor
+        >>> from sklearn.base import clone
+        >>> np.random.seed(3141)
+        >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10)
+        >>> ml_g = learner
+        >>> ml_m = learner
+        >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5)
+        >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m)
+        >>> # each tuple is (train_ind, test_ind); test_ind must form a partition across the folds
+        >>> # sample splitting with two folds and cross-fitting
+        >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+        >>>          ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])]
+        >>> dml_plr_obj.set_sample_splitting(smpls)
+        >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2
+        >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+        >>>           ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+        >>>          [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
+        >>>           ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+        >>> dml_plr_obj.set_sample_splitting(smpls)
+        """
+        self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
+            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data)
+
+        # initialize all models
+        self._modellist_0, self._modellist_1 = self._initialize_models()
 
         return self
 

diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py
@@ -54,18 +54,36 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde):
     ml_g = clone(learner)
     ml_m = clone(learner)
 
+    input_args = {
+        "quantiles": quantiles,
+        "n_folds": n_folds,
+        "n_rep": n_rep,
+        "normalize_ipw": normalize_ipw,
+        "trimming_threshold": 1e-12,
+        "kde": kde
+    }
+
     np.random.seed(42)
-    dml_qte_obj = dml.DoubleMLQTE(obj_dml_data,
-                                  ml_g, ml_m,
-                                  quantiles=quantiles,
-                                  n_folds=n_folds,
-                                  n_rep=n_rep,
-                                  normalize_ipw=normalize_ipw,
-                                  trimming_threshold=1e-12,
-                                  kde=kde)
+    dml_qte_obj = dml.DoubleMLQTE(
+        obj_dml_data,
+        ml_g, ml_m,
+        **input_args
+    )
     unfitted_qte_model = copy.copy(dml_qte_obj)
+    np.random.seed(42)
     dml_qte_obj.fit()
 
+    np.random.seed(42)
+    dml_qte_obj_ext_smpls = dml.DoubleMLQTE(
+        obj_dml_data,
+        ml_g, ml_m,
+        draw_sample_splitting=False,
+        **input_args
+    )
+    dml_qte_obj_ext_smpls.set_sample_splitting(dml_qte_obj.smpls)
+    np.random.seed(42)
+    dml_qte_obj_ext_smpls.fit()
+
     np.random.seed(42)
     n_obs = len(y)
     all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d)
@@ -80,8 +98,10 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde):
                             boot_t_stat=None, joint=False, level=0.95)
     res_dict = {'coef': dml_qte_obj.coef,
                 'coef_manual': res_manual['qte'],
+                'coef_ext_smpls': dml_qte_obj_ext_smpls.coef,
                 'se': dml_qte_obj.se,
                 'se_manual': res_manual['se'],
+                'se_ext_smpls': dml_qte_obj_ext_smpls.se,
                 'boot_methods': boot_methods,
                 'ci': ci.to_numpy(),
                 'ci_manual': ci_manual.to_numpy(),
@@ -112,13 +132,19 @@ def test_dml_qte_coef(dml_qte_fixture):
     assert np.allclose(dml_qte_fixture['coef'],
                        dml_qte_fixture['coef_manual'],
                        rtol=1e-9, atol=1e-4)
+    assert np.allclose(dml_qte_fixture['coef'],
+                       dml_qte_fixture['coef_ext_smpls'],
+                       rtol=1e-9, atol=1e-4)
 
 
 @pytest.mark.ci
 def test_dml_qte_se(dml_qte_fixture):
     assert np.allclose(dml_qte_fixture['se'],
                        dml_qte_fixture['se_manual'],
                        rtol=1e-9, atol=1e-4)
+    assert np.allclose(dml_qte_fixture['se'],
+                       dml_qte_fixture['se_ext_smpls'],
+                       rtol=1e-9, atol=1e-4)
 
 
 @pytest.mark.ci
@@ -148,8 +174,8 @@ def test_doubleml_qte_exceptions():
     ml_g = RandomForestClassifier(n_estimators=20)
     ml_m = RandomForestClassifier(n_estimators=20)
 
-    msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\). ' \
-          'External samples not implemented yet.'
+    msg = ('Sample splitting not specified. '
+           r'Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\).')
     with pytest.raises(ValueError, match=msg):
         dml_obj = dml.DoubleMLQTE(obj_dml_data, ml_g, ml_m, draw_sample_splitting=False)
         _ = dml_obj.smpls