From c61b1e4970d975402a4a19b95f4797c0de28e751 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Fri, 19 Jul 2024 08:27:50 +0200
Subject: [PATCH 1/5] update set_sample_splitting documentation

---
 doubleml/double_ml.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 5fe6decf2..00d64d8f8 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -1194,8 +1194,6 @@ def set_sample_splitting(self, all_smpls):
         >>> ml_m = learner
         >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5)
         >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m)
-        >>> # simple sample splitting with two folds and without cross-fitting
-        >>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9])
         >>> dml_plr_obj.set_sample_splitting(smpls)
         >>> # sample splitting with two folds and cross-fitting
         >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),

From 02abf21e0e5c6ab620c6c5c48ec3ce8c9e8a057d Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:38:36 +0200
Subject: [PATCH 2/5] add set_sample_splitting to _checks to simplify doubleml
 class

---
 doubleml/double_ml.py     | 71 +++------------------------------------
 doubleml/utils/_checks.py | 70 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 67 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 00d64d8f8..a77cc7b06 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -15,8 +15,8 @@
 
 from .utils.resampling import DoubleMLResampling, DoubleMLClusterResampling
 from .utils._estimation import _rmse, _aggregate_coefs_and_ses, _var_est, _set_external_predictions
-from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, _check_is_partition, \
-    _check_all_smpls, _check_smpl_split, _check_smpl_split_tpl, _check_benchmarks, _check_external_predictions
+from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, \
+    _check_benchmarks, _check_external_predictions, _check_sample_splitting
 from .utils._plots import _sensitivity_contour_plot
 from .utils.gain_statistics import gain_statistics
 
@@ -1206,71 +1206,8 @@ def set_sample_splitting(self, all_smpls):
         >>>           ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
         >>> dml_plr_obj.set_sample_splitting(smpls)
         """
-        if self._is_cluster_data:
-            raise NotImplementedError('Externally setting the sample splitting for DoubleML is '
-                                      'not yet implemented with clustering.')
-        if isinstance(all_smpls, tuple):
-            if not len(all_smpls) == 2:
-                raise ValueError('Invalid partition provided. '
-                                 'Tuple for train_ind and test_ind must consist of exactly two elements.')
-            all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs)
-            if (_check_is_partition([all_smpls], self._dml_data.n_obs) &
-                    _check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)):
-                self._n_rep = 1
-                self._n_folds = 1
-                self._smpls = [[all_smpls]]
-            else:
-                raise ValueError('Invalid partition provided. '
-                                 'Tuple provided that doesn\'t form a partition.')
-        else:
-            if not isinstance(all_smpls, list):
-                raise TypeError('all_smpls must be of list or tuple type. '
-                                f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.')
-            all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls])
-            if all_tuple:
-                if not all([len(tpl) == 2 for tpl in all_smpls]):
-                    raise ValueError('Invalid partition provided. '
-                                     'All tuples for train_ind and test_ind must consist of exactly two elements.')
-                self._n_rep = 1
-                all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs)
-                if _check_is_partition(all_smpls, self._dml_data.n_obs):
-                    if ((len(all_smpls) == 1) &
-                            _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)):
-                        self._n_folds = 1
-                        self._smpls = [all_smpls]
-                    else:
-                        self._n_folds = len(all_smpls)
-                        self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True)
-                else:
-                    raise ValueError('Invalid partition provided. '
-                                     'Tuples provided that don\'t form a partition.')
-            else:
-                all_list = all([isinstance(smpl, list) for smpl in all_smpls])
-                if not all_list:
-                    raise ValueError('Invalid partition provided. '
-                                     'all_smpls is a list where neither all elements are tuples '
-                                     'nor all elements are lists.')
-                all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls])
-                if not all_tuple:
-                    raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.')
-                all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls])
-                if not all_pairs:
-                    raise ValueError('Invalid partition provided. '
-                                     'All tuples for train_ind and test_ind must consist of exactly two elements.')
-                n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls])
-                if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
-                    raise ValueError('Invalid partition provided. '
-                                     'Different number of folds for repeated sample splitting.')
-                all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs)
-                smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls]
-
-                if all(smpls_are_partitions):
-                    self._n_rep = len(all_smpls)
-                    self._n_folds = n_folds_each_smpl[0]
-                    self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True)
-                else:
-                    raise ValueError('Invalid partition provided. '
-                                     'At least one inner list does not form a partition.')
+        self._smpls, self._n_rep, self._n_folds = _check_sample_splitting(
+            all_smpls, self._dml_data.n_obs, self._is_cluster_data)
 
         self._psi, self._psi_deriv, self._psi_elements, self._var_scaling_factors, \
             self._coef, self._se, self._all_coef, self._all_se = self._initialize_arrays()
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index aa736a0af..e153b3992 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -351,3 +351,73 @@ def _check_framework_compatibility(dml_framework_1, dml_framework_2, check_treat
 
 def _check_set(x):
     return {x} if x is not None else {}
+
+
+def _check_sample_splitting(all_smpls, n_obs, is_cluster_data):
+    if is_cluster_data:
+        raise NotImplementedError('Externally setting the sample splitting for DoubleML is '
+                                  'not yet implemented with clustering.')
+    if isinstance(all_smpls, tuple):
+        if not len(all_smpls) == 2:
+            raise ValueError('Invalid partition provided. '
+                             'Tuple for train_ind and test_ind must consist of exactly two elements.')
+        all_smpls = _check_smpl_split_tpl(all_smpls, n_obs)
+        if (_check_is_partition([all_smpls], n_obs) &
+                _check_is_partition([(all_smpls[1], all_smpls[0])], n_obs)):
+            n_rep = 1
+            n_folds = 1
+            smpls = [[all_smpls]]
+        else:
+            raise ValueError('Invalid partition provided. '
+                             'Tuple provided that doesn\'t form a partition.')
+    else:
+        if not isinstance(all_smpls, list):
+            raise TypeError('all_smpls must be of list or tuple type. '
+                            f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.')
+        all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls])
+        if all_tuple:
+            if not all([len(tpl) == 2 for tpl in all_smpls]):
+                raise ValueError('Invalid partition provided. '
+                                 'All tuples for train_ind and test_ind must consist of exactly two elements.')
+            n_rep = 1
+            all_smpls = _check_smpl_split(all_smpls, n_obs)
+            if _check_is_partition(all_smpls, n_obs):
+                if ((len(all_smpls) == 1) &
+                        _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], n_obs)):
+                    n_folds = 1
+                    smpls = [all_smpls]
+                else:
+                    n_folds = len(all_smpls)
+                    smpls = _check_all_smpls([all_smpls], n_obs, check_intersect=True)
+            else:
+                raise ValueError('Invalid partition provided. '
+                                 'Tuples provided that don\'t form a partition.')
+        else:
+            all_list = all([isinstance(smpl, list) for smpl in all_smpls])
+            if not all_list:
+                raise ValueError('Invalid partition provided. '
+                                 'all_smpls is a list where neither all elements are tuples '
+                                 'nor all elements are lists.')
+            all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls])
+            if not all_tuple:
+                raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.')
+            all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls])
+            if not all_pairs:
+                raise ValueError('Invalid partition provided. '
+                                 'All tuples for train_ind and test_ind must consist of exactly two elements.')
+            n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls])
+            if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
+                raise ValueError('Invalid partition provided. '
+                                 'Different number of folds for repeated sample splitting.')
+            all_smpls = _check_all_smpls(all_smpls, n_obs)
+            smpls_are_partitions = [_check_is_partition(smpl, n_obs) for smpl in all_smpls]
+
+            if all(smpls_are_partitions):
+                n_rep = len(all_smpls)
+                n_folds = n_folds_each_smpl[0]
+                smpls = _check_all_smpls(all_smpls, n_obs, check_intersect=True)
+            else:
+                raise ValueError('Invalid partition provided. '
+                                 'At least one inner list does not form a partition.')
+
+    return smpls, n_rep, n_folds

From 2ba36fde97e387c31d9b15197f7e595ca2d94305 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Mon, 22 Jul 2024 10:00:36 +0200
Subject: [PATCH 3/5] add set_sample_splitting for doubleml with cluster data
 and exception tests

---
 doubleml/double_ml.py             | 13 ++++--
 doubleml/tests/test_exceptions.py | 45 +++++++++++++++++---
 doubleml/utils/_checks.py         | 69 ++++++++++++++++++++++++-------
 3 files changed, 103 insertions(+), 24 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index a77cc7b06..dde61d285 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -1155,7 +1155,7 @@ def draw_sample_splitting(self):
 
         return self
 
-    def set_sample_splitting(self, all_smpls):
+    def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
         """
         Set the sample splitting for DoubleML models.
 
@@ -1177,6 +1177,13 @@ def set_sample_splitting(self, all_smpls):
                 train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting.
                 ``n_folds=1`` and ``n_rep=1`` is always set.
 
+        all_smpls_cluster : list or None
+            Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level
+            of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and
+            testing lists. Both the training and the testing list contain one array per cluster variable; for each
+            cluster variable, the training and testing arrays together form a partition of the clusters.
+            Required for cluster data. Default is ``None``.
+
         Returns
         -------
         self : object
@@ -1206,8 +1213,8 @@ def set_sample_splitting(self, all_smpls):
         >>>           ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
         >>> dml_plr_obj.set_sample_splitting(smpls)
         """
-        self._smpls, self._n_rep, self._n_folds = _check_sample_splitting(
-            all_smpls, self._dml_data.n_obs, self._is_cluster_data)
+        self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
+            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data)
 
         self._psi, self._psi_deriv, self._psi_elements, self._var_scaling_factors, \
             self._coef, self._se, self._all_coef, self._all_se = self._initialize_arrays()
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index 84ac55a14..4bd11d939 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -1,6 +1,7 @@
 import pytest
 import pandas as pd
 import numpy as np
+import copy
 
 from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, \
     DoubleMLClusterData, DoubleMLPQ, DoubleMLLPQ, DoubleMLCVAR, DoubleMLQTE, DoubleMLDID, \
@@ -652,6 +653,44 @@ def test_doubleml_exception_smpls():
     with pytest.raises(ValueError, match=msg):
         _ = dml_pliv_cluster_no_smpls.smpls
 
+    dml_pliv_cluster = DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r)
+    smpls = dml_plr.smpls
+    msg = ('For cluster data, all_smpls_cluster must be provided.')
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_pliv_cluster.set_sample_splitting(smpls)
+
+    all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster)
+    all_smpls_cluster.append(all_smpls_cluster[0])
+    msg = ('Invalid samples provided. Number of repetitions for all_smpls and all_smpls_cluster must be the same.')
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_pliv_cluster.set_sample_splitting(
+            all_smpls=dml_pliv_cluster.smpls,
+            all_smpls_cluster=all_smpls_cluster)
+
+    all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster)
+    all_smpls_cluster[0] = all_smpls_cluster[0][0]
+    msg = ('Invalid samples provided. Number of folds for all_smpls and all_smpls_cluster must be the same.')
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_pliv_cluster.set_sample_splitting(
+            all_smpls=dml_pliv_cluster.smpls,
+            all_smpls_cluster=all_smpls_cluster)
+
+    all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster)
+    all_smpls_cluster[0][0][1][1] = np.append(all_smpls_cluster[0][0][1][1], [11], axis=0)
+    msg = ('Invalid cluster partition provided. At least one inner list does not form a partition.')
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_pliv_cluster.set_sample_splitting(
+            all_smpls=dml_pliv_cluster.smpls,
+            all_smpls_cluster=all_smpls_cluster)
+
+    all_smpls_cluster = copy.deepcopy(dml_pliv_cluster.smpls_cluster)
+    all_smpls_cluster[0][0][1][1][1] = 11
+    msg = ('Invalid cluster partition provided. At least one inner list does not form a partition.')
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_pliv_cluster.set_sample_splitting(
+            all_smpls=dml_pliv_cluster.smpls,
+            all_smpls_cluster=all_smpls_cluster)
+
 
 @pytest.mark.ci
 def test_doubleml_exception_fit():
@@ -1212,12 +1251,6 @@ def test_doubleml_cluster_not_yet_implemented():
     with pytest.raises(NotImplementedError, match=msg):
         _ = dml_pliv_cluster.bootstrap()
 
-    smpls = dml_plr.smpls
-    msg = ('Externally setting the sample splitting for DoubleML is '
-           'not yet implemented with clustering.')
-    with pytest.raises(NotImplementedError, match=msg):
-        _ = dml_pliv_cluster.set_sample_splitting(smpls)
-
     df = dml_cluster_data_pliv.data.copy()
     df['cluster_var_k'] = df['cluster_var_i'] + df['cluster_var_j'] - 2
     dml_cluster_data_multiway = DoubleMLClusterData(df, y_col='Y', d_cols='D', x_cols=['X1', 'X5'], z_cols='Z',
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index e153b3992..7ecadebd4 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -353,17 +353,51 @@ def _check_set(x):
     return {x} if x is not None else {}
 
 
-def _check_sample_splitting(all_smpls, n_obs, is_cluster_data):
-    if is_cluster_data:
-        raise NotImplementedError('Externally setting the sample splitting for DoubleML is '
-                                  'not yet implemented with clustering.')
+def _check_cluster_partitions(smpls, values):
+    """Return whether the concatenated arrays in ``smpls`` contain exactly the entries of ``values``."""
+    test_indices = np.concatenate([test_index for test_index in smpls])
+    if len(test_indices) != len(values):
+        return False
+    # lengths agree here, so equality after sorting means the same entries up to ordering
+    return not np.any(np.sort(test_indices) != np.sort(values))
+
+
+def _check_cluster_sample_splitting(all_smpls_cluster, dml_data, n_rep, n_folds):
+    """Validate externally provided cluster splits against ``n_rep``, ``n_folds`` and the cluster variables.
+
+    Returns ``all_smpls_cluster`` unchanged; raises ``ValueError`` if it is ``None``, if the number of
+    repetitions or folds differs, or if train and test clusters of any fold do not cover all clusters.
+    """
+    if all_smpls_cluster is None:
+        raise ValueError('For cluster data, all_smpls_cluster must be provided.')
+    if len(all_smpls_cluster) != n_rep:
+        raise ValueError('Invalid samples provided. '
+                         'Number of repetitions for all_smpls and all_smpls_cluster must be the same.')
+
+    for i_rep in range(n_rep):
+        if len(all_smpls_cluster[i_rep]) != n_folds:
+            raise ValueError('Invalid samples provided. '
+                             'Number of folds for all_smpls and all_smpls_cluster must be the same.')
+        for i_cluster in range(dml_data.n_cluster_vars):
+            clusters = np.unique(dml_data.cluster_vars[:, i_cluster])
+            # check every fold of this repetition (not only the first fold of the first repetition)
+            for smpl_cluster in all_smpls_cluster[i_rep]:
+                cluster_partition = [smpl_cluster[0][i_cluster], smpl_cluster[1][i_cluster]]
+                if not _check_cluster_partitions(cluster_partition, clusters):
+                    raise ValueError('Invalid cluster partition provided. '
+                                     'At least one inner list does not form a partition.')
+    return all_smpls_cluster
+
+
+def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_data):
+
     if isinstance(all_smpls, tuple):
         if not len(all_smpls) == 2:
             raise ValueError('Invalid partition provided. '
                              'Tuple for train_ind and test_ind must consist of exactly two elements.')
-        all_smpls = _check_smpl_split_tpl(all_smpls, n_obs)
-        if (_check_is_partition([all_smpls], n_obs) &
-                _check_is_partition([(all_smpls[1], all_smpls[0])], n_obs)):
+        all_smpls = _check_smpl_split_tpl(all_smpls, dml_data.n_obs)
+        if (_check_is_partition([all_smpls], dml_data.n_obs) &
+                _check_is_partition([(all_smpls[1], all_smpls[0])], dml_data.n_obs)):
             n_rep = 1
             n_folds = 1
             smpls = [[all_smpls]]
@@ -380,15 +414,15 @@ def _check_sample_splitting(all_smpls, n_obs, is_cluster_data):
                 raise ValueError('Invalid partition provided. '
                                  'All tuples for train_ind and test_ind must consist of exactly two elements.')
             n_rep = 1
-            all_smpls = _check_smpl_split(all_smpls, n_obs)
-            if _check_is_partition(all_smpls, n_obs):
+            all_smpls = _check_smpl_split(all_smpls, dml_data.n_obs)
+            if _check_is_partition(all_smpls, dml_data.n_obs):
                 if ((len(all_smpls) == 1) &
-                        _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], n_obs)):
+                        _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], dml_data.n_obs)):
                     n_folds = 1
                     smpls = [all_smpls]
                 else:
                     n_folds = len(all_smpls)
-                    smpls = _check_all_smpls([all_smpls], n_obs, check_intersect=True)
+                    smpls = _check_all_smpls([all_smpls], dml_data.n_obs, check_intersect=True)
             else:
                 raise ValueError('Invalid partition provided. '
                                  'Tuples provided that don\'t form a partition.')
@@ -409,15 +443,20 @@ def _check_sample_splitting(all_smpls, n_obs, is_cluster_data):
             if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
                 raise ValueError('Invalid partition provided. '
                                  'Different number of folds for repeated sample splitting.')
-            all_smpls = _check_all_smpls(all_smpls, n_obs)
-            smpls_are_partitions = [_check_is_partition(smpl, n_obs) for smpl in all_smpls]
+            all_smpls = _check_all_smpls(all_smpls, dml_data.n_obs)
+            smpls_are_partitions = [_check_is_partition(smpl, dml_data.n_obs) for smpl in all_smpls]
 
             if all(smpls_are_partitions):
                 n_rep = len(all_smpls)
                 n_folds = n_folds_each_smpl[0]
-                smpls = _check_all_smpls(all_smpls, n_obs, check_intersect=True)
+                smpls = _check_all_smpls(all_smpls, dml_data.n_obs, check_intersect=True)
             else:
                 raise ValueError('Invalid partition provided. '
                                  'At least one inner list does not form a partition.')
 
-    return smpls, n_rep, n_folds
+    if is_cluster_data:
+        smpls_cluster = _check_cluster_sample_splitting(all_smpls_cluster, dml_data, n_rep, n_folds)
+    else:
+        smpls_cluster = None
+
+    return smpls, smpls_cluster, n_rep, n_folds

From b33de373ff48467d02bab3805be140705ef454a3 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Mon, 22 Jul 2024 10:13:28 +0200
Subject: [PATCH 4/5] add tests for set_sample_splitting and cluster data

---
 doubleml/double_ml.py                   | 10 +---
 doubleml/tests/test_exceptions.py       |  3 -
 doubleml/tests/test_multiway_cluster.py | 74 ++++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index dde61d285..e079c9a61 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -289,11 +289,8 @@ def smpls(self):
         The partition used for cross-fitting.
         """
         if self._smpls is None:
-            if self._is_cluster_data:
-                err_msg = 'Sample splitting not specified. Draw samples via .draw_sample splitting().'
-            else:
-                err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
-                           'or set external samples via .set_sample_splitting().')
+            err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
+                       'or set external samples via .set_sample_splitting().')
             raise ValueError(err_msg)
         return self._smpls
 
@@ -302,9 +299,6 @@ def smpls_cluster(self):
         """
         The partition of clusters used for cross-fitting.
         """
-        if self._is_cluster_data:
-            if self._smpls_cluster is None:
-                raise ValueError('Sample splitting not specified. Draw samples via .draw_sample splitting().')
         return self._smpls_cluster
 
     @property
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index 4bd11d939..8743b45e6 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -646,10 +646,7 @@ def test_doubleml_exception_smpls():
     dml_plr_no_smpls = DoubleMLPLR(dml_data, ml_l, ml_m, draw_sample_splitting=False)
     with pytest.raises(ValueError, match=msg):
         _ = dml_plr_no_smpls.smpls
-    msg = 'Sample splitting not specified. Draw samples via .draw_sample splitting().'
     dml_pliv_cluster_no_smpls = DoubleMLPLIV(dml_cluster_data_pliv, ml_l, ml_m, ml_r, draw_sample_splitting=False)
-    with pytest.raises(ValueError, match=msg):
-        _ = dml_pliv_cluster_no_smpls.smpls_cluster
     with pytest.raises(ValueError, match=msg):
         _ = dml_pliv_cluster_no_smpls.smpls
 
diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py
index 1dbff111f..85980f8f0 100644
--- a/doubleml/tests/test_multiway_cluster.py
+++ b/doubleml/tests/test_multiway_cluster.py
@@ -68,6 +68,21 @@ def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner, score):
     np.random.seed(3141)
     dml_pliv_obj.fit()
 
+    dml_pliv_obj_ext_smpls = dml.DoubleMLPLIV(
+        obj_dml_cluster_data,
+        ml_l, ml_m, ml_r, ml_g,
+        n_folds=n_folds,
+        n_rep=n_rep,
+        score=score,
+        draw_sample_splitting=False)
+
+    dml_pliv_obj_ext_smpls.set_sample_splitting(
+        all_smpls=dml_pliv_obj.smpls,
+        all_smpls_cluster=dml_pliv_obj.smpls_cluster)
+
+    np.random.seed(3141)
+    dml_pliv_obj_ext_smpls.fit()
+
     np.random.seed(3141)
     y = obj_dml_cluster_data.y
     x = obj_dml_cluster_data.x
@@ -126,7 +141,9 @@ def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner, score):
     res_dict = {'coef': dml_pliv_obj.coef,
                 'se': dml_pliv_obj.se,
                 'coef_manual': theta,
-                'se_manual': se}
+                'se_manual': se,
+                'coef_ext_smpls': dml_pliv_obj_ext_smpls.coef,
+                'se_ext_smpls': dml_pliv_obj_ext_smpls.se}
 
     return res_dict
 
@@ -136,6 +153,9 @@ def test_dml_pliv_multiway_cluster_coef(dml_pliv_multiway_cluster_fixture):
     assert math.isclose(dml_pliv_multiway_cluster_fixture['coef'][0],
                         dml_pliv_multiway_cluster_fixture['coef_manual'],
                         rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_pliv_multiway_cluster_fixture['coef'][0],
+                        dml_pliv_multiway_cluster_fixture['coef_ext_smpls'][0],
+                        rel_tol=1e-9, abs_tol=1e-4)
 
 
 @pytest.mark.ci
@@ -143,6 +163,9 @@ def test_dml_pliv_multiway_cluster_se(dml_pliv_multiway_cluster_fixture):
     assert math.isclose(dml_pliv_multiway_cluster_fixture['se'][0],
                         dml_pliv_multiway_cluster_fixture['se_manual'],
                         rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_pliv_multiway_cluster_fixture['se'][0],
+                        dml_pliv_multiway_cluster_fixture['se_ext_smpls'][0],
+                        rel_tol=1e-9, abs_tol=1e-4)
 
 
 @pytest.fixture(scope='module')
@@ -167,6 +190,20 @@ def dml_pliv_oneway_cluster_fixture(generate_data_iv, learner, score):
     np.random.seed(3141)
     dml_pliv_obj.fit()
 
+    dml_pliv_obj_ext_smpls = dml.DoubleMLPLIV(
+        obj_dml_oneway_cluster_data,
+        ml_l, ml_m, ml_r, ml_g,
+        n_folds=n_folds,
+        score=score,
+        draw_sample_splitting=False)
+
+    dml_pliv_obj_ext_smpls.set_sample_splitting(
+        all_smpls=dml_pliv_obj.smpls,
+        all_smpls_cluster=dml_pliv_obj.smpls_cluster)
+
+    np.random.seed(3141)
+    dml_pliv_obj_ext_smpls.fit()
+
     np.random.seed(3141)
     y = obj_dml_oneway_cluster_data.y
     x = obj_dml_oneway_cluster_data.x
@@ -210,7 +247,9 @@ def dml_pliv_oneway_cluster_fixture(generate_data_iv, learner, score):
     res_dict = {'coef': dml_pliv_obj.coef,
                 'se': dml_pliv_obj.se,
                 'coef_manual': theta,
-                'se_manual': se}
+                'se_manual': se,
+                'coef_ext_smpls': dml_pliv_obj_ext_smpls.coef,
+                'se_ext_smpls': dml_pliv_obj_ext_smpls.se}
 
     return res_dict
 
@@ -220,6 +259,9 @@ def test_dml_pliv_oneway_cluster_coef(dml_pliv_oneway_cluster_fixture):
     assert math.isclose(dml_pliv_oneway_cluster_fixture['coef'][0],
                         dml_pliv_oneway_cluster_fixture['coef_manual'],
                         rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_pliv_oneway_cluster_fixture['coef'][0],
+                        dml_pliv_oneway_cluster_fixture['coef_ext_smpls'][0],
+                        rel_tol=1e-9, abs_tol=1e-4)
 
 
 @pytest.mark.ci
@@ -227,6 +269,9 @@ def test_dml_pliv_oneway_cluster_se(dml_pliv_oneway_cluster_fixture):
     assert math.isclose(dml_pliv_oneway_cluster_fixture['se'][0],
                         dml_pliv_oneway_cluster_fixture['se_manual'],
                         rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_pliv_oneway_cluster_fixture['se'][0],
+                        dml_pliv_oneway_cluster_fixture['se_ext_smpls'][0],
+                        rel_tol=1e-9, abs_tol=1e-4)
 
 
 @pytest.fixture(scope="module")
@@ -247,6 +292,7 @@ def dml_plr_cluster_with_index(generate_data1, learner):
     dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                   ml_l, ml_m,
                                   n_folds=n_folds)
+    np.random.seed(3141)
     dml_plr_obj.fit()
 
     df = data.reset_index()
@@ -259,12 +305,28 @@ def dml_plr_cluster_with_index(generate_data1, learner):
     dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data,
                                           ml_l, ml_m,
                                           n_folds=n_folds)
+    np.random.seed(3141)
     dml_plr_cluster_obj.fit()
 
+    dml_plr_cluster_ext_smpls = dml.DoubleMLPLR(
+        dml_cluster_data,
+        ml_l, ml_m,
+        n_folds=n_folds,
+        draw_sample_splitting=False)
+
+    dml_plr_cluster_ext_smpls.set_sample_splitting(
+        all_smpls=dml_plr_cluster_obj.smpls,
+        all_smpls_cluster=dml_plr_cluster_obj.smpls_cluster)
+
+    np.random.seed(3141)
+    dml_plr_cluster_ext_smpls.fit()
+
     res_dict = {'coef': dml_plr_obj.coef,
                 'coef_manual': dml_plr_cluster_obj.coef,
                 'se': dml_plr_obj.se,
-                'se_manual': dml_plr_cluster_obj.se}
+                'se_manual': dml_plr_cluster_obj.se,
+                'coef_ext_smpls': dml_plr_cluster_ext_smpls.coef,
+                'se_ext_smpls': dml_plr_cluster_ext_smpls.se}
 
     return res_dict
 
@@ -274,6 +336,9 @@ def test_dml_plr_cluster_with_index_coef(dml_plr_cluster_with_index):
     assert math.isclose(dml_plr_cluster_with_index['coef'][0],
                         dml_plr_cluster_with_index['coef_manual'][0],
                         rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_plr_cluster_with_index['coef'][0],
+                        dml_plr_cluster_with_index['coef_ext_smpls'][0],
+                        rel_tol=1e-9, abs_tol=1e-4)
 
 
 @pytest.mark.ci
@@ -281,3 +346,6 @@ def test_dml_plr_cluster_with_index_se(dml_plr_cluster_with_index):
     assert math.isclose(dml_plr_cluster_with_index['se'][0],
                         dml_plr_cluster_with_index['se_manual'][0],
                         rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_plr_cluster_with_index['se'][0],
+                        dml_plr_cluster_with_index['se_ext_smpls'][0],
+                        rel_tol=1e-9, abs_tol=1e-4)

From 939fc8b3e623df2c75c72c5eeef6152a80f77615 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Mon, 22 Jul 2024 10:40:19 +0200
Subject: [PATCH 5/5] add set_sample_splitting to qte

---
 doubleml/irm/qte.py            | 85 ++++++++++++++++++++++++++++++----
 doubleml/irm/tests/test_qte.py | 46 ++++++++++++++----
 doubleml/utils/_checks.py      |  2 +-
 3 files changed, 113 insertions(+), 20 deletions(-)

diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py
index 9fd220f19..17894a600 100644
--- a/doubleml/irm/qte.py
+++ b/doubleml/irm/qte.py
@@ -13,7 +13,7 @@
 
 from ..utils._estimation import _default_kde
 from ..utils.resampling import DoubleMLResampling
-from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment
+from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_sample_splitting
 
 
 class DoubleMLQTE:
@@ -143,16 +143,15 @@ def __init__(self,
             raise TypeError('Normalization indicator has to be boolean. ' +
                             f'Object of type {str(type(self.normalize_ipw))} passed.')
 
+        self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
+        self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
+
         # perform sample splitting
         self._smpls = None
         if draw_sample_splitting:
             self.draw_sample_splitting()
-
-        self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
-        self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
-
-        # initialize all models
-        self._modellist_0, self._modellist_1 = self._initialize_models()
+            # initialize all models
+            self._modellist_0, self._modellist_1 = self._initialize_models()
 
     def __str__(self):
         class_name = self.__class__.__name__
@@ -204,8 +203,8 @@ def smpls(self):
         The partition used for cross-fitting.
         """
         if self._smpls is None:
-            err_msg = ('Sample splitting not specified. Draw samples via .draw_sample splitting(). ' +
-                       'External samples not implemented yet.')
+            err_msg = ('Sample splitting not specified. Either draw samples via .draw_sample splitting() ' +
+                       'or set external samples via .set_sample_splitting().')
             raise ValueError(err_msg)
         return self._smpls
 
@@ -465,6 +464,74 @@ def draw_sample_splitting(self):
                                                 n_obs=self._dml_data.n_obs,
                                                 stratify=self._dml_data.d)
         self._smpls = obj_dml_resampling.split_samples()
+        # initialize all models
+        self._modellist_0, self._modellist_1 = self._initialize_models()
+
+        return self
+
+    def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
+        """
+        Set the sample splitting for DoubleML models.
+
+        The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition.
+
+        Parameters
+        ----------
+        all_smpls : list or tuple
+            If nested list of lists of tuples:
+                The outer list needs to provide an entry per repeated sample splitting (length of list is set as
+                ``n_rep``).
+                The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as
+                ``n_folds``). test_ind must form a partition for each inner list.
+            If list of tuples:
+                The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as
+                ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set.
+            If tuple:
+                Must be a tuple with two elements train_ind and test_ind. Only viable option is to set
+                train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting.
+                ``n_folds=1`` and ``n_rep=1`` is always set.
+
+        all_smpls_cluster : list or None
+            Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level
+            of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and
+            testing lists. Both training and testing contain an array for each cluster variable, which form a partition of
+            the clusters.
+            Default is ``None``.
+
+        Returns
+        -------
+        self : object
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import doubleml as dml
+        >>> from doubleml.datasets import make_plr_CCDDHNR2018
+        >>> from sklearn.ensemble import RandomForestRegressor
+        >>> from sklearn.base import clone
+        >>> np.random.seed(3141)
+        >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10)
+        >>> ml_g = learner
+        >>> ml_m = learner
+        >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5)
+        >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m)
+        >>> # sample splitting with two folds and cross-fitting
+        >>> # (one (train_ind, test_ind) tuple per fold)
+        >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+        >>>          ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])]
+        >>> dml_plr_obj.set_sample_splitting(smpls)
+        >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2
+        >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+        >>>           ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+        >>>          [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
+        >>>           ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+        >>> dml_plr_obj.set_sample_splitting(smpls)
+        """
+        self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
+            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data)
+
+        # initialize all models
+        self._modellist_0, self._modellist_1 = self._initialize_models()
 
         return self
 
diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py
index 45496dd6c..bdcd695da 100644
--- a/doubleml/irm/tests/test_qte.py
+++ b/doubleml/irm/tests/test_qte.py
@@ -54,18 +54,36 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde):
     ml_g = clone(learner)
     ml_m = clone(learner)
 
+    input_args = {
+        "quantiles": quantiles,
+        "n_folds": n_folds,
+        "n_rep": n_rep,
+        "normalize_ipw": normalize_ipw,
+        "trimming_threshold": 1e-12,
+        "kde": kde
+    }
+
     np.random.seed(42)
-    dml_qte_obj = dml.DoubleMLQTE(obj_dml_data,
-                                  ml_g, ml_m,
-                                  quantiles=quantiles,
-                                  n_folds=n_folds,
-                                  n_rep=n_rep,
-                                  normalize_ipw=normalize_ipw,
-                                  trimming_threshold=1e-12,
-                                  kde=kde)
+    dml_qte_obj = dml.DoubleMLQTE(
+        obj_dml_data,
+        ml_g, ml_m,
+        **input_args
+    )
     unfitted_qte_model = copy.copy(dml_qte_obj)
+    np.random.seed(42)
     dml_qte_obj.fit()
 
+    np.random.seed(42)
+    dml_qte_obj_ext_smpls = dml.DoubleMLQTE(
+        obj_dml_data,
+        ml_g, ml_m,
+        draw_sample_splitting=False,
+        **input_args
+    )
+    dml_qte_obj_ext_smpls.set_sample_splitting(dml_qte_obj.smpls)
+    np.random.seed(42)
+    dml_qte_obj_ext_smpls.fit()
+
     np.random.seed(42)
     n_obs = len(y)
     all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d)
@@ -80,8 +98,10 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde):
                             boot_t_stat=None, joint=False, level=0.95)
     res_dict = {'coef': dml_qte_obj.coef,
                 'coef_manual': res_manual['qte'],
+                'coef_ext_smpls': dml_qte_obj_ext_smpls.coef,
                 'se': dml_qte_obj.se,
                 'se_manual': res_manual['se'],
+                'se_ext_smpls': dml_qte_obj_ext_smpls.se,
                 'boot_methods': boot_methods,
                 'ci': ci.to_numpy(),
                 'ci_manual': ci_manual.to_numpy(),
@@ -112,6 +132,9 @@ def test_dml_qte_coef(dml_qte_fixture):
     assert np.allclose(dml_qte_fixture['coef'],
                        dml_qte_fixture['coef_manual'],
                        rtol=1e-9, atol=1e-4)
+    assert np.allclose(dml_qte_fixture['coef'],
+                       dml_qte_fixture['coef_ext_smpls'],
+                       rtol=1e-9, atol=1e-4)
 
 
 @pytest.mark.ci
@@ -119,6 +142,9 @@ def test_dml_qte_se(dml_qte_fixture):
     assert np.allclose(dml_qte_fixture['se'],
                        dml_qte_fixture['se_manual'],
                        rtol=1e-9, atol=1e-4)
+    assert np.allclose(dml_qte_fixture['se'],
+                       dml_qte_fixture['se_ext_smpls'],
+                       rtol=1e-9, atol=1e-4)
 
 
 @pytest.mark.ci
@@ -148,8 +174,8 @@ def test_doubleml_qte_exceptions():
     ml_g = RandomForestClassifier(n_estimators=20)
     ml_m = RandomForestClassifier(n_estimators=20)
 
-    msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\). ' \
-          'External samples not implemented yet.'
+    msg = ('Sample splitting not specified. '
+           r'Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\).')
     with pytest.raises(ValueError, match=msg):
         dml_obj = dml.DoubleMLQTE(obj_dml_data, ml_g, ml_m, draw_sample_splitting=False)
         _ = dml_obj.smpls
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 7ecadebd4..a5a671103 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -448,7 +448,7 @@ def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_d
 
             if all(smpls_are_partitions):
                 n_rep = len(all_smpls)
-                n_folds = n_folds_each_smpl[0]
+                n_folds = int(n_folds_each_smpl[0])
                 smpls = _check_all_smpls(all_smpls, dml_data.n_obs, check_intersect=True)
             else:
                 raise ValueError('Invalid partition provided. '